In [82]:
from accelerate import Accelerator, DeepSpeedPlugin
from accelerate import DistributedDataParallelKwargs
import torch
from models import TimeLLM
import argparse

In [46]:
path = "/home/brunodifranco/mestrado/Time-LLM/checkpoints/long_term_forecast_ETTh1_512_720_TimeLLM_ETTh1_ftM_sl512_ll48_pl720_dm32_nh8_el2_dl1_df128_fc3_ebtimeF_Exp_0-TimeLLM-ETTh1/checkpoint"
params = torch.load(path)

In [35]:
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./ds_config_zero2.json')
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs], deepspeed_plugin=deepspeed_plugin)

In [75]:
configs = argparse.Namespace(
    task_name="long_term_forecast",
    pred_len=96,
    seq_len=512,
    d_ff=128,
    patch_len=16,
    stride=8,
    llm_model="GPT2",
    llm_layers=8,
    llm_dim=768,
    dropout=0.1,
    n_heads=7,
    d_model=32,
    enc_in=7,
    prompt_domain=True,
    content="The Electricity Transformer Temperature (ETT) is a crucial indicator in the electric power long-term deployment. This dataset consists of 2 years data from two separated counties in China. To explore the granularity on the Long sequence time-series forecasting (LSTF) problem, different subsets are created, {ETTh1, ETTh2} for 1-hour-level and ETTm1 for 15-minutes-level. Each data point consists of the target value ”oil temperature” and 6 power load features. The train/val/test is 12/4/4 months."
)


In [77]:
model_instance = TimeLLM.Model(configs=configs)

In [78]:
model_instance

Model(
  (llm_model): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (patch_embedding): PatchEmbedding(
    (padding_patch_layer): ReplicationPad1d()
    (value_embedding): TokenEmbedding(
      (tokenConv): Conv1d(16, 32

In [76]:
path = "/home/brunodifranco/mestrado/Time-LLM/checkpoints/long_term_forecast_ETTh1_512_720_TimeLLM_ETTh1_ftM_sl512_ll48_pl720_dm32_nh8_el2_dl1_df128_fc3_ebtimeF_Exp_0-TimeLLM-ETTh1/checkpoint"
params = torch.load(path)

In [81]:
# # Carregar o estado do modelo
# state_dict = torch.load(path)
TimeLLM.Model.load_state_dict(state_dict=model_instance)

TypeError: Module.load_state_dict() missing 1 required positional argument: 'self'

In [74]:
state_dict[""]

OrderedDict([('word_embeddings',
              tensor([[-0.1099, -0.0393,  0.0332,  ..., -0.1367,  0.0151,  0.0454],
                      [ 0.0403, -0.0486,  0.0461,  ...,  0.0859,  0.0025,  0.0432],
                      [-0.1279,  0.0479,  0.1846,  ...,  0.0898, -0.1299, -0.0879],
                      ...,
                      [-0.0444, -0.0549,  0.0123,  ...,  0.1045,  0.0977, -0.0693],
                      [ 0.1855,  0.0166,  0.0461,  ..., -0.0962,  0.0786, -0.0225],
                      [ 0.0513, -0.0277,  0.0500,  ...,  0.0070,  0.1553,  0.1206]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('llm_model.wte.weight',
              tensor([[-0.1099, -0.0393,  0.0332,  ..., -0.1367,  0.0151,  0.0454],
                      [ 0.0403, -0.0486,  0.0461,  ...,  0.0859,  0.0025,  0.0432],
                      [-0.1279,  0.0479,  0.1846,  ...,  0.0898, -0.1299, -0.0879],
                      ...,
                      [-0.0444, -0.0549,  0.0123,  ..., 

In [27]:
type(model)

collections.OrderedDict

In [32]:
model.keys()

odict_keys(['word_embeddings', 'llm_model.wte.weight', 'llm_model.wpe.weight', 'llm_model.h.0.ln_1.weight', 'llm_model.h.0.ln_1.bias', 'llm_model.h.0.attn.c_attn.weight', 'llm_model.h.0.attn.c_attn.bias', 'llm_model.h.0.attn.c_proj.weight', 'llm_model.h.0.attn.c_proj.bias', 'llm_model.h.0.ln_2.weight', 'llm_model.h.0.ln_2.bias', 'llm_model.h.0.mlp.c_fc.weight', 'llm_model.h.0.mlp.c_fc.bias', 'llm_model.h.0.mlp.c_proj.weight', 'llm_model.h.0.mlp.c_proj.bias', 'llm_model.h.1.ln_1.weight', 'llm_model.h.1.ln_1.bias', 'llm_model.h.1.attn.c_attn.weight', 'llm_model.h.1.attn.c_attn.bias', 'llm_model.h.1.attn.c_proj.weight', 'llm_model.h.1.attn.c_proj.bias', 'llm_model.h.1.ln_2.weight', 'llm_model.h.1.ln_2.bias', 'llm_model.h.1.mlp.c_fc.weight', 'llm_model.h.1.mlp.c_fc.bias', 'llm_model.h.1.mlp.c_proj.weight', 'llm_model.h.1.mlp.c_proj.bias', 'llm_model.h.2.ln_1.weight', 'llm_model.h.2.ln_1.bias', 'llm_model.h.2.attn.c_attn.weight', 'llm_model.h.2.attn.c_attn.bias', 'llm_model.h.2.attn.c_proj.

In [19]:
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./ds_config_zero2.json')
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs], deepspeed_plugin=deepspeed_plugin)

In [22]:
unwrapped_model = accelerator.unwrap_model(model=model)

In [25]:
unwrapped_model.load_state_dict()

AttributeError: 'collections.OrderedDict' object has no attribute '_load_state_dict'

In [23]:
unwrapped_model.load_state_dict(
    torch.load(path), map_location=lambda storage, loc: storage)



AttributeError: 'collections.OrderedDict' object has no attribute 'load_state_dict'

In [17]:
import os

OrderedDict([('word_embeddings',
              tensor([[-0.1099, -0.0393,  0.0332,  ..., -0.1367,  0.0151,  0.0454],
                      [ 0.0403, -0.0486,  0.0461,  ...,  0.0859,  0.0025,  0.0432],
                      [-0.1279,  0.0479,  0.1846,  ...,  0.0898, -0.1299, -0.0879],
                      ...,
                      [-0.0444, -0.0549,  0.0123,  ...,  0.1045,  0.0977, -0.0693],
                      [ 0.1855,  0.0166,  0.0461,  ..., -0.0962,  0.0786, -0.0225],
                      [ 0.0513, -0.0277,  0.0500,  ...,  0.0070,  0.1553,  0.1206]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('llm_model.wte.weight',
              tensor([[-0.1099, -0.0393,  0.0332,  ..., -0.1367,  0.0151,  0.0454],
                      [ 0.0403, -0.0486,  0.0461,  ...,  0.0859,  0.0025,  0.0432],
                      [-0.1279,  0.0479,  0.1846,  ...,  0.0898, -0.1299, -0.0879],
                      ...,
                      [-0.0444, -0.0549,  0.0123,  ..., 