In [17]:
%load_ext autoreload
%autoreload 2
import sys 

sys.path.append('..')
from omegaconf import OmegaConf
from pprint import pprint
from dacite import from_dict
from dacite import Config as DaciteConfig
import torch

from xlstm.xlstm_lm_model import xLSTMLMModel, xLSTMLMModelConfig

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# create new model to load the checkpoint into
xlstm_cfg = """ 
vocab_size: 600
context_length: 64      
num_blocks: 24 #!
embedding_dim: 600 #!
tie_weights: false
weight_decay_on_embedding: false
mlstm_block:
  mlstm:
    conv1d_kernel_size: 4
    qkv_proj_blocksize: 4
    num_heads: 4
"""
cfg = OmegaConf.create(xlstm_cfg)
cfg = from_dict(data_class=xLSTMLMModelConfig, data=OmegaConf.to_container(cfg), config=DaciteConfig(strict=True))
model_new = xLSTMLMModel(cfg)

In [19]:
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [20]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
import pandas as pd

CUDA_LAUNCH_BLOCKING="1"
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_truncation(max_length=64)
tokenizer.enable_padding(direction='left',length=64)
trainer = BpeTrainer(vocab_size=600, min_frequency=2, special_tokens=['[PAD]','[UNK]'])

df = pd.read_csv('/root/projects/SmilesTuneLLM/language_models/chembl_alpaca.txt')
tokenizer.train_from_iterator(df['output'], trainer)

KeyboardInterrupt: 

In [None]:
len(tokenizer.encode('CH').ids)

64

In [None]:
class EssayDataset:
    def __init__(self):
        self.texts = self.formatting_prompts_func(pd.read_csv('/root/projects/SmilesTuneLLM/language_models/chembl_alpaca.txt'))
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        y = self.tokenizer.encode(text).ids
        return torch.tensor(self.tokenizer.encode('[PAD]').ids), torch.tensor(y)
    
    def formatting_prompts_func(self, examples):
        outputs = examples["output"]
        texts = []
        for output in outputs:
            text = '[PAD]' + output + '[UNK]'
            texts.append(text)
        return texts
    
train_ds = EssayDataset()
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=False)



In [None]:
import torch
import torch.optim as optim
import torch
import torch.optim as optim
from dacite import from_dict

from experiments.data.utils import DataGen
from experiments.lr_scheduler import LinearWarmupCosineAnnealing
from omegaconf import DictConfig, OmegaConf
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from xlstm.xlstm_lm_model import xLSTMLMModel, xLSTMLMModelConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_new.reset_parameters()
model = model_new.to('cuda')

optim_groups = model._create_weight_decay_optim_groups()
# optimizer = optim.AdamW(
#     (
#         {"weight_decay": 0.1, "params": optim_groups[0]},
#         {"weight_decay": 0.0, "params": optim_groups[1]},
#     ),
#     lr=0.001,
# )
# lr_scheduler = LinearWarmupCosineAnnealing(
#     optimizer,
#     1000,
#     1000,
#     0.001,
#     0.001 * 0.001,
# )

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()



In [29]:
# Training loop
step = 0
epoch = 0.0005
running_loss = 0.0
# torch_dtype_map: dict[str, torch.dtype] = {
#     "float32": torch.float32,
#     "float16": torch.float16,
# }

In [33]:
len(train_loader) * 8 * 0.0001

175.32160000000002

In [35]:
# for i in range(int(len(train_loader) / 8 * epoch)):
for i in range(100):
    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device='cuda')
        labels = labels.to(device='cuda')

        model.train()
        optimizer.zero_grad()
        with torch.autocast(device_type='cuda', enabled=True):
            outputs = model(inputs.to(device='cuda'))
            # loss = nn.functional.cross_entropy(
            #         outputs.view(-1, 600),
            #         labels.view(-1),
            #         ignore_index=-1,
            # )
            loss = criterion
            loss.backward()
            optimizer.step()
            running_loss = running_loss * step / (step + 1) + loss.item() * 1 / (step + 1)

        if step % 50 == 0:
            print(
                f"\nStep [{step+1}/{1000}] (Epoch: {epoch}), Loss: {running_loss:.4f},"
            )
        step += 1
        
    epoch += 1

  0%|          | 0/219152 [00:00<?, ?it/s]

  0%|          | 31/219152 [00:05<9:53:39,  6.15it/s] 


Step [1201/1000] (Epoch: 0.0005), Loss: 1.9651,


  0%|          | 81/219152 [00:13<9:57:49,  6.11it/s] 


Step [1251/1000] (Epoch: 0.0005), Loss: 1.9644,


  0%|          | 131/219152 [00:21<10:10:33,  5.98it/s]


Step [1301/1000] (Epoch: 0.0005), Loss: 1.9622,


  0%|          | 181/219152 [00:29<10:11:39,  5.97it/s]


Step [1351/1000] (Epoch: 0.0005), Loss: 1.9630,


  0%|          | 202/219152 [00:33<10:03:36,  6.05it/s]


KeyboardInterrupt: 

In [50]:
outputs.view(-1, 600).size()


torch.Size([512, 600])

In [51]:
labels.view(-1).size()

torch.Size([512])

In [54]:
outputs[0]

tensor([[12.2812, -0.5088, -0.5786,  ..., -0.9326, -0.2507, -0.0715],
        [11.6016, -1.0752, -0.6284,  ..., -0.9995,  0.0534, -0.2812],
        [11.5625, -0.9150,  0.3298,  ..., -1.2275, -1.1846, -0.9448],
        ...,
        [ 1.3496,  2.2363,  0.6045,  ..., -1.3662, -1.0938, -2.7246],
        [-0.6812,  6.5664, -1.1836,  ..., -1.3779, -1.1787, -2.6680],
        [-2.3066,  9.8359, -2.6230,  ..., -1.0732, -0.9731, -2.2539]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>)

In [None]:
y_new = model_new(x_in)

In [None]:
y_new.shape

torch.Size([1, 32, 50304])

In [None]:
y_new_step = []
state = None
for x in x_in.split(1, dim=1):
    y, state = model_new.step(x, state)
    y_new_step.append(y)
y_new_step = torch.cat(y_new_step, dim=1)
print(x.shape)

torch.Size([1, 1])


In [None]:
y_new_step.shape

torch.Size([1, 32, 50304])

In [None]:
y_new - y_new_step

tensor([[[ 4.1723e-07,  2.0862e-07, -5.9605e-08,  ...,  0.0000e+00,
           5.9605e-08,  2.9802e-08],
         [ 2.0862e-07, -1.4901e-07, -1.1921e-07,  ..., -3.5390e-07,
           5.9605e-08, -1.4901e-07],
         [ 1.1921e-07,  1.1921e-07,  0.0000e+00,  ...,  3.5763e-07,
           1.1921e-07,  3.5763e-07],
         ...,
         [-5.6624e-07,  1.6391e-07, -2.9802e-08,  ...,  0.0000e+00,
           8.9407e-08, -1.6391e-07],
         [ 6.1095e-07, -4.1723e-07,  3.8743e-07,  ...,  2.3842e-07,
           1.7881e-07, -3.5763e-07],
         [ 1.5646e-07, -2.3842e-07,  4.7684e-07,  ..., -9.6858e-08,
          -1.0431e-07, -1.0431e-07]]], device='cuda:0', grad_fn=<SubBackward0>)

In [None]:
torch.allclose(y_new, y_new_step, atol=1e-5)

True