In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import sys, os, math

sys.path.insert(0, '../dlp')
from data_access import PQDataAccess
from data_process import *

pd.set_option('future.no_silent_downcasting', True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

# --- Data access -----------------------------------------------------------
batch_size = 64
block_size = 32
# The corpus location can be overridden through the TAXSEQ_CORPUS_DIR
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset the original path is used unchanged.
corpus_dir = os.environ.get(
    "TAXSEQ_CORPUS_DIR", "/home/aac/Alireza/datasets/taxseq/corpus_1000"
)
da = PQDataAccess(corpus_dir, batch_size)

# --- Training schedule -----------------------------------------------------
# NOTE: the training cell performs one `train_step` call (one batch) per
# "epoch", so `epochs` and `val_epoch` effectively count optimisation steps.
epochs = 10_000
val_epoch = 100   # report the mean training loss every `val_epoch` iterations
num_val = 25

# --- Checkpoints -----------------------------------------------------------
model_name = "GPT"
checkpoint_dir = f"../checkpoints/{model_name}_checkpoints"

# exist_ok=True creates the directory only when it is missing and avoids the
# race between a separate existence check and the creation call.
os.makedirs(checkpoint_dir, exist_ok=True)
print(checkpoint_dir)

 WORLD_SIZE=1 , LOCAL_WORLD_SIZE=1,RANK =0,LOCAL_RANK = 0 


  from .autonotebook import tqdm as notebook_tqdm


Loaded dictionary.
cuda:0
../checkpoints/GPT_checkpoints


In [2]:
from models.GPTModel import DecoderOnlyModel

# Define model parameters
vocab_size = 23
embed_size = 512
num_heads = 8
num_layers = 6

# Instantiate the model.
# The previous call passed `chunk_size`, a name that is never assigned in this
# notebook, so the cell could only run when that name happened to be present in
# the kernel. `block_size` from the configuration cell is used instead.
# NOTE(review): confirm that `block_size` is the intended value for the last
# constructor argument of DecoderOnlyModel.
model = DecoderOnlyModel(vocab_size, embed_size, num_heads, num_layers, block_size).to(device)

# ignore_index=0: target positions whose class index is 0 do not contribute to
# the loss (presumably 0 is the padding id - confirm against the vocabulary).
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NOTE: the scheduler is created here, but the only `scheduler.step()` call in
# `train_step` is commented out, so the learning rate currently stays constant.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Adjust step_size and gamma as needed
print("model:", sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

model: 25.247767 M parameters


In [3]:
def evaluate(model, test_loader):
    """Evaluate `model` on every batch of `test_loader`.

    Each batch is expected to unpack into
    `(i_sequences, t_sequences, taxonomy_ids)`. The loss is computed by the
    module-level `criterion` between the model output and `t_sequences`,
    while the classification metrics compare the arg-max prediction (taken
    over dimension 1) with `taxonomy_ids`.

    The model is switched to evaluation mode for the duration of the call and
    put back into whatever mode it was in before, so calling this function in
    the middle of training does not leave the model in eval mode.

    NOTE(review): `accuracy_score`, `f1_score`, `precision_score` and
    `recall_score` are not imported in the visible cells of this notebook;
    presumably they come from `from data_process import *` - confirm.

    Args:
        model: the network to evaluate; moved to the module-level `device`.
        test_loader: sized iterable of batches (`len()` is used for the
            average loss).

    Returns:
        Tuple `(avg_loss, accuracy, f1, precision, recall)`.
    """
    model.to(device)
    was_training = model.training
    model.eval()  # Set model to evaluation mode
    running_loss = 0.0

    all_preds = []
    all_labels = []

    try:
        with torch.no_grad():  # Disable gradient computation during evaluation
            # Wrapping the loader itself (not `enumerate(...)`) lets tqdm see
            # its length and show a proper progress bar.
            for i_sequences, t_sequences, taxonomy_ids in tqdm(test_loader):
                i_sequences = i_sequences.to(device).float()
                t_sequences = t_sequences.to(device).float()

                outputs = model(i_sequences)

                # Calculate loss
                loss = criterion(outputs, t_sequences)

                # Softmax is monotonic, so the arg-max of the raw outputs is
                # the same index the soft-maxed outputs would give.
                index = torch.argmax(outputs, dim=1)

                all_preds.append(index.cpu())
                all_labels.append(taxonomy_ids.cpu())

                running_loss += loss.item()
    finally:
        # Restore the caller's train/eval mode even if a batch raised.
        model.train(was_training)

    # Concatenate all batches into single tensors
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    # Accuracy is averaged over per-sample scores; the other metrics are
    # micro-averaged over the whole evaluation set.
    accuracy = np.mean([accuracy_score(l, p) for p, l in zip(all_preds, all_labels)])
    f1 = f1_score(all_labels.numpy(), all_preds.numpy(), average='micro')
    precision = precision_score(all_labels.numpy(), all_preds.numpy(), average='micro')
    recall = recall_score(all_labels.numpy(), all_preds.numpy(), average='micro')

    avg_loss = running_loss / len(test_loader)
    return avg_loss, accuracy, f1, precision, recall

In [10]:
def train_step(model, optimizer, da, device):
    """Run one optimisation step on a single batch fetched from `da`.

    The batch is converted with `GPT_data_to_tensor_batch`, moved to `device`,
    fed through `model`, and scored with the module-level `criterion`.
    Gradients are clipped to a maximum norm of 1.0 before the optimizer update.

    Returns:
        The loss of this batch as a Python float.
    """
    optimizer.zero_grad()

    batch = GPT_data_to_tensor_batch(da.get_batch())
    batch.gpu(device)
    inputs, targets = batch.input_ids, batch.output_ids

    logits = model(inputs)

    # Collapse every leading dimension so the criterion receives one row of
    # class scores per target entry.
    num_classes = logits.size(-1)
    loss = criterion(logits.view(-1, num_classes), targets.view(-1))

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    return loss.item()

In [11]:
def generate(max_length=100):
    """Greedily decode one sequence with the module-level `model`.

    Decoding starts from the single token id 1 (rendered as `<s>` in the
    recorded output of this cell) and repeatedly appends the arg-max of the
    logits at the last position. It stops after `max_length` tokens in total,
    or as soon as a token id <= 2 is produced; that token is kept in the
    result.

    The model is put in eval mode while decoding and afterwards returned to
    the mode it was in when the function was called, even if the forward pass
    raises.

    NOTE(review): the whole prefix is fed to the model at every step without
    cropping; if the model has a maximum context length shorter than
    `max_length`, long generations may fail - confirm.

    Args:
        max_length: upper bound on the number of tokens in the returned
            sequence, including the start token.

    Returns:
        The generated token ids as a list of ints. The decoded tokens are also
        printed via `special_idx_to_char`.
    """
    was_training = model.training
    model.eval()
    generated_seq = [1]

    try:
        with torch.no_grad():
            for _ in range(max_length - 1):
                # Add batch dimension
                input_seq = torch.tensor(generated_seq).unsqueeze(0).to(device)
                output = model(input_seq)

                # Get the predicted next token
                next_token = torch.argmax(output[:, -1, :], dim=-1).item()
                generated_seq.append(next_token)

                # Stop on any token id <= 2, not only on padding.
                # NOTE(review): ids 0-2 are presumably the special tokens
                # (pad / start / end) - confirm against the vocabulary.
                if next_token <= 2:
                    break
    finally:
        # Restore the caller's mode instead of unconditionally forcing train().
        model.train(was_training)

    print(*[special_idx_to_char[s] for s in generated_seq])

    return generated_seq

# Sample one sequence with the default max_length and keep the token ids in
# `generated_seq`; generate() also prints the decoded tokens.
generated_seq = generate()

<s> <s>


In [None]:
model.train()

# One loss value per iteration; the tail of this list feeds the periodic report.
train_losses = list()

for epoch in range(epochs):
    # Each "epoch" here is a single batch / optimizer update.
    train_loss = train_step(model, optimizer, da, device)
    train_losses.append(train_loss)

    # Nothing to report unless this is a multiple of `val_epoch`.
    if (epoch + 1) % val_epoch:
        continue

    # Mean loss over the most recent `val_epoch` iterations, followed by a
    # sample generation to eyeball progress.
    mean_train_loss = sum(train_losses[-val_epoch:]) / val_epoch
    print(f"Epoch {epoch+1}, Train Loss: {mean_train_loss:.4f}")
    generate()