In [2]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import sys, os, math

sys.path.insert(0, '../dlp')
from data_access import PQDataAccess

pd.set_option('future.no_silent_downcasting', True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

batch_size = 32
da = PQDataAccess("/home/aac/Alireza/datasets/taxseq/corpus_1000", batch_size)
epochs= 10_000
val_epoch = 50
num_val = 25

model_name = "tokenizer" # "FNN", "hierarchy", "T5"
checkpoint_dir = f"../checkpoints/{model_name}_checkpoints"

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
print(checkpoint_dir)

from data_process import *

 WORLD_SIZE=1 , LOCAL_WORLD_SIZE=1,RANK =0,LOCAL_RANK = 0 
cuda:0
../checkpoints/tokenizer_checkpoints


  df = pd.read_csv("../data/rank_tax.csv")


Loaded dictionary.


In [11]:
from models.TokenizerClassifier import TokenizerClassifier

model = TokenizerClassifier(output_dim=len_tokenizer, max_tax_len=max_tax_len).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
print("model:", sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')



model: 8.021248 M parameters


In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from data_process import *

def train_step(model, optimizer, da, device):
    # Zero the gradients
    optimizer.zero_grad()
    
    # Get batch and convert to tensor
    tensor_batch = tokenizer_data_to_tensor_batch(da.get_batch())
    tensor_batch.gpu(device)
    
    prediction = model(tensor_batch.seq_ids)
    labels = tensor_batch.taxes
    
    loss = nn.BCEWithLogitsLoss()(prediction, labels.float())
        
    # Backward pass and optimization step
    loss.backward()
    optimizer.step()

    return loss.item()

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score
from data_process import *
from sklearn.metrics import confusion_matrix


def evaluate(model, da, device, len_tokenizer, num_val_batches=1):
    model.eval()  # Set the model to evaluation mode
    
    total_loss = 0
    
    with torch.no_grad():  # Disable gradient computation for evaluation
        for _ in range(num_val_batches):
            tensor_batch = tokenizer_data_to_tensor_batch(da.get_batch())
            tensor_batch.gpu(device)
            
            prediction = model(tensor_batch.seq_ids)
            labels = tensor_batch.taxes
            
            loss = nn.BCEWithLogitsLoss()(prediction, labels.float())
            total_loss += loss

            # Calculate accuracy
            predicted_classes = (torch.sigmoid(prediction) > 0.6).int()
            f1 = f1_score(labels.cpu(), predicted_classes.cpu(), average='micro')
            accuracy = accuracy_score(labels.cpu(), predicted_classes.cpu())
            cm = confusion_matrix(labels.cpu(), predicted_classes.cpu(), labels=[_ for _ in range(len_tokenizer)])

    # Average losses
    val_loss = total_loss / num_val_batches
    
    model.train()  # Set the model back to training mode
    return val_loss, accuracy, f1, cm

In [52]:
model.train()

train_losses = []

for epoch in range(epochs):
    train_loss = train_step(model, optimizer, da, device)
    train_losses.append(train_loss)
    
    if (epoch + 1) % val_epoch == 0:
        val_loss, acc, f1, cms = evaluate(model, da, device, len_tokenizer, num_val)
        
        print("cms", cms)
        mean_train_loss = sum(train_losses[-val_epoch:]) / val_epoch
        print(f"Epoch {epoch+1}, Train Loss: {mean_train_loss:.4f}, Val Loss: {val_loss:.4f}, val acc: {acc:.4f}, val f1: {f1:.4f}")
    
        checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_step_{epoch + 1}.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'accuracy': acc,
            'f1_score': f1
        }, checkpoint_path)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
len_tokenizer = len(tokenizer.vocab)

def encode_lineage_tokenizer(tax_lineage):
    return tokenizer.encode(tax_lineage.split(", "), add_special_tokens=False, padding='max_length', max_length=max_tax_len, is_split_into_words= True)

def tokenizer_data_to_tensor_batch(b):
    # if model_name in ["new_hierarchy", "hierarchy"]:
    sequences = [encode_sequence(e['sequence']) for e in b]
    tax_ids = [encode_lineage_tokenizer(e['tax_lineage']) for e in b]
    encoded_list = [[1 if _ in tax_id else 0 for _ in range(len_tokenizer)] for tax_id in tax_ids]

    return Batch(torch.LongTensor(sequences), torch.LongTensor(encoded_list))
    
sum(tokenizer_data_to_tensor_batch(da.get_batch()).taxes[0])


In [None]:
torch.full([2, 3], 1.5)