In [None]:
! pip install datasets transformers[sentencepiece]

In [None]:
import torch, os, pickle
import numpy                    as np
import pandas                   as pd

from google.colab               import drive
from sklearn.preprocessing      import MultiLabelBinarizer
from datasets                   import load_from_disk

In [None]:
# Name of the project folder located under "MyDrive" on Google Drive.
Folder_name = 'NLP_class'

# Report where the kernel starts out before anything is changed.
print("Working Directory:", os.getcwd())

# Expose Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Absolute path of the project folder; later cells build their paths from it.
folder_path = f"/content/drive/MyDrive/{Folder_name}"

# Work from inside the project folder
# (presumably so the local `utils` module imported below is found — confirm).
os.chdir(folder_path)

# Confirm the switch took effect.
print("Updated Working Directory:", os.getcwd())

In [None]:
import utils
from   utils  import df_to_DatasetDict, MyDataset, Trainer, Evaluator

# Dataset

In [None]:
# Pick the dataset to work with by keeping exactly one of the pairs below active.
#
# Research Paper Subjects dataset:
#   dataset_dir_name, out_dir_name = 'ResearchPaper_dataset', 'ResearchPaper_results'
#
# Movie Genres dataset:
dataset_dir_name, out_dir_name = 'MovieGenres_dataset', 'MovieGenres_results'

# Both directories are resolved relative to the project folder.
dataset_dir = os.path.join(folder_path, dataset_dir_name)  # where the saved dataset is read from
out_dir = os.path.join(folder_path, out_dir_name)          # where results/artifacts are kept

# Load the dataset saved on disk and display it as the cell output.
dataset = load_from_disk(dataset_dir)
dataset

## Load the MultiLabelBinarizer

In [None]:
# Load the pickled label binarizer from the results directory; it is used
# further down to turn label vectors back into label names.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
binarizer_path = os.path.join(out_dir, 'multi-label-binarizer.pkl')
with open(binarizer_path, "rb") as binarizer_file:
    multilabel = pickle.load(binarizer_file)

# Training

In [None]:
# Number of target classes = length of a single example's label vector.
# Index the row first: `dataset['train'][0]` reads one example, whereas
# `dataset['train']['labels'][0]` materialises the entire labels column
# just to look at its first entry.
num_classes = len(dataset['train'][0]['labels'])

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configure the training run (the class itself comes from the local `utils` module).
trainer = Trainer(
    dataset_dir,
    out_dir,
    num_classes,
    patience=10,
    model_ckpt="distilbert-base-uncased",
    problem_type="multi_label_classification",
    max_len=256,
    optimizer='Adam',
    init_lr=1e-5,
    weight_decay=0,
    scheduler_type="linear",
    num_epochs=2,
    train_bs=32,
    val_bs=32,
    device=device,
    clf_thrshold=0.3,  # (sic) keyword spelling is the one this call passes to utils.Trainer
)

In [None]:
# Training is disabled here; uncomment the call below to launch a run with the
# `trainer` configured above.
# NOTE(review): the following cells read an existing checkpoint from disk —
# presumably written by an earlier `trainer.train()` run; confirm.
# trainer.train()

In [None]:
# Name of the run whose checkpoint is inspected here and evaluated below.
tunned_model_name = "run_0"

# Checkpoint path relative to the results directory.
ckpt_name = f"dump/{tunned_model_name}/training_checkpoint.pth"

# map_location="cpu": the checkpoint is only inspected here, so load it onto the
# CPU. Without it, a checkpoint saved from a CUDA run cannot be deserialised on
# a CPU-only runtime, and on a GPU runtime it would needlessly occupy GPU memory.
# NOTE(review): PyTorch >= 2.6 defaults to `weights_only=True`; if this
# checkpoint stores arbitrary Python objects it may need `weights_only=False`
# (trusted files only) — confirm against how utils saves it.
ckpt = torch.load(os.path.join(out_dir, ckpt_name), map_location="cpu")

# Show what the checkpoint contains and the validation metrics stored in it.
print(ckpt.keys())
print(ckpt['val_metrics'])

# Evaluation

In [None]:
# Directory holding the selected run's checkpoint.
model_dir = os.path.join(out_dir, f"dump/{tunned_model_name}")

# The evaluator is pointed at the "validation" split stored inside the dataset directory.
test_dataset_dir = os.path.join(dataset_dir, 'validation')

# Build the evaluator (class provided by the local `utils` module) with the
# same base model, problem type and threshold values used for the trainer.
evaluator = Evaluator(
    dataset_dir=test_dataset_dir,
    model_dir=model_dir,
    num_classes=num_classes,
    ckpt_name="training_checkpoint.pth",
    model_ckpt="distilbert-base-uncased",
    problem_type="multi_label_classification",
    device=device,
    clf_thrshold=0.3,
)

In [None]:
# Run the evaluation with the evaluator configured above. As the last
# expression of the cell, whatever `evaluate()` returns (if anything) is
# displayed as the cell output.
evaluator.evaluate()

# Prediction

In [None]:
# Compare ground truth with the model's prediction on a few random examples.
# Indices and rows must come from the SAME split: sizing the index range from
# one split and reading rows from another shows the wrong data (or goes out of
# range when the splits differ in size). The validation split is used because
# it is the one the evaluator above is pointed at.
split = "validation"
num_rows = dataset[split].shape[0]

# Up to 10 distinct row indices drawn from [0, num_rows), so row 0 is eligible
# and no example is shown twice.
rng = np.random.default_rng()
row_indices = rng.choice(num_rows, size=min(10, num_rows), replace=False)

for row in row_indices:
    # Fetch a single example instead of materialising whole columns per iteration.
    example = dataset[split][int(row)]
    text = example['text']
    # The label may be a list, array or tensor depending on the dataset's
    # format; normalise it to an array so it can be reshaped to (1, n_classes).
    label = np.asarray(example['labels'])
    print(f"df row index: {row}")
    print(f"Groundtruth:  {multilabel.inverse_transform(label.reshape(1,-1))}")
    print(f"Prediction:   {evaluator.prediction(text,multilabel)} \n")