# Mount

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted drive; later cells build their output and log
# paths from it. Note that the value already ends with a trailing slash.
basedir = "/content/drive/MyDrive/NLP/"

# Print the GPU / driver status of the current runtime.
!nvidia-smi

Mounted at /content/drive
Tue Feb 13 00:21:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                          

# Load Modules

In [None]:
!pip install accelerate -U
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets

In [26]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
from transformers import AutoModel, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from datasets import load_metric

# Utils

In [None]:
def tokenize_dataset(dataset):
    """Tokenize the ``"text"`` field of a dataset row or batch.

    Intended to be passed to ``Dataset.map``; the value under ``"text"`` is
    forwarded unchanged to the tokenizer.

    Args:
        dataset: Mapping with a ``"text"`` entry (a string, or a list of
            strings when mapped with ``batched=True``).

    Returns:
        Whatever the tokenizer returns for that text.

    Note:
        Relies on a module-level ``tokenizer`` being defined before this
        function is called.
    """
    # Truncate so that an unusually long text cannot exceed the maximum
    # sequence length the model accepts. Padding is left to batch time.
    return tokenizer(dataset["text"], truncation=True)

# Tasks

## Text Classification

In [4]:
from transformers import AutoModelForSequenceClassification

### Load Datasets

In [None]:
# `tokenize_dataset` reads the global `tokenizer`, so it has to exist before the
# map call below; otherwise a fresh "Restart & Run All" stops with a NameError.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the emotion dataset from the Hugging Face Hub.
mydata = load_dataset("dair-ai/emotion")
# Number of distinct label ids in the training split (sizes the classifier head).
num_labels = len(np.unique(mydata["train"]["label"]))

# Tokenize every split, handing the examples to the tokenizer in batches.
tokenized_datasets = mydata.map(tokenize_dataset, batched=True)
tokenized_datasets

In [None]:
# Peek at the first three tokenized training examples.
tokenized_datasets["train"][:3]

In [7]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Sequence-classification model whose output size is set by `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Sample sentences for a quick forward pass through the not-yet-fine-tuned model.
# `test_point` was not defined anywhere in the notebook, so this cell failed on a
# fresh kernel. NOTE(review): the recorded output shows a batch of two sentences;
# presumably they were the first two training texts — confirm.
test_point = mydata["train"][:2]["text"]

# Pad to the longest sentence in the batch and return PyTorch tensors.
inputs = tokenizer(test_point, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)

print(f"inputs = {inputs}\noutputs = {outputs}")

inputs = {'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
          9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
          2003,  8300,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
outputs = SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0989,  0.0809, -0.0831,  0.2546,  0.0896,  0.0716],
        [ 0.0936,  0.0564, -0.1089,  0.2003,  0.0849,  0.0780]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


### Fine-tuning

In [20]:
# Training configuration. Checkpoints and logs are written under `basedir`.
training_args = TrainingArguments(
    output_dir=f"{basedir}/results",
    # Valid values are "no", "steps" and "epoch"; the previous "epochs" is not
    # accepted and raises a ValueError when the arguments are constructed.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    logging_dir=f"{basedir}/logs"
)

# Trainer over the tokenized train / validation splits. The tokenizer is passed
# along with the model and datasets.
mytrainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer)

In [22]:
# Run fine-tuning with the configuration given to the Trainer; the returned
# TrainOutput is displayed as the cell result.
mytrainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.185267
2,0.215800,0.149495
3,0.215800,0.139999


TrainOutput(global_step=750, training_loss=0.18258136749267578, metrics={'train_runtime': 248.3727, 'train_samples_per_second': 193.258, 'train_steps_per_second': 3.02, 'total_flos': 703485182771712.0, 'train_loss': 0.18258136749267578, 'epoch': 3.0})

### Prediction

In [None]:
# Run the trained model over the tokenized test split and display the result.
# Its `predictions` and `label_ids` fields are used for the accuracy computed next.
test_result = mytrainer.predict(tokenized_datasets["test"])
test_result

In [35]:
# Predicted class id = position of the largest score along the last axis.
pred = test_result.predictions.argmax(axis=-1)
# Accuracy: fraction of predictions that match the reference label ids.
(pred == test_result.label_ids).mean()


0.9175