<a href="https://colab.research.google.com/github/Zhou198/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount

In [36]:
from google.colab import drive
drive.mount('/content/drive')
basedir = "/content/drive/MyDrive/NLP/"

!nvidia-smi

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Tue Feb 13 02:20:34 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0              30W /  70W |   4529MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+----------------------------

# Load Modules

In [None]:
!pip install accelerate -U
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets

In [96]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
from transformers import AutoModel, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from datasets import load_metric

device = "cuda" if torch.cuda.is_available() else "cpu"

# Utilis

In [97]:
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"], truncation=True)

# Tasks

## Text Classification

In [98]:
from transformers import AutoModelForSequenceClassification

### Load Datasets

In [99]:
mydata = load_dataset("dair-ai/emotion")
num_labels = len(np.unique(mydata["train"]["label"]))

tokenized_datasets = mydata.map(tokenize_dataset, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})


In [100]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
inputs = tokenizer(mydata["train"]["text"][:3], padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)

print(f"inputs = {inputs}\noutputs = {outputs}")

inputs = {'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
          9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
          2003,  8300,   102],
        [  101, 10047,  9775,  1037,  3371,  2000,  2695,  1045,  2514, 20505,
          3308,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
outputs = SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0247,  0.0382, -0.0119, -0.0828,  0.0110,  0.0382],
        [ 0.0273,  0.0383, -0.0049

### Fine-tuning

In [103]:
training_args = TrainingArguments(
    output_dir=f"{basedir}/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    logging_dir=f"{basedir}/logs"
)

mytrainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer)

In [104]:
mytrainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.549194
2,No log,0.26441
3,No log,0.223814


TrainOutput(global_step=375, training_loss=0.5670075276692709, metrics={'train_runtime': 235.7191, 'train_samples_per_second': 203.632, 'train_steps_per_second': 1.591, 'total_flos': 748096761378816.0, 'train_loss': 0.5670075276692709, 'epoch': 3.0})

### Prediction

In [105]:
test_result = mytrainer.predict(tokenized_datasets["test"])
test_result

PredictionOutput(predictions=array([[ 4.276497  , -0.5622775 , -1.4437035 , -0.44548923, -1.2106957 ,
        -1.3231283 ],
       [ 4.388058  , -0.96319544, -1.3785189 , -0.59489036, -1.0821667 ,
        -1.1482    ],
       [ 4.3640084 , -1.002495  , -1.3999224 , -0.99853605, -0.8171334 ,
        -0.95676684],
       ...,
       [-0.8583079 ,  4.8101144 , -0.06063928, -1.1036084 , -1.314213  ,
        -0.8168227 ],
       [-0.75287116,  4.635616  , -0.41671416, -1.0901785 , -1.0710588 ,
        -0.684662  ],
       [-0.7698281 , -1.1776756 , -1.0019193 , -1.1316811 ,  2.173919  ,
         2.1390722 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 4]), metrics={'test_loss': 0.22172601521015167, 'test_runtime': 3.3213, 'test_samples_per_second': 602.181, 'test_steps_per_second': 4.817})

In [106]:
pred = test_result.predictions.argmax(-1)
np.mean(pred == test_result.label_ids)

0.917