### Fine-Tuning for Sentiment Analysis ###

In [60]:
import os
import json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn
from pprint import pprint
import textwrap
from pathlib import Path
from pprint import pprint

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# PyTorch
import torch
from torchinfo import summary

# Hugging Face 
from transformers import pipeline, set_seed, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

# This HuggingFace community-driven open-source library of datasets
from datasets import load_dataset, load_metric

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import transformermodels as tm
print(f'Package version: {tm.__version__}')
print(f'PyTorch version: {torch.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Package version: 0.0.post1.dev8+gd983d5a.d20240721
PyTorch version: 2.3.1+cu121


In [2]:
# GPU checks
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    device_str = 'cuda:0'
    torch.cuda.empty_cache() 
else:
    device_str = 'cpu'
device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce GTX 1080 with Max-Q Design
CUDNN version:         8902

Device for model training/inference: cuda:0


In [49]:
# Helper functions and parameters
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

# Directories
data_dir = os.path.join(os.environ.get('HOME'), 'data', 'transformers')
Path(data_dir).mkdir(parents=True, exist_ok=True)
model_dir = os.path.join(data_dir, 'model_trained', 'sentiment')
Path(model_dir).mkdir(parents=True, exist_ok=True)
# Load the HuggingFace datasets
# Full list of datasets
# https://huggingface.co/datasets
# dataset = load_dataset('amazon_polarity')

In [4]:
# Dataset 'glue' with subtastk 'sst2'
raw_datasets = load_dataset('glue', 'sst2')

In [5]:
print(raw_datasets.get('train'))

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})


In [6]:
# The DIR functions shows all of the methods of the object
pprint(dir(raw_datasets.get('train')))

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguo

In [7]:
raw_datasets.get('train').to_pandas()

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0
1,"contains no wit , only labored gags",0,1
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,3
4,on the worst revenge-of-the-nerds clichés the ...,0,4
...,...,...,...
67344,a delightful comedy,1,67344
67345,"anguish , anger and frustration",0,67345
67346,"at achieving the modest , crowd-pleasing goals...",1,67346
67347,a patient viewer,1,67347


In [8]:
print(type(raw_datasets.get('train')))

<class 'datasets.arrow_dataset.Dataset'>


In [9]:
print(raw_datasets.get('train')[102])

{'sentence': 'yet this grating showcase ', 'label': 0, 'idx': 102}


In [10]:
display(raw_datasets.get('train')[100:103])

{'sentence': ['in memory ',
  'respectable new one ',
  'yet this grating showcase '],
 'label': [1, 1, 0],
 'idx': [100, 101, 102]}

In [11]:
display(raw_datasets.get('train').features)

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
# AutoTokenizer
# checkpoint = 'bert-base-uncased'
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Test the tokenizer
tokenized_sentences = tokenizer(raw_datasets.get('train')[:3].get('sentence'))
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [13]:
# We need to pass the truncation argument
def tokenize_fn(batch):
    return tokenizer(batch.get('sentence'), truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [14]:
pprint(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})


### Model training for text sequence classification ###

In [15]:
# Training arguments
training_args = TrainingArguments('my_trainer',
                                  eval_strategy='epoch',
                                  save_strategy='epoch',
                                  num_train_epochs=1)

In [16]:
# Tokenizer and model checkpoint
# checkpoint = 'bert-base-uncased'
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
# The AutoModelForSequnceClassification returns many types of models
# Depending on the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) 
print(type(model))
display(summary(model))
# pprint(model)
# OUTPUT:(classifier): Linear(in_features=768, out_features=2, bias=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>


Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [39]:
# Fine-tuning: train all the weights in the network
# We want to make sure that we are training all parameters
# We will save all of the parameters before the training
params_before = []
params_before_dict = {}
for name, p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy())
    params_before_dict.update({name: p})
print(len(params_before))
# Lets take the parameters from one of the layers
layer_name = list(params_before_dict.keys())[10]
layer_p = params_before_dict.get(layer_name)

print(layer_name)
print(layer_p[:5])

104
distilbert.transformer.layer.0.attention.out_lin.weight
tensor([[-0.0283, -0.0414,  0.0004,  ..., -0.0333, -0.0190,  0.0438],
        [ 0.0440,  0.0149,  0.0072,  ..., -0.0220,  0.0383,  0.0030],
        [-0.0457, -0.0289,  0.0271,  ...,  0.0017,  0.0291, -0.0178],
        [ 0.0166, -0.0392, -0.0019,  ...,  0.0073, -0.0266, -0.0688],
        [ 0.0041, -0.0368, -0.0095,  ..., -0.0111, -0.0342, -0.0254]],
       grad_fn=<SliceBackward0>)


In [41]:
metric = load_metric('glue', 'sst2', trust_remote_code=True)

In [45]:
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])
# We need to create a function for the metric so that we can use it in the training loop
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [46]:
# create the trainier object
trainer = Trainer(model=model, 
                  train_dataset=tokenized_datasets.get('train'),
                  eval_dataset=tokenized_datasets.get('validation'),
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

In [61]:
#trainer.train()

In [62]:
# Save the model
model_name = 'trained_sentiment'
output_dir = os.path.join(model_dir, model_name)
print(output_dir)
# trainer.save_model(output_dir)

/home/andreas/data/transformers/model_trained/sentiment/trained_sentiment


In [63]:
# Use the model
newmodel = pipeline(task='text-classification', model=output_dir, device=device)
print(newmodel('This is a great movie.'))
print(newmodel('This movie is not so good.'))

[{'label': 'LABEL_1', 'score': 0.9999430179595947}]
[{'label': 'LABEL_0', 'score': 0.9996452331542969}]


### Change the output to the correct label names ###

In [70]:
# We get the label names from the training data sets
print(raw_datasets.get('train').features)
print(raw_datasets.get('train').features.get('label'))
print()
label_names = raw_datasets.get('train').features.get('label').names
print(label_names)

# Let's edit the config file of the trained model
config_file = os.path.join(output_dir, 'config.json')

# Open the config file
with open(config_file) as fl:
    js = json.load(fl)

# Add the labels mappings
js.update({'id2label': {0: 'negative', 1: 'positive'}})
display(js)

# Save the new configuration file
with open(config_file, 'w') as fl:
    json.dump(js, fl, indent=2)

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None)}
ClassLabel(names=['negative', 'positive'], id=None)

['negative', 'positive']


{'_name_or_path': 'distilbert-base-uncased',
 'activation': 'gelu',
 'architectures': ['DistilBertForSequenceClassification'],
 'attention_dropout': 0.1,
 'dim': 768,
 'dropout': 0.1,
 'hidden_dim': 3072,
 'initializer_range': 0.02,
 'max_position_embeddings': 512,
 'model_type': 'distilbert',
 'n_heads': 12,
 'n_layers': 6,
 'pad_token_id': 0,
 'problem_type': 'single_label_classification',
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'sinusoidal_pos_embds': False,
 'tie_weights_': True,
 'torch_dtype': 'float32',
 'transformers_version': '4.42.4',
 'vocab_size': 30522,
 'id2label': {0: 'negative', 1: 'positive'}}

In [72]:
# Load the new pipeline
newmodel = pipeline(task='text-classification', model=output_dir, device=device)

In [73]:
newmodel('This movie is great.')

[{'label': 'positive', 'score': 0.9999401569366455}]

In [77]:
# Let's confirm if the weights were changed during training
params_after = []
params_after_dict = {}
for name, p in model.named_parameters():
    params_after.append(p.detach().cpu().numpy())
    params_after_dict.update({name: p})
print(len(params_after))
# Lets take the parameters from one of the layers
layer_name_before = list(params_before_dict.keys())[10]
layer_p_before = params_before_dict.get(layer_name)

layer_name_after = list(params_after_dict.keys())[10]
layer_p_after = params_after_dict.get(layer_name)

print(layer_name_before)
print(layer_name_after)

display(layer_p_before[0, 5])
display(layer_p_after[0, 5])

104
distilbert.transformer.layer.0.attention.out_lin.weight
distilbert.transformer.layer.0.attention.out_lin.weight


tensor(-0.0012, device='cuda:0', grad_fn=<SelectBackward0>)

tensor(-0.0012, device='cuda:0', grad_fn=<SelectBackward0>)

In [79]:
# Compute the sum of the absolute differences between the untrained and trained model parameters
for p1, p2 in zip(params_before, params_after):
    print(np.sum(np.abs(p1 - p2)))

# This output confirms that all of the model weights were updated during the training process

24723.473
141.43802
2.8297658
1.913683
2169.042
2.5821939
2169.3225
0.0056937574
1967.1642
1.6832343
1885.6625
1.2072973
2.7268405
1.1853302
8225.339
8.787317
7659.0737
1.0334922
2.6662364
1.2093645
2132.775
2.3922868
2138.4028
0.00527232
1912.6146
1.3291999
1839.9622
1.0666238
2.6803317
1.0824041
8176.903
8.3823
7523.8022
0.9768689
2.447663
1.1156534
2109.289
2.6267498
2122.0803
0.0048502535
1885.4092
1.1503797
1853.506
1.1245847
2.7050886
1.2291026
8209.357
8.93066
7386.6978
1.0882995
2.425788
0.9850048
2080.8481
2.2816079
2083.7625
0.0056488216
1867.6721
1.0481541
1784.9973
1.1396999
2.4749026
1.1881702
8045.6313
9.136528
7007.7036
1.2292962
2.25871
1.0910001
1972.2903
2.4923775
1959.4954
0.0031186645
1639.4523
1.1057569
1626.8718
1.6985041
2.2145581
1.6097157
7267.7744
8.912457
5950.014
1.595479
2.35741
1.9419782
1825.0217
2.106335
1849.2722
0.002102476
1534.1357
2.6422424
1497.6665
1.856708
2.5620012
2.2382953
6438.0845
9.444357
5286.92
1.675566
3.5083866
1.1125644
1393.4172
1.799