# BERT
This notebook contains the code to fine-tune a BERT model on the IMDB dataset for movie genre classification.

In [None]:
# Install commands for colab.
%pip install -U transformers

# Imports
import torch
import random
import numpy as np
import pandas as pd

from google.colab import drive
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertForSequenceClassification

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 81.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Mounted at /content/drive
cuda:0


# Data Preparation

In [None]:
# Here we read in the data

# Fields are separated by the three-character token ":::". Separators longer
# than one character are only handled by pandas' python parsing engine, so it
# is requested explicitly; otherwise pandas falls back to it on its own and
# emits a ParserWarning. Values keep the spaces that surround the separator.
COLUMN_NAMES = ["Id", "Title", "Genre", "Description"]

df_train = pd.read_csv('/content/drive/MyDrive/Datasets/train_data.txt', sep=":::", header=None, names=COLUMN_NAMES,
                 index_col=None, engine="python")
df_test = pd.read_csv('/content/drive/MyDrive/Datasets/test_data_solution.txt', sep=":::", header=None, names=COLUMN_NAMES,
                 index_col=None, engine="python")

# Drop the first column (Id); only Title, Genre and Description are kept.
df_train = df_train.iloc[: , 1:]
df_test = df_test.iloc[: , 1:]

  return func(*args, **kwargs)


In [None]:
# Here we select the different labels from the dataset

possible_labels = df_train.Genre.unique()

# Map every genre string to an integer class id, numbered in the order in which
# `unique()` returned them. The iteration variable has its own name so that
# `possible_labels` still refers to the full array of genres after this cell.
label_dict = {genre: index for index, genre in enumerate(possible_labels)}
label_dict

{' drama ': 0,
 ' thriller ': 1,
 ' adult ': 2,
 ' documentary ': 3,
 ' comedy ': 4,
 ' crime ': 5,
 ' reality-tv ': 6,
 ' horror ': 7,
 ' sport ': 8,
 ' animation ': 9,
 ' action ': 10,
 ' fantasy ': 11,
 ' short ': 12,
 ' sci-fi ': 13,
 ' music ': 14,
 ' adventure ': 15,
 ' talk-show ': 16,
 ' western ': 17,
 ' family ': 18,
 ' mystery ': 19,
 ' history ': 20,
 ' news ': 21,
 ' biography ': 22,
 ' romance ': 23,
 ' game-show ': 24,
 ' musical ': 25,
 ' war ': 26}

In [None]:
# Here we select the size of the dataset
DATASET_SIZE = {'ten': 10000, 'thirty': 30000, 'fifty': df_train.shape[0]}

# Seed for the subsampling step. Without it `sample` draws from NumPy's global
# random state, which has not been seeded at this point in the notebook, so a
# different training subset would be used on every run.
SAMPLE_SEED = 42

df_train = df_train.sample(n=DATASET_SIZE['thirty'], random_state=SAMPLE_SEED)
print(df_train.shape[0])

30000


In [None]:
# Here we rebalance the training dataset
# g = df_train.groupby('Genre')
# df_train = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop="Genre"))

# df_train.head()

In [None]:
# Here we combine the train and test sets for ease of use when training

# x_train, x_val, y_train, y_val = train_test_split(df.index.values,
#                                                   df.label.values,
#                                                   test_size=0.15,
#                                                   random_state=42,
#                                                   stratify=df.label.values)

# Mark training and test data separately. `assign` builds new frames, so
# re-running this cell gives the same result.
df_train = df_train.assign(data_type='train')
df_test = df_test.assign(data_type='val')

# Now concatenate these two dataframes into one
df = pd.concat([df_train, df_test], ignore_index=True)

# Translate genre strings into integer class ids. `map` turns any genre that is
# missing from `label_dict` into NaN, which is reported right here; `replace`
# would silently leave the original string in the label column.
df['label'] = df.Genre.map(label_dict)
unmapped_genres = df.loc[df['label'].isna(), 'Genre'].unique()
if len(unmapped_genres) > 0:
    raise ValueError(f"Genres without a label id: {list(unmapped_genres)}")
df['label'] = df['label'].astype('int64')

df.groupby(['Genre', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Title,Description
Genre,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
action,10,train,758,758
action,10,val,1314,1314
adult,2,train,331,331
adult,2,val,590,590
adventure,15,train,414,414
adventure,15,val,775,775
animation,9,train,278,278
animation,9,val,498,498
biography,22,train,141,141
biography,22,val,264,264


In [None]:
# Quick sanity check: show the first rows of the combined frame, which should
# now carry the `data_type` and `label` columns next to the original ones.
df.head()

Unnamed: 0,Title,Genre,Description,data_type,label
0,Mari conita de Jesús (2001),short,A powerful and hilarious mockumentary about o...,train,12
1,Waking David (2016),drama,"A family drama of powerful intensity, Kevin N...",train,0
2,Raising Valhalla (2007),documentary,Witness a remarkable architectural achievemen...,train,3
3,Plateia Amerikis (2016),drama,Tattoo artist Billy and unemployed Nakos are ...,train,0
4,The Cars: Heartbeat City (1984),music,The Cars have always been on the cutting edge...,train,14


# Pre-processing

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re
import string

# Function to replace numeric values in the text with a space
def remove_numbers(text):
    """Return `text` with every run of consecutive digits replaced by one space."""
    return re.sub(r'\d+', " ", text)
# Function to delete ASCII punctuation characters from the text
def remove_punctuation(text):
    """Return `text` without any character listed in `string.punctuation`."""
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Function to remove all the stopwords from a text
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def remove_stopwords(text):
    """Tokenize `text` and drop every token that is an English stop word.

    The remaining tokens are joined with single spaces, so the result also has
    its whitespace normalised. Matching is case-sensitive, exactly as before.

    The stop-word collection is built once and kept as a set: this function is
    applied to every row of the dataset, and rebuilding a list per call and
    scanning it for every token is needless repeated work.
    """
    stop_words = _english_stop_words()
    return " ".join(token for token in word_tokenize(text) if token not in stop_words)

# Function to remove stand-alone single letters (and the whitespace around them)
def remove_extra_white_spaces(text):
    """Replace whitespace-delimited single ASCII letters with one space.

    Despite its name, this function does not collapse whitespace in general: it
    only rewrites spans of the form "<whitespace><letter><whitespace>". A letter
    at the very start or very end of `text` is kept, because it is not
    surrounded by whitespace on both sides.

    A whole run of consecutive single letters is matched at once. Matching one
    letter at a time would consume the whitespace that the next letter needs in
    front of it, leaving every second letter of such a run in place.
    """
    single_char_run_pattern = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_run_pattern, repl=" ", string=text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Clean the `Description` column by applying the steps below, in this order,
# to every row.
description_steps = (
    lambda text: text.lower(),   # lower-case everything
    remove_numbers,              # digit runs -> space
    remove_punctuation,          # drop punctuation characters
    remove_extra_white_spaces,   # drop stand-alone single letters
    remove_stopwords,            # drop English stop words
)

cleaned_description = df['Description']
for step in description_steps:
    cleaned_description = cleaned_description.apply(step)
df['Description'] = cleaned_description

In [None]:
# Another quick sanity check: `Description` should now be lower-cased and free
# of digits, punctuation and stop words.
df.head()

Unnamed: 0,Title,Genre,Description,data_type,label
0,Mari conita de Jesús (2001),short,powerful hilarious mockumentary cultures obses...,train,12
1,Waking David (2016),drama,family drama powerful intensity kevin nashs de...,train,0
2,Raising Valhalla (2007),documentary,witness remarkable architectural achievement b...,train,3
3,Plateia Amerikis (2016),drama,tattoo artist billy unemployed nakos best frie...,train,0
4,The Cars: Heartbeat City (1984),music,cars always cutting edge comes music video hea...,train,14


# Model Preparation

In [None]:
# Here we tokenize the data into a form BERT can understand.

# Every description is padded or truncated to exactly this many tokens.
MAX_SEQUENCE_LENGTH = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def encode_descriptions(descriptions):
    """Tokenize an iterable of description strings into fixed-length tensors.

    Returns the tokenizer's batch encoding with PyTorch tensors; the keys read
    below are 'input_ids' and 'attention_mask'.

    `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    and `truncation=True` states the truncation explicitly instead of leaving
    the tokenizer to pick a default and warn about it.
    """
    return tokenizer(
        list(descriptions),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors='pt'
    )


encoded_data_train = encode_descriptions(df[df.data_type=='train'].Description.values)
encoded_data_val = encode_descriptions(df[df.data_type=='val'].Description.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Here we prepare the pre-trained model with our data.

# One output unit per genre in `label_dict`; attention maps and hidden states
# are not returned.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
# Bind the result instead of ending the cell on a bare `model.to(device)`
# expression, whose repr (the whole layer tree) would be printed as output.
model = model.to(device)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Here we prepare the dataloaders for training and validation.

batch_size = 15

# Training samples are visited in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train,
                              batch_size=batch_size,
                              sampler=train_sampler)

# Validation samples are visited in dataset order.
validation_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(dataset_val,
                                   batch_size=batch_size,
                                   sampler=validation_sampler)

In [None]:
# Here we prepare the optimizer for training.

# PyTorch's own AdamW is used in place of the deprecated `transformers.AdamW`.
# `weight_decay=0.0` keeps the default of the transformers implementation
# (torch's default is 0.01), so the update rule stays as it was.
# `get_linear_schedule_with_warmup` comes from the import cell at the top of
# the notebook.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 10

# The scheduler is stepped once per batch in the training loop, so the total
# number of steps is batches-per-epoch times epochs. No warm-up phase is used.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



# Model Training

In [None]:
# Here we define two functions for computing the accuracy and f1 score.

def f1_score_func(preds, labels, average='macro'):
    """Compute the F1 score of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores; the predicted class of a row is
            the index of its largest value along axis 1.
        labels: array of true class ids; flattened before scoring.
        average: averaging mode forwarded to `f1_score`. Defaults to 'macro',
            the behaviour this function always had; pass 'weighted' to obtain a
            support-weighted score instead.

    Returns:
        The F1 score as returned by `sklearn.metrics.f1_score`.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average=average)

def accuracy_per_class(preds, labels):
    """Print, for each class occurring in `labels`, the number of correct predictions.

    `preds` holds per-class scores (the arg-max along axis 1 is the predicted
    class id) and `labels` the true class ids. Class names are looked up by
    inverting the global `label_dict`. Nothing is returned.
    """
    id_to_genre = dict((class_id, genre) for genre, class_id in label_dict.items())

    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()

    for class_id in np.unique(true_ids):
        belongs_to_class = true_ids == class_id
        hits = belongs_to_class & (predicted_ids == class_id)
        n_correct = len(predicted_ids[hits])
        n_total = len(true_ids[belongs_to_class])
        print(f'Class: {id_to_genre[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random
# number generators with the same value.
seed_val = 17
for seed_everything in (random.seed,
                        np.random.seed,
                        torch.manual_seed,
                        torch.cuda.manual_seed_all):
    seed_everything(seed_val)

def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without tracking gradients.

    Every batch is indexed as (input_ids, attention_mask, labels) and moved to
    the global `device` before the forward pass.

    Returns:
        A tuple (average loss per batch, logits, labels); logits and labels are
        NumPy arrays concatenated over all batches along axis 0.

    Note: the model is put into eval mode and is not switched back.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # With labels supplied, the first output is the loss and the second the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

# Fine-tuning loop: one pass over the training data per epoch, followed by an
# evaluation on the validation dataloader.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Limit the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

        # Show the batch loss as is. `len(batch)` is the number of tensors in
        # the batch tuple (input ids, attention mask, labels), not the number
        # of samples, so dividing by it only distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})


    # NOTE: the evaluation cell at the end of the notebook loads
    # 'finetuned_BERT_epoch_4.model'; that file only exists if this line has
    # been enabled during a training run.
    # torch.save(model.state_dict(), f'/content/drive/MyDrive/Datasets/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    # f1_score_func computes the macro-averaged F1, so label it accordingly.
    tqdm.write(f'F1 Score (Macro): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.66103066752851
Validation loss: 1.3511960104924245
F1 Score (Weighted): 0.2877226392973596


Epoch 2:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.1929228488057853
Validation loss: 1.2440359404338415
F1 Score (Weighted): 0.33779789077671263


Epoch 3:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.976537993222475
Validation loss: 1.210808595159005
F1 Score (Weighted): 0.3730243680139949


Epoch 4:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.7931312650814653
Validation loss: 1.2813416126565045
F1 Score (Weighted): 0.3978945810598138


Epoch 5:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.6346126782000064
Validation loss: 1.3587424626453317
F1 Score (Weighted): 0.40696325302547254


Epoch 6:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.5049310047980398
Validation loss: 1.479364458830253
F1 Score (Weighted): 0.4102367301453686


Epoch 7:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.41218363687582316
Validation loss: 1.576195628382781
F1 Score (Weighted): 0.4149479976899872


Epoch 8:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.3449791372818872
Validation loss: 1.6619819711685577
F1 Score (Weighted): 0.41378352438670096


Epoch 9:   0%|          | 0/2000 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.2918302150620148
Validation loss: 1.7322030764487513
F1 Score (Weighted): 0.4152816609045111


Epoch 10:   0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Evaluating the model

# Rebuild the classifier with the same configuration used for training ...
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# ... then overwrite its weights with a checkpoint saved during fine-tuning.
# The checkpoint is read onto the CPU first and copied into the model's
# parameters by `load_state_dict`.
checkpoint_state = torch.load('/content/drive/MyDrive/Datasets/finetuned_BERT_epoch_4.model',
                              map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)

# Per-class hit counts on the validation set; the loss is not needed here.
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)