# Deep Learning
## HW4 - Problem 3

Name: Amin Robatian

Student Number: 400301075

In [1]:
# Install Required Packages

#!pip install transformers
#!pip install -q hazm

In [2]:
# Download Dataset from Github

#!git clone https://github.com/aminrobatian/Persian_poems_corpus.git

%cd ./Persian_poems_corpus/normalized 

/home/dllabsharif/Robatian/Persian_poems_corpus/normalized


In [3]:
# Import required packages
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
%matplotlib inline

import gc

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torchsummary import summary

from transformers import BertTokenizer
from transformers import BertModel

import hazm

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Preprocessing Data

In [4]:
poets = [['Vahshi Bafqi', 'vahshi_norm.txt', 0],
         ['Jami', 'jami_norm.txt', 1],
         ['Asadi Tusi', 'asadi_norm.txt', 2],
         ['Attar of Nishapur', 'attar_norm.txt', 3],
         ['Mohammad Taqi Bahar', 'bahar_norm.txt', 4],
         ['Farrokhi Yazdi', 'farrokhi_norm.txt', 5],
         ['Ferdowsi', 'ferdousi_norm.txt', 6],
         ['Shah Nimatullah Wali', 'shahnematollah_norm.txt', 7],
         ['Khwaju Kermani', 'khajoo_norm.txt', 8],
         ['Rumi', 'moulavi_norm.txt', 9]]
poets_df = pd.DataFrame(poets, columns=['PoetName', 'Nameinfolders', 'PoetID'])
poets_df

Unnamed: 0,PoetName,Nameinfolders,PoetID
0,Vahshi Bafqi,vahshi_norm.txt,0
1,Jami,jami_norm.txt,1
2,Asadi Tusi,asadi_norm.txt,2
3,Attar of Nishapur,attar_norm.txt,3
4,Mohammad Taqi Bahar,bahar_norm.txt,4
5,Farrokhi Yazdi,farrokhi_norm.txt,5
6,Ferdowsi,ferdousi_norm.txt,6
7,Shah Nimatullah Wali,shahnematollah_norm.txt,7
8,Khwaju Kermani,khajoo_norm.txt,8
9,Rumi,moulavi_norm.txt,9


In [5]:
data = pd.DataFrame(columns=['beyt', 'poet'])


for i in range(10):
  df = pd.read_csv(poets_df.Nameinfolders[i], header = None)
    
  lst = int(len(df)/2) * [' ']
  df_beyt = pd.DataFrame(lst)

  for k in range(int(len(df)/2)):
    df_beyt.iloc[k] = df.iloc[2*k] + ' [SEP] ' + df.iloc[2*k+1]
  

  df_beyt["poet"] = poets_df.PoetName[i]
  df_beyt.columns = ['beyt', 'poet']
  data = pd.concat([data, df_beyt], ignore_index=True, sort=False)


data = data.sample(frac=1).reset_index(drop=True)

data

Unnamed: 0,beyt,poet
0,اگر در سخن موی کافد همی [SEP] به تاریکی اندر ب...,Ferdowsi
1,آن شفاعت و آن دعا نه از رحم خود [SEP] می کند آ...,Rumi
2,ز بس طپانچه که هر شب بروی برزدمی [SEP] بزور بو...,Farrokhi Yazdi
3,یکی باشد در آنجا هر چه بینی [SEP] اگر تو مرد ر...,Attar of Nishapur
4,تو هم ای دایه زبن هنر بشکن [SEP] دل ما سوختی د...,Mohammad Taqi Bahar
...,...,...
280251,اگر تو پاکباز آیی درین راه [SEP] چو ما بیشک رس...,Attar of Nishapur
280252,ببیند که آن دو دلاور کیند [SEP] بران کوه سر بر...,Ferdowsi
280253,وآنک او آن نور را بینا بود [SEP] شرح او کی کار...,Rumi
280254,همه عشقست اگر خود بازیابی [SEP] ز عشق اینجا حق...,Attar of Nishapur


## Train, Validation, Test split

In [6]:
df_train, df_val, df_test = np.split(data.sample(frac=1, random_state=42), 
                                     [int(.8*len(data)), int(.9*len(data))])


print(f"Train, Validation, Test split (80:10:10)")
print(f"Train Length: {len(df_train):>17,} \nValidation Length: \
      {len(df_val):,} \nTest Length: {len(df_test):>17,}")

Train, Validation, Test split (80:10:10)
Train Length:           224,204 
Validation Length:       28,026 
Test Length:            28,026


In [7]:
# calculate the length of comments based on their words
data['beyt_len_by_words'] = data['beyt'].apply(lambda t: len(hazm.word_tokenize(t)))

min_max_len = data["beyt_len_by_words"].min(), data["beyt_len_by_words"].max()
print(f'Min Beyt Len by Words: {min_max_len[0]} \tMax Beyt Len by Words: {min_max_len[1]}')

Min Beyt Len by Words: 6 	Max Beyt Len by Words: 29


# Part (A) - Pretrained BERT

## Tokenizer

In [8]:
tokenizer = BertTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")

In [9]:
example_text = df_train.beyt[0]
bert_input = tokenizer(example_text,padding='max_length', max_length = 32, 
                       truncation=True, return_tensors="pt")


print(f"input_ids: \n{bert_input['input_ids']}\n")
print(f"token_type_ids: \n{bert_input['token_type_ids']}\n")
print(f"attention_mask: \n{bert_input['attention_mask']}\n")

example_text = tokenizer.decode(bert_input.input_ids[0])

print(f"Formatted Sequence: \n{example_text}")

input_ids: 
tensor([[    2,  3064,  2786,  4402, 12378,  6941,  2013, 27131,     4,  2789,
         10973, 11840, 65322,  2013, 27131,     4,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

token_type_ids: 
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

attention_mask: 
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

Formatted Sequence: 
[CLS] اگر در سخن موی کافد همی [SEP] به تاریکی اندر ببافد همی [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


## Dataset Class

In [10]:
labels = dict(zip(poets_df.PoetName, poets_df.PoetID))
labels

{'Vahshi Bafqi': 0,
 'Jami': 1,
 'Asadi Tusi': 2,
 'Attar of Nishapur': 3,
 'Mohammad Taqi Bahar': 4,
 'Farrokhi Yazdi': 5,
 'Ferdowsi': 6,
 'Shah Nimatullah Wali': 7,
 'Khwaju Kermani': 8,
 'Rumi': 9}

In [11]:
class PersianPoemsDataset(torch.utils.data.Dataset):
    """ Create a PyTorch dataset for Persian poems corpus. """

    def __init__(self, df):

        self.labels = [labels[label] for label in df['poet']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 32, truncation=True,
                                return_tensors="pt") for text in df['beyt']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

## Model Building

In [12]:
ParsBERT = BertModel.from_pretrained('HooshvareLab/bert-fa-base-uncased')
print(ParsBERT)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(100000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [13]:
pytorch_total_params = sum(p.numel() for p in ParsBERT.parameters() if p.requires_grad)
print(f"Number of Trainable Parameters Before Freezing the Model: {pytorch_total_params:,}")

for param in ParsBERT.parameters():
  param.requires_grad = False

pytorch_total_params = sum(p.numel() for p in ParsBERT.parameters() if p.requires_grad)
print(f"Number of Trainable Parameters After Freezing the Model: {pytorch_total_params:,}")

Number of Trainable Parameters Before Freezing the Model: 162,841,344
Number of Trainable Parameters After Freezing the Model: 0


In [14]:
class BertClassifier(nn.Module):

    def __init__(self):

        super(BertClassifier, self).__init__()

        self.bert = ParsBERT
        self.linear = nn.Linear(768, 10)

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        linear_output = self.linear(pooled_output)

        return linear_output

## Training Loop

In [15]:
def train(model, train_data, val_data, learning_rate, epochs, criterion, optimizer):

    train, val = PersianPoemsDataset(train_data), PersianPoemsDataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {(100 * total_acc_train / len(train_data)): .2f}% \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {(100 * total_acc_val / len(val_data)): .2f}%')
            
            avg_loss = total_loss_val / len(val_data)
            
    return avg_loss

In [16]:
gc.collect()
torch.cuda.empty_cache()

model = BertClassifier()

In [17]:
EPOCHS = 5
LR = 5e-5
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr= LR)
              
avg_loss = train(model, df_train, df_val, LR, EPOCHS, criterion, optimizer)

# Saving the Model
%cd /home/dllabsharif/Robatian/
torch.save(model, 'Pretrained_BERT.pth')

100%|██████████████████████████████████████| 7007/7007 [01:09<00:00, 101.35it/s]


Epochs: 1 | Train Loss:  0.055                 | Train Accuracy:  41.79%                 | Val Loss:  0.052                 | Val Accuracy:  44.52%


100%|██████████████████████████████████████| 7007/7007 [01:08<00:00, 101.88it/s]


Epochs: 2 | Train Loss:  0.051                 | Train Accuracy:  45.75%                 | Val Loss:  0.050                 | Val Accuracy:  46.51%


100%|██████████████████████████████████████| 7007/7007 [01:08<00:00, 101.88it/s]


Epochs: 3 | Train Loss:  0.049                 | Train Accuracy:  47.13%                 | Val Loss:  0.049                 | Val Accuracy:  47.18%


100%|██████████████████████████████████████| 7007/7007 [01:08<00:00, 101.89it/s]


Epochs: 4 | Train Loss:  0.048                 | Train Accuracy:  47.95%                 | Val Loss:  0.048                 | Val Accuracy:  48.07%


100%|██████████████████████████████████████| 7007/7007 [01:08<00:00, 101.89it/s]


Epochs: 5 | Train Loss:  0.048                 | Train Accuracy:  48.58%                 | Val Loss:  0.048                 | Val Accuracy:  48.37%
/home/dllabsharif/Robatian


## Evaluate Model on Test Data

In [18]:
def evaluate(model, test_data):

    test = PersianPoemsDataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        
        y_pred = []
        y_true = []

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
            
              output = output.argmax(dim=1).data.cpu().numpy()
              y_pred.extend(output) # Save Prediction
              test_label = test_label.data.cpu().numpy()
              y_true.extend(test_label) # Save Truth
            
            
    
    print(f'Avg Loss: {avg_loss:>6f}\n')
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {(100*accuracy):>0.2f}%\n')
    f1 = f1_score(y_true, y_pred, average='micro')
    print(f'F1 score: {(100*f1):>0.2f}%\n')
    # constant for classes
    classes = [i for i in labels]  
    # Build confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred)
    df_cm = pd.DataFrame(cf_matrix/np.sum(cf_matrix) * 100, index = [i for i in classes])
    print('Confusion Matrix:\n')
    print(round(df_cm, 2))

# Results

In [19]:
%cd /home/dllabsharif/Robatian/
model = torch.load('Pretrained_BERT.pth')
model.eval()

evaluate(model, df_test)

/home/dllabsharif/Robatian
Avg Loss: 0.047709

Accuracy: 48.26%

F1 score: 48.26%

Confusion Matrix:

                         0     1     2      3     4     5      6     7     8  \
Vahshi Bafqi          0.03  0.56  0.01   2.13  0.16  0.01   0.39  0.11  0.04   
Jami                  0.00  2.63  0.01   6.75  0.33  0.02   1.70  0.17  0.02   
Asadi Tusi            0.00  0.21  0.04   0.86  0.07  0.01   1.86  0.01  0.01   
Attar of Nishapur     0.01  0.96  0.02  29.32  0.30  0.04   2.34  0.37  0.06   
Mohammad Taqi Bahar   0.01  0.67  0.02   3.46  0.97  0.03   1.52  0.12  0.01   
Farrokhi Yazdi        0.00  0.21  0.02   2.35  0.24  0.20   1.13  0.05  0.03   
Ferdowsi              0.01  0.44  0.01   4.05  0.25  0.03  12.53  0.08  0.01   
Shah Nimatullah Wali  0.01  0.24  0.00   3.41  0.05  0.01   0.17  1.31  0.04   
Khwaju Kermani        0.00  0.21  0.00   2.21  0.08  0.01   0.17  0.13  0.19   
Rumi                  0.00  0.38  0.01   6.65  0.22  0.04   0.90  0.22  0.03   

                 

# Part (B) - BERT Fine-Tuning

# 1.   SGD Optimizer
---

In [20]:
class BertClassifier(nn.Module):

    def __init__(self):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('HooshvareLab/bert-fa-base-uncased')
        self.linear = nn.Linear(768, 10)

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        linear_output = self.linear(pooled_output)

        return linear_output

In [21]:
gc.collect()
torch.cuda.empty_cache()

model_SGD = BertClassifier()

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
EPOCHS = 3
LR = 5e-5
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model_SGD.parameters(), lr= LR, momentum=0.9)
              
avg_loss = train(model_SGD, df_train, df_val, LR, EPOCHS, criterion, optimizer)

# Saving the Model
%cd /home/dllabsharif/Robatian/
torch.save(model_SGD, 'Fine_Tuned_BERT_SGD.pth')

100%|███████████████████████████████████████| 7007/7007 [04:08<00:00, 28.15it/s]


Epochs: 1 | Train Loss:  0.045                 | Train Accuracy:  52.72%                 | Val Loss:  0.036                 | Val Accuracy:  61.03%


100%|███████████████████████████████████████| 7007/7007 [04:09<00:00, 28.03it/s]


Epochs: 2 | Train Loss:  0.032                 | Train Accuracy:  65.29%                 | Val Loss:  0.030                 | Val Accuracy:  67.86%


100%|███████████████████████████████████████| 7007/7007 [04:09<00:00, 28.04it/s]


Epochs: 3 | Train Loss:  0.027                 | Train Accuracy:  70.09%                 | Val Loss:  0.027                 | Val Accuracy:  70.58%
/home/dllabsharif/Robatian


# SGD Optimizer Results

In [23]:
%cd /home/dllabsharif/Robatian/
model_SGD = torch.load('Fine_Tuned_BERT_SGD.pth')
model_SGD.eval()

evaluate(model_SGD, df_test)

/home/dllabsharif/Robatian
Avg Loss: 0.026916

Accuracy: 70.64%

F1 score: 70.64%

Confusion Matrix:

                         0     1     2      3     4     5      6     7     8  \
Vahshi Bafqi          0.35  1.23  0.01   0.80  0.49  0.10   0.04  0.11  0.24   
Jami                  0.03  9.90  0.03   0.99  0.27  0.01   0.09  0.17  0.00   
Asadi Tusi            0.01  0.44  0.52   0.33  0.14  0.06   1.53  0.00  0.00   
Attar of Nishapur     0.11  1.99  0.02  28.12  0.36  0.13   0.34  0.54  0.16   
Mohammad Taqi Bahar   0.09  1.66  0.02   0.71  2.96  0.41   0.30  0.14  0.16   
Farrokhi Yazdi        0.05  0.39  0.02   0.33  0.62  2.40   0.13  0.05  0.08   
Ferdowsi              0.00  0.59  0.31   0.59  0.22  0.11  15.65  0.00  0.00   
Shah Nimatullah Wali  0.04  0.51  0.00   0.77  0.09  0.04   0.00  3.67  0.09   
Khwaju Kermani        0.08  0.15  0.00   0.61  0.16  0.16   0.00  0.13  1.60   
Rumi                  0.04  0.87  0.00   2.25  0.34  0.14   0.02  0.27  0.10   

                 

# Part (B) - BERT Fine-Tuning

# 2.   Adam Optimizer
---

In [24]:
gc.collect()
torch.cuda.empty_cache()

model_Adam = BertClassifier()

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
EPOCHS = 3
LR = 5e-5
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model_Adam.parameters(), lr= LR)
              
avg_loss = train(model_Adam, df_train, df_val, LR, EPOCHS, criterion, optimizer)

# Saving the Model
%cd /home/dllabsharif/Robatian/
torch.save(model_Adam, 'Fine_Tuned_BERT_Adam.pth')

100%|███████████████████████████████████████| 7007/7007 [05:15<00:00, 22.24it/s]


Epochs: 1 | Train Loss:  0.024                 | Train Accuracy:  74.14%                 | Val Loss:  0.018                 | Val Accuracy:  80.13%


100%|███████████████████████████████████████| 7007/7007 [05:16<00:00, 22.15it/s]


Epochs: 2 | Train Loss:  0.013                 | Train Accuracy:  86.36%                 | Val Loss:  0.016                 | Val Accuracy:  81.83%


100%|███████████████████████████████████████| 7007/7007 [05:16<00:00, 22.15it/s]


Epochs: 3 | Train Loss:  0.007                 | Train Accuracy:  92.01%                 | Val Loss:  0.017                 | Val Accuracy:  82.75%
/home/dllabsharif/Robatian


# Adam Optimizer Results

In [26]:
%cd /home/dllabsharif/Robatian/
model_Adam = torch.load('Fine_Tuned_BERT_Adam.pth')
model_Adam.eval()

evaluate(model_Adam, df_test)

/home/dllabsharif/Robatian
Avg Loss: 0.017121

Accuracy: 82.89%

F1 score: 82.89%

Confusion Matrix:

                         0      1     2      3     4     5      6     7     8  \
Vahshi Bafqi          1.55   0.60  0.01   0.49  0.44  0.11   0.02  0.10  0.10   
Jami                  0.16  10.02  0.06   0.66  0.47  0.05   0.05  0.09  0.01   
Asadi Tusi            0.02   0.10  1.89   0.11  0.04  0.02   0.92  0.00  0.00   
Attar of Nishapur     0.18   0.54  0.06  30.56  0.50  0.09   0.15  0.29  0.14   
Mohammad Taqi Bahar   0.10   0.54  0.11   0.46  4.91  0.25   0.24  0.06  0.09   
Farrokhi Yazdi        0.02   0.05  0.04   0.15  0.43  3.47   0.07  0.01  0.09   
Ferdowsi              0.04   0.05  0.61   0.16  0.05  0.00  16.64  0.00  0.00   
Shah Nimatullah Wali  0.02   0.24  0.00   0.38  0.12  0.02   0.00  4.26  0.08   
Khwaju Kermani        0.08   0.05  0.00   0.27  0.17  0.05   0.00  0.14  2.33   
Rumi                  0.02   0.15  0.00   1.44  0.32  0.10   0.02  0.12  0.06   

      

# Part (C) - Perplexity

The **perplexity** of the model m is the exponential of its cross entropy:

$Perplexity(m)=2^{-\sum_{i=1}^{n}p(x_i)log_2 m(x_i)}$


In [27]:
def perplexity(model1, model2, test_data):

    test = PersianPoemsDataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=100)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model1 = model1.cuda()
        model2 = model2.cuda()

    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            
            test_label = test_label.to(device) # targets
            
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            
            # model outputs / logits
            output1 = model1(input_id, mask)
            output2 = model2(input_id, mask)
            
            # getting loss using cross entropy
            loss1 = F.cross_entropy(output1, test_label)
            loss2 = F.cross_entropy(output2, test_label)
            
            # calculating perplexity
            perplexity1  = torch.exp(loss1)
            perplexity2  = torch.exp(loss2)
            
            return loss1, loss2, perplexity1, perplexity2
        

In [28]:
loss1, loss2, perplexity1, perplexity2 = perplexity(model, model_Adam, df_test)

loss1 = loss1.cpu().numpy()
loss2 = loss2.cpu().numpy()
perplexity1 = perplexity1.cpu().numpy()
perplexity2 = perplexity2.cpu().numpy()

# **Perplexity Before Fine-tuning:**

In [29]:
print(f"Before Fine-tuning:")  
print(f"PP = {perplexity1:>0.4f}")  

Before Fine-tuning:
PP = 6.1174


# **Perplexity After Fine-tuning:**

In [30]:
print(f"After Fine-tuning:")  
print(f"PP = {perplexity2:>0.4f}")  

After Fine-tuning:
PP = 1.8737
