### Review classification using Pre transformer models.

In [1]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
torch.__version__

'1.13.1'

In [9]:
## import continue, import Transformer
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [11]:
## Set Random Seed
torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
## Download data from github
url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]

with open(filename, 'wb') as f:
    r = requests.get(url)
    f.write(r.content)

with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in,f_out)

# Reading the csv to dataframe
df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [17]:
## check the dataset
df.shape
df.describe

<bound method NDFrame.describe of                                                   review  sentiment
0      In 1974, the teenager Martha Moxley (Maggie Gr...          1
1      OK... so... I really like Kris Kristofferson a...          0
2      ***SPOILER*** Do not read this, if you think a...          0
3      hi for all the people who have seen this wonde...          1
4      I recently bought the DVD, forgetting just how...          0
...                                                  ...        ...
49995  OK, lets start with the best. the building. al...          0
49996  The British 'heritage film' industry is out of...          0
49997  I don't even know where to begin on this one. ...          0
49998  Richard Tyler is a little boy who is scared of...          0
49999  I waited long to watch this movie. Also becaus...          1

[50000 rows x 2 columns]>

## Create Train validation and test Set

In [21]:
X_train = df.iloc[:35000]['review'].values
y_train = df.iloc[:35000]['sentiment'].values

X_val = df.iloc[35000:40000]['review'].values
y_val = df.iloc[35000:40000]['sentiment'].values

X_test = df.iloc[40000:]['review'].values
y_test = df.iloc[40000:]['sentiment'].values

print(f"X_train shape : {X_train.shape} y_train shape : {y_train.shape} ")
print(f"X_val shape : {X_val.shape} y_val shape : {y_val.shape} ")
print(f"X_test shape : {X_test.shape} y_test shape : {y_test.shape} ")



X_train shape : (35000,) y_train shape : (35000,) 
X_val shape : (5000,) y_val shape : (5000,) 
X_test shape : (10000,) y_test shape : (10000,) 


## Tokennize the datset using Dataset api from torch

In [24]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
X_train_encoding = tokenizer(list(X_train), truncation=True, padding=True)
X_val_encoding = tokenizer(list(X_val), truncation=True, padding=True)
X_test_encoding = tokenizer(list(X_test), truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [25]:
### Create Dataset using torch.util.data.Dataset

In [54]:
from torch.utils.data import Dataset

class ImdbDataSet(Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key : torch.tensor(val[idx]) for key,val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

        

### create Dataset from using above class

In [55]:
train_dataset = ImdbDataSet(X_train_encoding,y_train)
val_dataset = ImdbDataSet(X_val_encoding, y_val)
test_dataset = ImdbDataSet(X_test_encoding, y_val)
len(train_dataset)

35000

## Create loaders for Dataset

In [56]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### Loading and FineTuning a Pretrained BERT model

In [57]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [60]:
## Create Optimizer and Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [65]:
from torch import nn
def compute_accuracy(model: nn.modules, data_loader : DataLoader, device):
    with torch.inference_mode():
        correct_pred, num_examples = 0,0
        for batch_idx, batch in enumerate(data_loader):
            ## prepeare data
            input_ids = batch['input_ids'].to(device)
            
            attention_masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids,attention_masks)
            
            ## fetch logits
            logits = outputs['logits']
            ## 
            predicted_labels = torch.argmax(logits,1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float()/num_examples * 100

### Trainig loop

In [None]:
start_time = time.time()
EPOCHS = 3

for epoch in range(EPOCHS):
    #Traing mode
    model.train()
    
    for batch_idx, batch in enumerate(train_loader):
        ## Prepare data
        input_ids = batch['input_ids'].to(device)
        attention_mask  = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # forward pass
        outputs = model(input_ids,attention_mask=attention_mask,labels=labels)
        
        logits = outputs['logits']
        # get loss
        loss = outputs['loss']
        
        # Zero grad and backprop
        optimizer.zero_grad()
        loss.backward()
        
        # Optimizer step
        optimizer.step()
        
        ## Some logging for visibility during training
        
        
        if batch_idx % 250 == 0:
            print(f"Epoch : {epoch}/{EPOCHS} | Batch : {batch_idx}/ {len(train_loader)} | loss : {loss:4f}")
        
        
        ## evaluate the model
        model.eval()
        
        with torch.inference_mode():
            print(f"Training accuracy : {compute_accuracy(model,train_loader,device):.2f}%")
            print(f"validation accuracy : {compute_accuracy(model,valid_loader,device):.2f}%")
            print(f"Testing accuracy : {compute_accuracy(model,test_loader,device):.2f}%")
    print(f"Time elapsed : {(time.time()- start_time)/60:.2f} mins")

print(f"Total traning time : {(time.time() - start_time)/60: .2f} mins")
print(f"TEst accuracy : {compute_accuracy(model, test_loader, device=device): .2f}%")
            

Epoch : 0/3 | Batch : 0/ 1094 | loss : 0.942276
