<a href="https://colab.research.google.com/github/bhadreshpsavani/NLP-Notes/blob/master/ExperimentingWithBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Dependencies

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 15.6MB/s eta 0:00:01[K     |▉                               | 20kB 4.4MB/s eta 0:00:01[K     |█▎                              | 30kB 5.5MB/s eta 0:00:01[K     |█▊                              | 40kB 5.7MB/s eta 0:00:01[K     |██▏                             | 51kB 4.9MB/s eta 0:00:01[K     |██▋                             | 61kB 5.4MB/s eta 0:00:01[K     |███                             | 71kB 5.7MB/s eta 0:00:01[K     |███▍                            | 81kB 6.1MB/s eta 0:00:01[K     |███▉                            | 92kB 6.2MB/s eta 0:00:01[K     |████▎                           | 102kB 6.0MB/s eta 0:00:01[K     |████▊                           | 112kB 6.0MB/s eta 0:00:01[K     |█████▏                          | 122kB 6.0M

In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Import the Training Dataset

In [4]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [5]:
df.columns = ['SENTENCE', 'TARGET']
df.head()

Unnamed: 0,SENTENCE,TARGET
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


### Loading Pretrained Bert Tokenizer



In [7]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




### Preprocess Input

In [6]:
TRAIN_BATCH_SIZE = 4
TEST_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
MAX_LEN = 64

In [16]:
class Triage(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.SENTENCE[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.TARGET[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [17]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (6920, 2)
TRAIN Dataset: (5536, 2)
TEST Dataset: (1384, 2)


In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_set, **train_params)
testing_loader = torch.utils.data.DataLoader(testing_set, **test_params)

## Fine Tuning DistilBert

In [11]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = ppb.DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1)
    
    def forward(self, ids, mask):
        output_1= self.l1(ids, mask)
        output_2 = self.l2(output_1[0])
        output = self.l3(output_2)
        return output

In [12]:
model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [13]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        outputs = model(ids, mask).squeeze()
        optimizer.zero_grad()
        loss = loss_function(outputs, targets)
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [19]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  4.281152248382568
Epoch: 0, Loss:  1.4380497932434082
Epoch: 0, Loss:  0.8455644249916077
Epoch: 0, Loss:  0.7071265578269958
Epoch: 0, Loss:  0.6060503721237183
Epoch: 0, Loss:  0.9129670858383179
Epoch: 0, Loss:  0.5552871227264404
Epoch: 0, Loss:  0.7446519136428833
Epoch: 0, Loss:  0.4004485607147217
Epoch: 0, Loss:  0.3368568420410156
Epoch: 0, Loss:  0.3948967456817627
Epoch: 0, Loss:  0.5180408358573914
Epoch: 0, Loss:  0.1444416642189026
Epoch: 0, Loss:  0.09901189804077148
Epoch: 0, Loss:  0.6328852772712708
Epoch: 0, Loss:  0.19777247309684753
Epoch: 0, Loss:  0.6900107264518738
Epoch: 0, Loss:  0.5885286927223206
Epoch: 0, Loss:  0.7088488340377808
Epoch: 0, Loss:  0.5946123600006104
Epoch: 0, Loss:  0.3296729624271393
Epoch: 0, Loss:  0.6868255734443665
Epoch: 0, Loss:  0.13432401418685913
Epoch: 0, Loss:  0.6956230998039246
Epoch: 0, Loss:  0.5299274325370789
Epoch: 0, Loss:  0.2781771421432495
Epoch: 0, Loss:  0.4445715546607971
Epoch: 0, Loss:  0.0951070

In [24]:
def valid(model, testing_loader):
  model.eval()
  n_correct, n_wrong, total = (0, 0, 0)
  with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      target = data['targets'].to(device, dtype = torch.long)
      output = model(ids, mask).squeeze()
      big_val, big_idx = torch.max(output.data, dim=1)
      total+=target.size(0)
      n_correct+=(big_idx==target).sum().item()
  return (n_correct*100)/total

In [25]:
acc = valid(model, testing_loader)
print("This is the accuracy of our model", acc)

This is the accuracy of our model 88.15028901734104


In [28]:
output_model_file = 'model/pytorch_distilBert.bin'
output_vocab_file = 'model/vocab_distilBert.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print("All files saved")

All files saved
