In [None]:
from multiprocessing import reduction
import pandas as pd
import time
import numpy as np
import csv
import argparse
import math
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification, AutoModel, BertPreTrainedModel
from torch import cuda
import os
import torch.nn as nn
import torch.nn.functional as F
from transformers.modeling_outputs import TokenClassifierOutput
import warnings
from sklearn.model_selection import train_test_split

# Report the CUDA version this PyTorch build was compiled against.
print(torch.version.cuda)
# Token budget per example. NOTE(review): the dataset class below carries its own 512
# rather than reading this constant, so changing it here alone has no effect.
MAX_LEN = 512 # suitable for all datasets
# Mini-batch size handed to every DataLoader in the main cell.
BATCH_SIZE = 8
# Learning rate handed to the Adam optimizer in the main cell.
LEARNING_RATE = 1e-5
# Number of sentiment classes (positive / negative); passed to AutoConfig in the main cell.
num_labels = 2

12.1


In [None]:
# List the working directory to confirm 'IMDB Dataset.csv' is present before loading it.
!ls

 drive	'IMDB Dataset.csv'   live.txt   sample_data


In [None]:
# Load the complete IMDB review table; the CSV is comma-separated and its header row
# supplies the column names used below ('review', 'sentiment').
data = pd.read_csv('IMDB Dataset.csv', sep=',')

In [None]:
# Peek at the first five rows to confirm the columns loaded as expected.
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
class dataset(Dataset):
    """Map-style dataset that turns pre-tokenized reviews into fixed-length BERT inputs.

    Each item is wrapped in ``[CLS]`` / ``[SEP]``, truncated or padded with ``[PAD]``
    to exactly ``max_len`` tokens, and converted to ids with the supplied tokenizer.

    Args:
        sentences: Sequence of token lists (one list of string tokens per example).
        labels: Sequence of label strings, each ``'positive'`` or ``'negative'``.
        tokenizer: Object exposing ``convert_tokens_to_ids(list_of_tokens)``.
        max_len: Total sequence length including the two special tokens
            (default 512, the value this class previously hard-coded).

    Raises:
        ValueError: If ``sentences`` and ``labels`` differ in length or ``max_len < 2``.
    """

    # Label vocabulary shared by all instances; built once instead of on every item access.
    LABEL_TO_ID = {'positive': 0, 'negative': 1}
    ID_TO_LABEL = {0: 'positive', 1: 'negative'}

    def __init__(self, sentences, labels, tokenizer, max_len=512):
        if len(sentences) != len(labels):
            raise ValueError(
                f'sentences and labels must have the same length, '
                f'got {len(sentences)} and {len(labels)}'
            )
        if max_len < 2:
            raise ValueError('max_len must leave room for [CLS] and [SEP]')
        self.sentences = sentences
        self.labels = labels
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.len = len(labels)

    def __getitem__(self, index):
        """Return the encoded example at ``index``.

        Returns:
            dict with keys ``'index'`` (the index passed in), ``'ids'`` and ``'mask'``
            (long tensors of length ``max_len``) and ``'target'`` (long tensor of
            shape ``(1,)`` holding the label id).

        Raises:
            KeyError: If the stored label is not ``'positive'`` or ``'negative'``.
        """
        sentence = self.sentences[index]
        label = self.labels[index]

        # Truncate the review *before* adding the special tokens so that an over-long
        # review still ends with [SEP] (slicing afterwards used to cut it off).
        body = list(sentence[: self.max_len - 2])
        tokens = ['[CLS]'] + body + ['[SEP]']

        # Real tokens get attention 1, padding positions get 0.
        pad_count = self.max_len - len(tokens)
        attn_mask = [1] * len(tokens) + [0] * pad_count
        tokens = tokens + ['[PAD]'] * pad_count

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        target = [self.LABEL_TO_ID[label]]

        return {
            'index': index,
            'ids': torch.tensor(token_ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }

    def __len__(self):
        """Number of examples (taken from the length of ``labels``)."""
        return self.len

In [None]:
# Extract the raw review texts and their sentiment strings as plain Python lists,
# then show the first few labels as a sanity check.
sentence_list = data.loc[:, 'review'].tolist()
labels = data.loc[:, 'sentiment'].tolist()
labels[:5]

['positive', 'positive', 'positive', 'negative', 'positive']

In [None]:
# Preview the first two raw reviews; note the text still contains HTML breaks such as <br />.
sentence_list[:2]

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [None]:
# Load the tokenizer for the bert-base-uncased checkpoint (the same checkpoint name the model cells use).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenize every review: split on single spaces, run the tokenizer on each
# space-delimited word, and concatenate the resulting pieces into one flat list per review.
tokenized_sentences = [
    [piece for raw_word in review.split(' ') for piece in tokenizer.tokenize(raw_word)]
    for review in sentence_list
]

In [None]:
# Wrap the tokenized reviews and their labels in the torch Dataset defined above.
full_dataset = dataset(tokenized_sentences, labels, tokenizer)

In [None]:
# Spot-check the first encoded example: a dict with 'index', 'ids', 'mask' and 'target'.
full_dataset[0]

{'index': 0,
 'ids': tensor([  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,  2044,
          3666,  2074,  1015, 11472,  2792,  2017,  1005,  2222,  2022, 13322,
          1012,  2027,  2024,  2157,  1010,  2004,  2023,  2003,  3599,  2054,
          3047,  2007,  2033,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
          1013,  1028,  1996,  2034,  2518,  2008,  4930,  2033,  2055, 11472,
          2001,  2049, 24083,  1998,  4895, 10258,  2378,  8450,  5019,  1997,
          4808,  1010,  2029,  2275,  1999,  2157,  2013,  1996,  2773,  2175,
          1012,  3404,  2033,  1010,  2023,  2003,  2025,  1037,  2265,  2005,
          1996,  8143, 18627,  2030,  5199,  3593,  1012,  2023,  2265,  8005,
          2053, 17957,  2007, 12362,  2000,  5850,  1010,  3348,  2030,  4808,
          1012,  2049,  2003, 13076,  1010,  1999,  1996,  4438,  2224,  1997,
          1996,  2773,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
          1028,  2009,  2003,  2

In [None]:
# Split 80% / 10% / remainder into train / development / test subsets.
# A seeded generator makes the membership of each split reproducible across kernel
# restarts; without it every run trains and evaluates on different examples.
SPLIT_SEED = 42
train_size = int(0.8 * len(full_dataset))
devel_size = int(0.1 * len(full_dataset))
# The test split absorbs whatever is left after integer truncation of the other two sizes.
test_size = len(full_dataset) - train_size - devel_size
train_dataset, devel_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, devel_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Spot-check one development example; its 'index' field is the position in full_dataset.
devel_dataset[0]

{'index': 82,
 'ids': tensor([  101,  2821,  2643,  1010,  1045,  2442,  2031,  2464,  2023,  2043,
          1045,  2001,  2069,  2340,  2030,  4376,  1010,  1006,  2123,  1005,
          1056,  3198,  2129,  1007,  1045,  2089,  2031,  2042,  2402,  1010,
          2021,  1045,  2347,  1005,  1056,  5236,  1012,  3087,  2071,  2156,
          2008,  2023,  2003,  1037,  2919,  3185,  1010, 11808,  1010,  7977,
          1010,  4895, 15782,  2854,  1998,  2200, 10021,  1012,  1045,  1005,
          2310,  2464,  2062,  8052,  3896,  2012, 25104,  1010,  1045,  1005,
          2310,  2464,  2488,  4616,  2012,  1037,  2082,  2377,  1010,  1998,
          1045,  1005,  2310,  2464,  2062, 13359, 21843,  2015,  2012,  1996,
          9201,  1010,  2073,  2027,  2079,  2498,  2021,  4133,  1999,  1996,
          2300,  1010,  9217,  1996,  2336, 15135,  2006,  1996,  3221,  1012,
          1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,  2466,
          2003,  2275,  1999,  

In [None]:

class MainModel(BertPreTrainedModel):
    """BERT encoder followed by a two-layer linear head for sequence classification.

    The representation of the first token (``[CLS]``) from the encoder's last hidden
    state is passed through ``hidden_layer`` and ``classifier`` to produce one logit
    per class.

    Args:
        config: Model configuration; ``config.num_labels`` sets the number of classes.
    """

    def __init__(self, config):
        super(MainModel, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        # Read the width from the encoder itself instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        # Two fully connected layers.
        # NOTE(review): there is no activation between them, so together they form a
        # single affine map; left unchanged to keep the architecture as it was.
        self.hidden_layer = nn.Linear(hidden_size, 2 * self.num_labels)
        self.classifier = nn.Linear(2 * self.num_labels, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None, device=None):
        """Run the encoder and classification head.

        Args:
            input_ids: Token id tensor fed to the encoder.
            attention_mask: Mask tensor fed to the encoder.
            labels: Optional class-id tensor; flattened with ``view(-1)`` before the
                loss. When ``None`` no loss is computed.
            device: Unused; accepted so existing callers keep working.

        Returns:
            ``(loss_main, main_prob)`` where ``loss_main`` is the cross-entropy loss
            (or ``None`` without labels) and ``main_prob`` holds the softmax class
            probabilities, one row per input sequence.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        # Keep only the first position ([CLS]) of every sequence.
        cls_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.hidden_layer(cls_state))
        main_prob = F.softmax(logits, dim=1)

        loss_main = None
        if labels is not None:
            # cross_entropy applies log-softmax internally, so it must receive the raw
            # logits; feeding it probabilities squashes the gradients and stalls training.
            loss_main = F.cross_entropy(logits, labels.view(-1))
        return loss_main, main_prob

In [None]:

def train(model, dataloader, optimizer, device, log_path='live.txt', log_every=100, max_grad_norm=10):
    """Run one training epoch and print/log running metrics.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=..., device=...)``
            returning ``(loss, class_probabilities)``.
        dataloader: Iterable of batches; each batch is a dict with ``'ids'``, ``'mask'``
            and ``'target'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        log_path: File that progress lines are appended to (default ``'live.txt'``).
        log_every: Emit a progress line every this many steps (default 100).
        max_grad_norm: Gradient-norm clipping threshold (default 10).

    Returns:
        None. The summed loss and the accuracy for the epoch are printed.
    """
    tr_loss = 0.0
    correct, seen = 0, 0
    # Put the model in training mode.
    model.train()

    for idx, batch in enumerate(dataloader):
        input_ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)

        loss_main, main_prob = model(input_ids=input_ids, attention_mask=mask, labels=targets, device=device)

        tr_loss += loss_main.item()
        predicted_labels = torch.argmax(main_prob, dim=1)
        flat_targets = targets.view(-1)
        # Count per example so a smaller final batch is not over-weighted, which is
        # what averaging per-batch accuracies would do.
        correct += (predicted_labels == flat_targets).sum().item()
        seen += flat_targets.numel()

        if idx % log_every == 0:
            print(f'\tModel loss at {idx} steps: {tr_loss}')
            if idx != 0:
                print(f'\tModel Accuracy : {correct / seen}')
            with open(log_path, 'a') as fh:
                fh.write(f'\tModel Loss at {idx} steps : {tr_loss}\n')
                if idx != 0:
                    # Trailing newline added so consecutive log entries do not run together.
                    fh.write(f'\tModel Accuracy : {correct / seen}\n')

        # Order matters: clear old gradients, compute fresh ones, clip those fresh
        # gradients, and only then step. Clipping before backward() acts on stale values.
        optimizer.zero_grad()
        loss_main.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(),
            max_norm=max_grad_norm
        )
        optimizer.step()

    # Guard against an empty dataloader.
    epoch_accuracy = correct / seen if seen else 0.0
    print(f'\tModel loss for the epoch: {tr_loss}')
    print(f'\tTraining accuracy for epoch: {epoch_accuracy}')


In [None]:


def valid(model, dataloader, device):
    """Evaluate the model on a validation dataloader.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=..., device=...)``
            returning ``(loss, class_probabilities)``.
        dataloader: Iterable of batches; each batch is a dict with ``'ids'``, ``'mask'``
            and ``'target'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(eval_loss, eval_accuracy)``: the loss summed over batches and the fraction
        of correctly classified examples (``0.0`` for an empty dataloader).
    """
    eval_loss = 0.0
    correct, seen = 0, 0
    model.eval()

    # No gradients are needed for evaluation; disabling autograd avoids building
    # graphs for every batch.
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['target'].to(device, dtype=torch.long)

            loss_main, main_prob = model(input_ids=input_ids, attention_mask=mask, labels=targets, device=device)
            eval_loss += loss_main.item()

            predicted_labels = torch.argmax(main_prob, dim=1)
            flat_targets = targets.view(-1)
            # Count per example so a smaller final batch is not over-weighted.
            correct += (predicted_labels == flat_targets).sum().item()
            seen += flat_targets.numel()

    eval_accuracy = correct / seen if seen else 0.0
    print(f'\tValidation accuracy for epoch: {eval_accuracy}')

    return eval_loss, eval_accuracy

In [None]:
def inference(model, dataloader, tokenizer, device):
    """Compute classification accuracy on a held-out dataloader.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=..., device=...)``
            returning ``(loss, class_probabilities)``.
        dataloader: Iterable of batches; each batch is a dict with ``'ids'``, ``'mask'``
            and ``'target'`` tensors.
        tokenizer: Unused; accepted so existing callers keep working.
        device: Device the batch tensors are moved to.

    Returns:
        Fraction of correctly classified examples (``0.0`` for an empty dataloader).
    """
    model.eval()
    correct, seen = 0, 0
    for batch in tqdm(dataloader, ncols=100):
        input_ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)
        with torch.no_grad():
            _, main_prob = model(input_ids=input_ids, attention_mask=mask, labels=targets, device=device)

        predicted_labels = torch.argmax(main_prob, dim=1)
        flat_targets = targets.view(-1)
        # Count per example so a smaller final batch is not over-weighted, which is
        # what averaging per-batch accuracies would do.
        correct += (predicted_labels == flat_targets).sum().item()
        seen += flat_targets.numel()

    test_accuracy = correct / seen if seen else 0.0
    return test_accuracy

In [None]:
# Main driver: build the model, fine-tune with early stopping on development accuracy,
# then score the best-performing weights on the test split.

config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
model = MainModel.from_pretrained("bert-base-uncased", config=config)
device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
# Evaluation order does not affect the metrics, so the held-out loaders are not shuffled.
devel_dataloader = DataLoader(devel_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)
num_epochs = 10
max_patience = 5  # stop once more than this many consecutive epochs fail to match the best accuracy
max_acc = 0.0
patience = 0
best_model = model
best_tokenizer = tokenizer
# Snapshot of the best weights. Assigning `best_model = model` only aliases the live
# model, which keeps changing, so the tensors themselves are copied (to CPU, to avoid
# doubling GPU memory).
best_state = None
start = time.time()
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}:')
    train(model, train_dataloader, optimizer, device)
    validation_loss, eval_acc = valid(model, devel_dataloader, device)
    print(f'\tValidation loss: {validation_loss}')
    if eval_acc >= max_acc:
        max_acc = eval_acc
        patience = 0
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        patience += 1
        if patience > max_patience:
            print("Early stopping at epoch : ",epoch)
            patience = 0
            break

end = time.time()
total_time = end - start

print(f"Total training time : {total_time}")

# Restore the best weights so the test score reflects the selected checkpoint rather
# than whatever the final epoch produced.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model

test_accuracy = inference(best_model, test_dataloader, best_tokenizer, device)
print(f'\t test accuracy: {test_accuracy}')

Some weights of MainModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['hidden_layer.bias', 'classifier.bias', 'classifier.weight', 'hidden_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:
	Model loss at 0 steps: 0.69343501329422
	Model loss for the epoch: 6.933956801891327
	Training accuracy for epoch: 0.5
	Validation accuracy for epoch: 0.625
	Validation loss: 1.3792420625686646
Epoch 2:
	Model loss at 0 steps: 0.6973687410354614
	Model loss for the epoch: 6.875142872333527
	Training accuracy for epoch: 0.5625
	Validation accuracy for epoch: 0.8125
	Validation loss: 1.349338412284851
Epoch 3:
	Model loss at 0 steps: 0.6857047080993652
	Model loss for the epoch: 6.830626010894775
	Training accuracy for epoch: 0.5625
	Validation accuracy for epoch: 0.8125
	Validation loss: 1.3269217014312744
Epoch 4:
	Model loss at 0 steps: 0.7046381831169128
	Model loss for the epoch: 6.747569978237152
	Training accuracy for epoch: 0.6625
	Validation accuracy for epoch: 0.5625
	Validation loss: 1.3461459875106812
Epoch 5:
	Model loss at 0 steps: 0.657723605632782
	Model loss for the epoch: 6.655515670776367
	Training accuracy for epoch: 0.6875
	Validation accuracy for epoch: 0.

100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  6.55it/s]

	 test accuracy: 0.5625



