# **Sentiment Analysis with Deep Learning using BERT**


## **What is BERT?**

BERT is a large-scale transformer-based Language Model that can be finetuned for a variety of tasks.

For more information, the original paper can be found here (https://arxiv.org/abs/1810.04805).

HuggingFace documentation (https://huggingface.co/transformers/model_doc/bert.html)

In [1]:
import numpy as np 
import pandas as pd

In [2]:
df=pd.read_csv('/content/file_new.csv')

In [16]:
df.drop('Unnamed: 0',axis=1,inplace=True)

## 1: Exploratory Data Analysis and Preprocessing

In [5]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import torch
from tqdm.notebook import tqdm
import numpy as np 
import pandas as pd

In [17]:
df['Liked']=df['Liked'].astype(int)

In [9]:
set(df.Liked)

{0, 1}

In [11]:
df.Liked.value_counts()

1    867
0    533
Name: Liked, dtype: int64

## 2: Training/Validation Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.Liked.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.Liked.values)

In [20]:
df.head()

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [21]:
df['data_type'] = ['not_set']*df.shape[0]

In [22]:
df.head()

Unnamed: 0,Review,Liked,data_type
0,wow loved this place,1,not_set
1,crust is not good,0,not_set
2,not tasty and the texture was just nasty,0,not_set
3,stopped by during the late may bank holiday of...,1,not_set
4,the selection on the menu was great and so wer...,1,not_set


In [23]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [24]:
df

Unnamed: 0,Review,Liked,data_type
0,wow loved this place,1,train
1,crust is not good,0,train
2,not tasty and the texture was just nasty,0,val
3,stopped by during the late may bank holiday of...,1,train
4,the selection on the menu was great and so wer...,1,train
...,...,...,...
1395,A wonderful dinner. The maÃ®tre de was excepti...,1,train
1396,"Food was great, staff were attentive. It was v...",1,train
1397,"Great food and wine, great service. Highly rec...",1,train
1398,"Beautiful food , wines and a stunning view! Lo...",1,train


In [25]:
df.groupby(['Liked', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Review
Liked,data_type,Unnamed: 2_level_1
0,train,453
0,val,80
1,train,737
1,val,130


# 3. Loading Tokenizer and Encoding our Data

In [26]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [28]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [29]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [32]:
encoded_data_train = tokenizer.batch_encode_plus(
    
    df[df.data_type=='train'].Review.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Review.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].Liked.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].Liked.values)



In [33]:
input_ids_train

tensor([[  101, 10166,  3866,  ...,     0,     0,     0],
        [  101, 19116,  2003,  ...,     0,     0,     0],
        [  101,  3030,  2011,  ...,     0,     0,     0],
        ...,
        [  101,  2307,  2833,  ...,     0,     0,     0],
        [  101,  3376,  2833,  ...,     0,     0,     0],
        [  101,  6429,  2833,  ...,     0,     0,     0]])

In [34]:
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val, 
                            attention_masks_val,
                           labels_val)

In [35]:
len(dataset_train)

1190

In [36]:
dataset_val.tensors

(tensor([[  101,  2025, 11937,  ...,     0,     0,     0],
         [  101,  1996,  2833,  ...,     0,     0,     0],
         [  101,  2027,  2196,  ...,     0,     0,     0],
         ...,
         [  101,  3376,  3295,  ...,     0,     0,     0],
         [  101,  2307,  1010,  ...,     0,     0,     0],
         [  101,  2023,  2001,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
         0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
         1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,

# 4. Setting up BERT Pretrained Model

In [37]:
from transformers import BertForSequenceClassification

In [39]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = 2,#6
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# 5. Creating Data Loaders

In [40]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [41]:
dataset_train

<torch.utils.data.dataset.TensorDataset at 0x7f87878edf10>

In [42]:
batch_size = 4

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

# 6. Setting Up Optimizer and Scheduler

In [43]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [44]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [61]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

# 7. Defining our Performance Metrics

In [62]:
import numpy as np
from sklearn.metrics import f1_score

In [63]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [64]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

# 8. Creating our Training Loop

In [65]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [66]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [67]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [68]:
for epoch in tqdm(range(1, epochs+1)):
    model.train() #forward propagation
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward() #backwardprop
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
    
    torch.save(model, f'BERT_ft_Epoch{epoch}.model')
    
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/298 [00:00<?, ?it/s]


Epoch 1
Training loss: 6.914049946689734e-05


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.3694971959848772
F1 Score (weighted): 0.9523809523809523


Epoch 2:   0%|          | 0/298 [00:00<?, ?it/s]


Epoch 2
Training loss: 4.9712050034660274e-05


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.4513798121895109
F1 Score (weighted): 0.947680599854513


Epoch 3:   0%|          | 0/298 [00:00<?, ?it/s]


Epoch 3
Training loss: 4.444206345415418e-05


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.42509358482701437
F1 Score (weighted): 0.947680599854513
