# British Museum Tweet Sentiment Analysis (BERT)

In [6]:
import os 
import pandas as pd 
import numpy as np

from tqdm.notebook import tqdm

import torch 
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.metrics import f1_score 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings("ignore") 

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [7]:
os.chdir(r"C:\Users\Al-Amin\Desktop\Data_Analyst_Projects\Python\Sentitment Analysis (Bert)\Dataset")

In [8]:
df = pd.read_csv("smile-annotations-final.csv", 
                 names = ['id', 'text', 'category'])
df.set_index('id', inplace = True)

In [9]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [10]:
df.text.iloc[1]

'Dorian Gray with Rainbow Scarf #LoveWins (from @britishmuseum http://t.co/Q4XSwL0esu) http://t.co/h0evbTBWRq'

In [11]:
df.category.value_counts()

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

In [12]:
# Removing categories with multiple emotions
df = df[~df.category.str.contains('\|')]

# Removing nocode from categories
df = df[df.category != 'nocode']

In [13]:
df.category.value_counts()

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [14]:

possible_labels = df.category.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [15]:
df['label'] = df.category.replace(label_dict)
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


In [16]:
df.groupby('category')['label'].value_counts()

category      label
angry         2          57
disgust       3           6
happy         0        1137
not-relevant  1         214
sad           4          32
surprise      5          35
Name: count, dtype: int64

There is clearly an imbalance in the data in terms of the emotions associated with the text. We would need to take this into consideration when forming our train/test split. Essentially, we will stratify our samples.

### Training/Validation Split

In [17]:
X = df.index
y = df.label.values

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify =y, test_size = 0.15, random_state = 17)

In [18]:
# Checking how the data will be split

df['data_type'] = 'not_set'

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


#### Encoding text into numerical features 

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True, clean_up_tokenization_spaces = True)

encoded_train = tokenizer.batch_encode_plus(df[df.data_type == 'train'].text.values, 
                                            add_special_tokens = True, 
                                            return_attention_mask = True, 
                                            padding = 'max_length',
                                            truncation = True,
                                            max_length  = 256, 
                                            return_tensors = 'pt')

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']
label_train = torch.tensor(df[df.data_type == 'train'].label.values)

train_data = TensorDataset(input_ids_train, attention_masks_train, label_train)

encoded_val = tokenizer.batch_encode_plus(df[df.data_type == 'val'].text.values, 
                                            add_special_tokens = True, 
                                            return_attention_mask = True, 
                                            padding = 'max_length',
                                            truncation = True,
                                            max_length  = 256, 
                                            return_tensors = 'pt')

input_ids_val = encoded_val['input_ids']
attention_masks_val = encoded_val['attention_mask']
label_val = torch.tensor(df[df.data_type == 'val'].label.values)

val_data = TensorDataset(input_ids_val, attention_masks_val, label_val)

In [20]:
len(train_data),len(val_data)

(1258, 223)

### Setting up BERT Pretrained Model

In [21]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 
                                      num_labels = len(label_dict), 
                                      output_attentions = False, 
                                      output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Creating Data Loaders

In [22]:
batch_size = 4
dataloader_train = DataLoader(train_data, sampler = RandomSampler(train_data), batch_size = batch_size)

dataloader_val = DataLoader(val_data, sampler = RandomSampler(val_data), batch_size = 32)

### Setting up Optimizer and Scheduler 

In [23]:
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

In [24]:
epochs = 1
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = len(dataloader_train)*epochs)

### Defining our Performance Metrics

In [25]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis =1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [26]:
def accuracy_per_class(preds, labels):  
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis =1).flatten()
    labels_flat = labels.flatten()   

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')

### Creating our Training Loop

In [27]:
import random 

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x2890599d8d0>

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [29]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [30]:
import os
model_dir = 'torch_models'
os.makedirs(model_dir, exist_ok=True)

In [31]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc = 'Epoch {:1d}'.format(epoch), leave = False, disable = False)
    
    for batch in progress_bar:
        
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids' : batch[0], 
                  'attention_mask' : batch[1], 
                  'labels' : batch[2]}
        
        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss' : '{:.3f}'.format(loss.item()/len(batch))})

    model_save_path = os.path.join(model_dir, f'Bert_ft_epoch{epoch}.model')
    torch.save(model.state_dict(), model_save_path)
    tqdm.write('\nEpoch {epoch}')
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write('Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation Loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted) : {val_f1}')

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}


  0%|          | 0/7 [00:00<?, ?it/s]

Validation Loss: 0.7401401443140847
F1 Score (Weighted) : 0.7043271626231267


### Loading and evaluating our Model

In [32]:
model_path = r"torch_models\Bert_ft_epoch1.model"
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [33]:
_, preicitions, true_vals = evaluate(dataloader_val)

  0%|          | 0/7 [00:00<?, ?it/s]

In [34]:
accuracy_per_class(predictions, true_vals)

Class: happy
Accuracy: 168/171

Class: not-relevant
Accuracy: 1/32

Class: angry
Accuracy: 0/9

Class: disgust
Accuracy: 0/1

Class: sad
Accuracy: 0/5

Class: surprise
Accuracy: 0/5



Ideally we would want to increase the batch size and the number of epochs