In [23]:
# Importing ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)
import re
import ast

In [24]:
# Load the preprocessed train and dev splits from CSV.
train_df = pd.read_csv('processed/final-train.csv')
dev_df = pd.read_csv('processed/final-dev.csv')

In [25]:
# Pick the compute device: 'cuda' when a CUDA GPU is available, otherwise 'cpu'.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [26]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample Jaccard similarity between true and predicted label sets.

    For every row, the set of active (truthy) label positions in ``y_true`` is
    compared with the set of active positions in ``y_pred``; the row score is
    ``|intersection| / |union|``. A row where both sets are empty scores 1.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Indicator matrices. Anything ``np.asarray`` accepts is allowed
        (numpy arrays, nested lists, boolean arrays, ...).
    normalize, sample_weight
        Kept for signature compatibility only; both are ignored.

    Returns
    -------
    float
        Average of the per-row scores.
    """
    # Coerce up front so plain Python lists work as well as numpy arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    row_scores = []
    for row in range(y_true.shape[0]):
        true_labels = set(np.where(y_true[row])[0])
        pred_labels = set(np.where(y_pred[row])[0])
        if not true_labels and not pred_labels:
            # Nothing to predict and nothing predicted counts as a perfect match
            # (and avoids dividing by an empty union).
            row_scores.append(1)
        else:
            overlap = len(true_labels & pred_labels)
            combined = len(true_labels | pred_labels)
            row_scores.append(overlap / float(combined))
    return np.mean(row_scores)

In [27]:
def _to_model_frame(source_df):
    """Build the three-column frame (tweets, labels, average_score) used by the dataset class."""
    frame = pd.DataFrame()
    frame['tweets'] = source_df['processed']
    # Positional columns 1-8 of the source frame are packed into one list per row.
    frame['labels'] = source_df.iloc[:, 1:9].values.tolist()
    frame['average_score'] = source_df['average_score']
    return frame


# NOTE: the name `train` is rebound to the training function in a later cell (`def train(epoch)`),
# so the frame is only reachable under this name until that cell runs.
train = _to_model_frame(train_df)
dev = _to_model_frame(dev_df)

train

Unnamed: 0,tweets,labels,average_score
0,worry payment problem may never joyce meyer mo...,"[0, 1, 0, 0, 1, 0, 0, 1]",[0.00333792 1. 0.0042222 0.76656254 0...
1,whatever decide make sure make happy,"[0, 1, 0, 0, 1, 0, 0, 1]",[0. 0.86536213 0.11912322 0.00963587 1...
2,help drowning thoughts,"[0, 0, 0, 1, 0, 1, 0, 0]",[0. 0.39879283 0.04923599 0.82461023 0...
3,help brother drowning,"[0, 0, 0, 1, 0, 0, 1, 0]",[2.26823486e-04 3.45036286e-03 0.00000000e+00 ...
4,also help majority nfl coaching inept bill bri...,"[1, 1, 1, 0, 1, 0, 0, 0]",[1. 0.00347338 0.96239843 0.00468326 0...
...,...,...,...
11111,feel like people life very supportive others n...,"[0, 0, 0, 0, 0, 0, 0, 1]",[0.02367427 0.02209223 0.02753819 0.2476681 1...
11112,feel loyal sen,"[0, 0, 0, 0, 0, 0, 0, 1]",[0. 0.00566371 0.00191819 0.81291933 0...
11113,feel complicit supporting owning copy,"[0, 0, 0, 0, 0, 0, 0, 1]",[0. 0.00597871 0.00205567 0.19240879 0...
11114,really feel like supporting helping,"[0, 0, 0, 0, 0, 0, 0, 1]",[0.00368038 0.00478825 0.00875506 0.23504549 0...


In [28]:
# Sections of config
# Defining some key variables that will be used later on in the training
MAX_LEN = 64  # sequence length the dataset pads/truncates every tweet to
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 12  # number of passes over the training loader
LEARNING_RATE = 1e-06  # learning rate handed to the Adam optimizer
# NOTE(review): the `truncation=True` keyword below does not appear to enable truncation at encode time;
# the recorded output of the dataset cell still shows the "Truncation was not explicitly activated" warning. Confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [29]:
class MultiLabelDataset(Dataset):
    """Torch dataset that turns rows of a tweets frame into model-ready tensors.

    Each item is a dict holding the token ids, attention mask and token-type
    ids of the tweet (padded/truncated to ``max_len``), the ``labels`` vector
    and the per-row ``average_score`` feature vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must expose ``tweets``, ``labels`` and ``average_score`` columns.
        ``labels`` holds one list of numbers per row; ``average_score`` holds a
        bracketed, whitespace-separated string of floats (e.g. ``"[0. 0.5 1.]"``).
    tokenizer
        Object providing a Hugging Face style ``encode_plus`` method.
    max_len : int
        Fixed sequence length every tweet is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.tweets
        self.labels = dataframe.labels
        self.average_score = dataframe.average_score
        self.max_len = max_len

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the sample at position ``index`` (0-based)."""
        # Positional access: samplers hand out 0..len-1, which only coincides
        # with label-based lookup when the frame has a default RangeIndex.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())  # collapse runs of whitespace
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replace the deprecated
            # `pad_to_max_length=True` and the implicit truncation fallback.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        labels = torch.tensor(self.labels.iloc[index], dtype=torch.float)

        # The scores arrive as the text form of an array ("[0.1 0.2 ...]");
        # split on any whitespace so wrapped (multi-line) strings parse too.
        score_tokens = self.average_score.iloc[index].strip('[]').split()
        average_score = torch.tensor([float(token) for token in score_tokens], dtype=torch.float)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': labels,
            'average_score': average_score
        }


In [30]:

# Report the size of each split; each label names the frame that is actually printed.
print("TRAIN Dataset: {}".format(train.shape))
print("DEV Dataset: {}".format(dev.shape))

# Wrap both splits so they yield tokenized tensors of length MAX_LEN.
training_set = MultiLabelDataset(train, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(dev, tokenizer, MAX_LEN)
# Display one encoded sample as a sanity check.
training_set[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


FULL Dataset: (11116, 3)
TRAIN Dataset: (886, 3)


{'ids': tensor([  101,  4737,  7909,  3291,  2089,  2196, 11830, 11527, 14354,  4105,
          4737,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': t

In [31]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order so the returned predictions line up
# row-for-row with the dev frame and repeated runs visit samples identically.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [32]:
# Custom model: a dense layer, dropout and a final linear classifier on top of DistilBERT; the classifier also receives the 8 average_score features.

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder with a classification head that also sees ``average_score``.

    The first-token hidden state passes through a 768->768 linear layer, tanh
    and dropout, is concatenated with the 8 ``average_score`` features and is
    mapped to 8 output logits.
    """

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        # 768 encoder features + 8 average_score features in, 8 logits out.
        self.classifier = torch.nn.Linear(768 + 8, 8)

    def forward(self, input_ids, attention_mask, token_type_ids, average_score):
        """Return the raw logits for a batch.

        ``token_type_ids`` is accepted but not forwarded to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_out[0][:, 0]
        features = self.dropout(torch.tanh(self.pre_classifier(first_token)))
        # Append the average_score features along the feature axis before classifying.
        return self.classifier(torch.cat((features, average_score), dim=1))

# Instantiate the model and move it to the selected device.
model = DistilBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [33]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits, averaged over all elements.

    ``outputs`` are un-squashed logits and ``targets`` are float tensors of the
    same shape; the sigmoid is applied inside the loss.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [34]:
# Adam over every parameter of the model (encoder and head alike) at the configured learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [35]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``. ``epoch`` is only used to label the progress message, which is
    printed for every batch whose index is a multiple of 5000.
    """
    model.train()
    for step, batch in tqdm(enumerate(training_loader, 0)):
        # Move every tensor of the batch to the target device with the expected dtype.
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        scores = batch['average_score'].to(device, dtype=torch.float)
        labels = batch['labels'].to(device, dtype=torch.float)

        logits = model(input_ids, attention_mask, segment_ids, scores)

        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        batch_loss = loss_fn(logits, labels)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {batch_loss.item()}')

        batch_loss.backward()
        optimizer.step()


In [36]:
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

def validation(testing_loader):
    """Evaluate the notebook-level ``model`` on ``testing_loader`` and print metrics.

    Predictions are the sigmoid of the model logits thresholded at 0.5.
    Weighted-average precision, recall and F1 come from
    ``classification_report``; the printed "Accuracy" is the fraction of
    individual (sample, label) cells where prediction and target agree.

    Returns
    -------
    tuple
        ``(final_outputs, fin_targets)``: the 0/1 prediction array and the
        target array, both with one row per evaluated sample.
    """
    model.eval()
    target_rows = []
    prob_rows = []
    with torch.no_grad():
        for _step, batch in tqdm(enumerate(testing_loader, 0)):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            scores = batch['average_score'].to(device, dtype=torch.float)
            labels = batch['labels'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids, scores)

            # Accumulate plain Python rows; they are stacked into arrays after the loop.
            target_rows.extend(labels.cpu().detach().numpy().tolist())
            prob_rows.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())

    y_true = np.array(target_rows)
    y_prob = np.array(prob_rows)

    # Threshold the probabilities into hard 0/1 decisions.
    y_pred = np.where(y_prob >= 0.5, 1, 0)

    weighted = classification_report(y_true, y_pred, output_dict=True)['weighted avg']
    precision = weighted['precision']
    recall = weighted['recall']
    f1_score = weighted['f1-score']

    # Element-wise agreement over every (sample, label) cell.
    accuracy = np.mean(y_true == y_pred)

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")
    print(f"Accuracy: {accuracy}")

    return y_pred, y_true



In [37]:
# Fine-tune for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch) 

4it [00:00, 31.22it/s]

Epoch: 0, Loss:  0.6974718570709229


1390it [00:49, 28.24it/s]
4it [00:00, 29.14it/s]

Epoch: 1, Loss:  0.4691627621650696


1390it [00:53, 26.15it/s]
4it [00:00, 29.12it/s]

Epoch: 2, Loss:  0.23330560326576233


1390it [00:51, 26.75it/s]
6it [00:00, 28.07it/s]

Epoch: 3, Loss:  0.20665127038955688


1390it [00:51, 27.02it/s]
4it [00:00, 30.23it/s]

Epoch: 4, Loss:  0.42200881242752075


1390it [00:51, 26.85it/s]
3it [00:00, 29.29it/s]

Epoch: 5, Loss:  0.17278921604156494


1390it [00:51, 26.85it/s]
3it [00:00, 29.70it/s]

Epoch: 6, Loss:  0.37243330478668213


1390it [00:51, 27.09it/s]
6it [00:00, 28.28it/s]

Epoch: 7, Loss:  0.220111683011055


1390it [00:51, 27.08it/s]
4it [00:00, 29.43it/s]

Epoch: 8, Loss:  0.28109997510910034


1390it [00:50, 27.36it/s]
3it [00:00, 29.95it/s]

Epoch: 9, Loss:  0.12249624729156494


1390it [00:50, 27.47it/s]
4it [00:00, 29.72it/s]

Epoch: 10, Loss:  0.1811279058456421


1390it [00:50, 27.44it/s]
4it [00:00, 29.63it/s]

Epoch: 11, Loss:  0.20254802703857422


1390it [00:50, 27.42it/s]


In [41]:
# Evaluate on the dev loader; prints the metrics and returns predictions and targets.
outputs, targets = validation(testing_loader)

# `validation` already returns 0/1 predictions, so this comparison only converts them to booleans.
final_outputs = np.array(outputs) >= 0.5

111it [00:00, 112.52it/s]

Precision: 0.7632003811877183
Recall: 0.6502325581395348
F1 Score: 0.7000984205719701
Accuracy: 0.8325338600451467





In [39]:
# Hamming loss comes from scikit-learn; the Hamming score is the notebook's own
# `hamming_score` (mean per-sample intersection-over-union of the label sets).
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.5822234762979683
Hamming Loss = 0.16746613995485327


In [40]:
# Saving the files for inference

output_model_file = 'models/distilbert_emotions.bin'
output_vocab_file = 'models/vocab_distilbert_emotions.bin'

# torch.save(model.state_dict(), output_model_file)
# NOTE(review): this saves the whole module object rather than its state_dict (the alternative kept above);
# presumably loading it back needs the DistilBERTClass definition to be importable — confirm before sharing the file.
torch.save(model, output_model_file)
# NOTE(review): assumes the 'models/' directory already exists — TODO confirm.
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

Saved
