In [1]:
import csv
import torch
from datasets import Dataset
import transformers
from transformers import (
  AdamW,
  BertConfig,
  BertModel,
  BertTokenizer)
from torch.utils.data import DataLoader
import torch.nn as nn
import os

In [2]:
finegrained_sentiments_dict = {
"anger": ["anger", "annoyance", "disapproval"],
"disgust": ["disgust"],
"fear": ["fear", "nervousness"],
"joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
"surprise": ["surprise", "realization", "confusion", "curiosity"]
}

In [3]:
!ls ../

 3-cs626-pos-tagging-week-of-8aug22.pdf			   models
'assignment-discussion-format-slide-POS-4sep22 (1).pptx'   modelStats.ods
 FineGrained-Emotion-Classification			   POS


In [4]:
DATA_DIR = "./data/full_dataset/"
train = {"input": [], "labels": []}
dev = {"input": [], "labels": []}
test = {"input": [], "labels": []}

with open(DATA_DIR + "train.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t") 
    for line in tsv_file:
        train["input"].append(line[0])
        labels = line[1].split(",")
        one_hot = [0 for i in range(28)]
        for label in labels:
            one_hot[int(label)] = 1
        train["labels"].append(one_hot)

with open(DATA_DIR + "dev.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t") 
    for line in tsv_file:
        dev["input"].append(line[0])
        labels = line[1].split(",")
        one_hot = [0 for i in range(28)]
        for label in labels:
            one_hot[int(label)] = 1
        dev["labels"].append(one_hot)

with open(DATA_DIR + "test.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t") 
    for line in tsv_file:
        test["input"].append(line[0])
        labels = line[1].split(",")
        one_hot = [0 for i in range(28)]
        for label in labels:
            one_hot[int(label)] = 1
        test["labels"].append(one_hot)
        
print("Number of train examples are {}".format(len(train["input"])))
print("Number of dev examples are {}".format(len(dev["input"])))
print("Number of test examples are {}".format(len(test["input"])))

Number of train examples are 43410
Number of dev examples are 5426
Number of test examples are 5427


In [5]:
# Creating higgingface datasets
train_dataset = Dataset.from_dict(train)
dev_dataset = Dataset.from_dict(dev)
test_dataset = Dataset.from_dict(test)

print(train_dataset)

Dataset({
    features: ['input', 'labels'],
    num_rows: 43410
})


In [6]:
from torch.utils.data import Dataset
class LoadData(Dataset):
    """
    Using this since dataloader expects map-style dataset objects
    
    """
    
    def __init__(
        self, dataset, tokenizer, source_length):
        """
        Initializes a Dataset class

        Args:
            dataset (Dataset object): Input Dataset
            tokenizer (Tokenizer object): Transformer tokenizer
            source_length (int): Max length of source text
        """
        
        self.tokenizer = tokenizer
        self.data = dataset
        self.source_length = source_length
        self.source_text = self.data["input"]
        self.target_labels = self.data["labels"]

    def __len__(self):
        return len(self.target_labels)

    def __getitem__(self, index):
        """
        return input ids, attention masks and target ids
        
        """
        source_text = str(self.source_text[index])
        target_label = self.target_labels[index]

        # cleaning data so as to ensure data is in string type
        source_text = " ".join(source_text.split())

        source = self.tokenizer.__call__(
            [source_text],
            max_length=self.source_length,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        
        target = torch.tensor(target_label)

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target": target.squeeze().to(dtype=torch.long)
        }

In [7]:
def evaluate(model, classifier, eval_dataloader, tokenizer, device, criterion):
    predictions = []
    ground_truths = []
    losses = []
    
    with torch.no_grad():
        steps = 0
        for eval_batch in eval_dataloader:
            y = eval_batch['target'].to(device, dtype = torch.float32)
            ids = eval_batch['source_ids'].to(device, dtype = torch.long)
            mask = eval_batch['source_mask'].to(device, dtype = torch.long)

            output = model(
                input_ids=ids,
                attention_mask=mask, 
            )
            
            output = classifier(output.pooler_output)
            output = torch.sigmoid(output)
            loss = criterion(output, y)
            
            losses.append(loss.item())
            steps += 1
            if steps == 150: break  # evaluating only 1500 examples

    avg_loss = sum(losses)/len(losses)
    print("Validation data loss is", avg_loss)
    
    return avg_loss

def train_(model, classifier, train_loader, valid_loader, device, tokenizer, optimizer, criterion, scheduler):
    steps = 0
    last_loss = 1000
    
    checkpoint_path = parameters["out_dir"] + "best_checkpoint/"
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    
    for epoch in range(parameters["epochs"]):
        print("Epoch: ", epoch)    
        for batch in train_loader:
            model.train()
            classifier.train()
            
            y = batch["target"].to(device, dtype=torch.float32)
            ids = batch["source_ids"].to(device, dtype=torch.long)
            mask = batch["source_mask"].to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=mask,
            )
            output = classifier(outputs.pooler_output)
            output = torch.sigmoid(output)
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if steps % 200 == 0:
                print("Train loss on {}th step is {}".format(steps, loss.item()))
            
            if steps % 800 == 0:
                model.eval()
                print("Train loss on {}th step is {}".format(steps, loss.item()))
                loss = evaluate(model, classifier, valid_loader, tokenizer, device, criterion)
                if loss < last_loss: # save model parameters
                    print("saving model weights")
                    model.save_pretrained(checkpoint_path)
                    tokenizer.save_pretrained(checkpoint_path)
                    torch.save(optimizer.state_dict(), os.path.join(checkpoint_path, "optimizer.pt"))
                    torch.save(classifier.state_dict(), os.path.join(checkpoint_path, "classifier.pt"))
                    last_loss = loss
            steps += 1
        scheduler.step()
    
    loss = evaluate(model, valid_loader, tokenizer)
    if loss < last_loss: # save model parameters
        print("saving model weights")
        model.save_pretrained(checkpoint_path)
        torch.save(classifier.state_dict(), os.path.join(checkpoint_path, "classifier.pt"))
        tokenizer.save_pretrained(checkpoint_path)
        torch.save(optimizer.state_dict(), os.path.join(checkpoint_path, "optimizer.pt"))
        last_loss = loss
    
    # save the last model weights
    model.save_pretrained(parameters["out_dir"])
    torch.save(classifier.state_dict(), os.path.join(parameters["out_dir"], "classifier.pt"))
    tokenizer.save_pretrained(parameters["out_dir"])
    torch.save(optimizer.state_dict(), os.path.join(parameters["out_dir"], "optimizer.pt"))

In [8]:
def train_model(parameters, train_dataset, valid_dataset):
    cuda =  torch.cuda.is_available()
    device = torch.device("cuda") if cuda else torch.device("cpu")
    
    tokenizer = BertTokenizer.from_pretrained(parameters["model"])    
    model = BertModel.from_pretrained(parameters["model"])
    classifier = nn.Linear(parameters["hidden_size"], parameters["num_classes"])
    classifier = classifier.to(device)
    model = model.to(device)

    params = list(model.parameters()) + list(classifier.parameters())
    optimizer = AdamW(params, lr=parameters["lr"], weight_decay=parameters["wd"])
    criterion = nn.BCELoss()
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.7)

    train_obj = LoadData(
        train_dataset,
        tokenizer,
        parameters["max_source_length"]
    )

    val_obj = LoadData(
        valid_dataset,
        tokenizer,
        parameters["max_source_length"]
    )
    
    train_loader = DataLoader(train_obj, shuffle=True, batch_size=parameters["train_bs"])
    valid_loader = DataLoader(val_obj, shuffle=False, batch_size=parameters["val_bs"])
    
    num_training_steps = parameters["epochs"] * len(train_loader)
    print("Training steps are", num_training_steps)
    
    train_(model, classifier, train_loader, valid_loader, device, tokenizer, optimizer, criterion, scheduler)
    
    return model, classifier

In [9]:
parameters = {"model": "bert-base-cased",  # model_type: t5-base/t5-large
    "train_bs": 5,  # training batch size
    "val_bs": 5,  # validation batch size
    "test_bs": 15,
    "epochs": 3,  # number of training epochs
    "lr": 6e-4,  # learning rate
    "wd": 0.0001,
    "max_source_length": 512,  # max length of source text
    "SEED": 42,
    "out_dir": "./",
    "hidden_size": 768,
    "num_classes": 28}

index_label = {0:"admiration", 1:"amusement", 2:"anger", 3:"annoyance", 4:"approval", 5:"caring", 6:"confusion",
            7:"curiosity", 8:"desire", 9:"disappointment", 10:"disapproval", 11:"disgust", 12:"embarrassment",
            13:"excitement", 14:"fear", 15:"gratitude", 16:"grief", 17:"joy", 18:"love", 19:"nervousness",
            20:"optimism", 21:"pride", 22:"realization", 23:"relief", 24:"remorse", 25:"sadness",
            26:"surprise", 27:"neutral"}
label_list = list(index_label.values())

In [10]:
model, classifier = train_model(parameters, train_dataset, dev_dataset)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training steps are 26046
Epoch:  0
Train loss on 0th step is 0.7976132035255432
Train loss on 0th step is 0.7976132035255432
Validation data loss is 0.6173229622840881
saving model weights


RuntimeError: [enforce fail at inline_container.cc:319] . unexpected pos 489305856 vs 489305744

In [25]:
# compute metrics on test data
def compute_metrics_allemotions(outputs, labels, label_list, index_label):
    predictions = []
    
    for output in outputs:
        output = [int(out > 0.6) for out in output]
        predictions.append(output)
    print("1st prediction", predictions[0])
    
    confusion_matrix = {}
    precisions, recalls, fscores = {}, {}, {}
    for label in label_list:
        confusion_matrix[label] = {"TP":0, "FP": 0, "FN": 0}
        precisions[label], recalls[label], fscores[label] = 0, 0, 0
    
    for i, prediction in enumerate(predictions):
        gt = labels[i]
        for j, out in enumerate(gt):
            pred = prediction[j]
            if out == 0 and pred == 0: continue
            elif out == 0 and pred == 1:
                # FP found
                confusion_matrix[index_label[j]]["FP"] += 1
            elif out == 1 and pred == 0:
                # FN found
                confusion_matrix[index_label[j]]["FN"] += 1
            elif out == 1 and pred == 1:
                # TP found
                confusion_matrix[index_label[j]]["TP"] += 1
    
    
    for label in label_list:
        precisions[label] = confusion_matrix[label]["TP"]/(confusion_matrix[label]["TP"] + confusion_matrix[label]["FP"] + 1e-4)
        recalls[label] = confusion_matrix[label]["TP"]/(confusion_matrix[label]["TP"] + confusion_matrix[label]["FN"] + 1e-4)
        fscores[label] = 2*precisions[label]*recalls[label]/(precisions[label]+recalls[label] + 1e-4)
    
    return precisions, recalls, fscores

    

def compute_test_outputs(model, classifier, test_dataloader, tokenizer, device, label_list, index_label):
    predictions = []
    labels = []
    
    with torch.no_grad():
        steps = 0
        for test_batch in test_dataloader:
            y = test_batch['target'].to(device, dtype = torch.float32)
            ids = test_batch['source_ids'].to(device, dtype = torch.long)
            mask = test_batch['source_mask'].to(device, dtype = torch.long)

            output = model(
                input_ids=ids,
                attention_mask=mask,
            )
            
            output = classifier(output.pooler_output)
            output = torch.sigmoid(output)
            
            predictions.extend(output.detach().cpu().numpy())
            labels.extend(y.detach().cpu().numpy())
            if steps == 5: break
    
    return predictions, labels
    


cuda =  torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained(parameters["model"])
model = BertModel.from_pretrained(parameters["model"])
classifier = nn.Linear(parameters["hidden_size"], parameters["num_classes"])
classifier = classifier.to(device)
model = model.to(device)
test_obj = LoadData(
        test_dataset,
        tokenizer,
        parameters["max_source_length"]
    )
test_loader = DataLoader(test_obj, shuffle=True, batch_size=parameters["test_bs"])
predictions, labels = compute_test_outputs(model, classifier, test_loader, tokenizer, device, label_list, index_label)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
precisions, recalls, fscores = compute_metrics_allemotions(predictions, labels, label_list, index_label)
print("Precision, Recall and Fscores for all labels are ")

precision, recall, fscore = 0, 0, 0
for label in label_list:
    precision += precisions[label]
    recall += recalls[label]
    fscore += fscores[label]
    print("Emotion {}: precision: {}, recall: {}, fscore: {}".format(label, precisions[label], 
                                                                     recalls[label], fscores[label]))

print("Macro precision: {}, Macro recall: {}, Macro fscore: {}".format(precision/28, recall/28, fscore/28))

1st prediction [0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Precision, Recall and Fscores for all labels are 
Emotion admiration: precision: 0.1666638889351844, recall: 0.0019841265904510736, fscore: 0.00391924317064396
Emotion amusement: precision: 0.0, recall: 0.0, fscore: 0.0
Emotion anger: precision: 0.03650586633440368, recall: 0.9898984899502575, fscore: 0.07040808300034321
Emotion annoyance: precision: 0.0, recall: 0.0, fscore: 0.0
Emotion approval: precision: 0.0, recall: 0.0, fscore: 0.0
Emotion caring: precision: 0.0248985609572379, recall: 0.9999992592598079, fscore: 0.04858262529495087
Emotion confusion: precision: 0.02819237095647004, recall: 0.9999993464056559, fscore: 0.054833374720585325
Emotion curiosity: precision: 0.0, recall: 0.0, fscore: 0.0
Emotion desire: precision: 0.01530235959988275, recall: 0.9999987951821745, fscore: 0.030140484066342395
Emotion disappointment: precision: 0.0, recall: 0.0, fscore: 0.0
Emotion disapprov