In [4]:
# download the files
!wget https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt
!wget https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt
!wget https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt

# create the data folders
!mkdir -p data
!mv val.txt data
!mv test.txt data
!mv train.txt data

--2022-06-09 14:59:16--  https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt
Resolving www.dropbox.com (www.dropbox.com)... 2620:100:6025:18::a27d:4512, 162.125.69.18
Connecting to www.dropbox.com (www.dropbox.com)|2620:100:6025:18::a27d:4512|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/ikkqxfdbdec3fuj/test.txt [following]
--2022-06-09 14:59:17--  https://www.dropbox.com/s/raw/ikkqxfdbdec3fuj/test.txt
Reusing existing connection to [www.dropbox.com]:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc57a9c47bcba9a2f4f21b9b87ef.dl.dropboxusercontent.com/cd/0/inline/Bm6E2c0RZrxaKBmvcG_tdT9OzItowgh_JgPsJAPMFs8ooguqEM4khPwNGXZbBJGZ2Kkcz03_Azp4IO2MAxPOe72u4jxN5KVRUmedlxGVrrKgC3Z9E0DoNpWreFGDHq1Cd_KXjw92wdkV01CZ9c3Y1lF5hosISAaug_VLmoZ6DeNLhQ/file# [following]
--2022-06-09 14:59:18--  https://uc57a9c47bcba9a2f4f21b9b87ef.dl.dropboxusercontent.com/cd/0/inline/Bm6E2c0RZrxaKBmvcG_tdT9OzItowgh_JgPsJAPMFs8ooguqEM4khPwNGXZbB

In [30]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!mv glove.6B.100d.txt data

--2022-06-09 17:17:45--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-06-09 17:17:46--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-06-09 17:20:29 (5.05 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [32]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np


UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"

train_path = "data/train.txt"
test_path = "data/test.txt"
val_path = "data/val.txt"

## emotion labels
emotion2int = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}

emotions = list(emotion2int.keys())

def read_data(path):
    return pd.read_csv(path, sep=";", header=None, names=["text", "emotion"],
                       engine="python")

def create_vocab(data):
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}

    for text in tqdm(data, total=len(data)):
        for token in text.split(" "):
            if token not in vocab:
                vocab[token] = len(vocab)
    
    return vocab

def create_embedding_matrix(embedding_dim=100):
    glove = pd.read_csv(f'data/glove.6B.{embedding_dim}d.txt', sep=" ", quoting=3, header=None, index_col=0)
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    embeddings = np.zeros((len(glove) + 2, embedding_dim))
    embeddings[0] = np.zeros(embedding_dim)
    embeddings[1] = np.zeros(embedding_dim)

    for index, (key, val) in tqdm(enumerate(glove.T.items()), total=len(glove)):
        vocab[key] = index + 2
        embeddings[index+2] = val.to_numpy()

    return vocab, embeddings

class EmotionDataset(Dataset):
    def __init__(self, data, vocab):
        super().__init__()
        self.labels = data["emotion"].map(emotion2int)
        self.features = pad_sequence([torch.tensor([vocab.get(token, vocab.get(UNK_TOKEN)) for token in text.split(" ")], dtype=torch.int) for text in data['text']],
                                     batch_first=True, padding_value=vocab.get(PAD_TOKEN))
        

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

    def __len__(self):
        return len(self.features)

In [33]:
EMBEDDING_DIM = 100

train_data = read_data(train_path)
test_data = read_data(test_path)
val_data = read_data(val_path)

# vocab = create_vocab(train_data['text'].to_list() + test_data['text'].to_list() + val_data['text'].to_list())
vocab, embeddings = create_embedding_matrix()

with open(f"data/embedding_matrix_glove_{EMBEDDING_DIM}d.npy", "wb") as f:
    np.save(f, embeddings)

Epoch 1:   1%|          | 3/282 [54:11<83:59:12, 1083.70s/it, loss=1.71, v_num=1]


100%|██████████| 400000/400000 [00:09<00:00, 40162.03it/s]


In [34]:
vocab

{'<pad>': 0,
 '<unk>': 1,
 'the': 2,
 ',': 3,
 '.': 4,
 'of': 5,
 'to': 6,
 'and': 7,
 'in': 8,
 'a': 9,
 '"': 10,
 "'s": 11,
 'for': 12,
 '-': 13,
 'that': 14,
 'on': 15,
 'is': 16,
 'was': 17,
 'said': 18,
 'with': 19,
 'he': 20,
 'as': 21,
 'it': 22,
 'by': 23,
 'at': 24,
 '(': 25,
 ')': 26,
 'from': 27,
 'his': 28,
 "''": 29,
 '``': 30,
 'an': 31,
 'be': 32,
 'has': 33,
 'are': 34,
 'have': 35,
 'but': 36,
 'were': 37,
 'not': 38,
 'this': 39,
 'who': 40,
 'they': 41,
 'had': 42,
 'i': 43,
 'which': 44,
 'will': 45,
 'their': 46,
 ':': 47,
 'or': 48,
 'its': 49,
 'one': 50,
 'after': 51,
 'new': 52,
 'been': 53,
 'also': 54,
 'we': 55,
 'would': 56,
 'two': 57,
 'more': 58,
 "'": 59,
 'first': 60,
 'about': 61,
 'up': 62,
 'when': 63,
 'year': 64,
 'there': 65,
 'all': 66,
 '--': 67,
 'out': 68,
 'she': 69,
 'other': 70,
 'people': 71,
 "n't": 72,
 'her': 73,
 'percent': 74,
 'than': 75,
 'over': 76,
 'into': 77,
 'last': 78,
 'some': 79,
 'government': 80,
 'time': 81,
 '$': 82,
 

In [35]:
train_dataset = EmotionDataset(train_data, vocab)
test_dataset = EmotionDataset(test_data, vocab)
val_dataset = EmotionDataset(val_data, vocab)

In [36]:
train_dataset[2]

(tensor([14665, 14195,     9,  1216,     6,   660,    43,  1000, 20335,  1799,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0], dtype=torch.int32),
 3)

In [37]:
BATCH_SIZE = 128
NUM_EPOCHS = 10

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [40]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from pytorch_lightning.loggers import WandbLogger


class MaxPool(nn.Module):
    def forward(self, X):
        values, _ = torch.max(X, dim=1)
        return values


class AvgPool(nn.Module):
    def forward(self, X):
        return torch.mean(X, dim=1)


class SumPool(nn.Module):
    def forward(self, X):
        return torch.sum(X, dim=1)


pool_map = {
    "max": MaxPool,
    "avg": AvgPool,
    "sum": SumPool
}

class CBoW(torch.nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, pooling="max", load_pretrained_embeddings=True):
        super(CBoW, self).__init__()

        if load_pretrained_embeddings:
            embedding_matrix = np.load(
                f"data/embedding_matrix_glove_{embedding_dim}d.npy", allow_pickle=True
            )
            self.embedding = nn.Embedding(
                num_embeddings=embedding_matrix.shape[0],
                embedding_dim=embedding_matrix.shape[1],
            ).from_pretrained(
                torch.tensor(embedding_matrix, dtype=torch.float32),
                freeze=False
            )
        else:
            self.embedding = torch.nn.Embedding(input_dim, embedding_dim)

        self.pool = pool_map[pooling]()
        self.linear = torch.nn.Linear(embedding_dim, output_dim)

        # use xavier initialization for weights
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        out = self.embedding(x)
        out = self.pool(out)
        out = self.linear(out)
        return out


class EmotionClassifier(pl.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        self.model = CBoW(input_dim=self.hparams.input_dim, output_dim=self.hparams.output_dim, embedding_dim=self.hparams.embedding_dim,
                          pooling=self.hparams.pooling, load_pretrained_embeddings=self.hparams.load_pretrained_embeddings)
        self.criterion = nn.CrossEntropyLoss()
        self.metrics = {
            "accuracy": {
                "train": torchmetrics.Accuracy(),
                "val": torchmetrics.Accuracy(),
                "test": torchmetrics.Accuracy()
            }
        }

    def step(self, batch, step_name="train"):
        X, y = batch
        outputs = self.model(X)
        loss = self.criterion(outputs, y)
        preds = self.forward(X)
        metric = self.metrics["accuracy"][step_name]
        metric.update(preds, y)
        metric_val = metric.compute()
        self.log(f"{step_name}_loss", loss, on_epoch=True)
        self.log(f"{step_name}_accuracy", metric_val, on_epoch=True)
        return loss

    def forward(self, X, *args):
        outputs = self.model(X)
        probs = F.softmax(outputs)
        return probs

    def training_step(self, batch, batch_idx):
        return self.step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch, "val")
        
    def test_step(self, batch, batch_idx):
        return self.step(batch, "test")

    def configure_optimizers(self):
        optimizer = optim.Adam(self.model.parameters(), lr=self.hparams.lr)
        return optimizer

In [41]:
import wandb
wandb_logger = WandbLogger(project="optml-project", name=f"emotion-classifier-glove")

model = EmotionClassifier(lr=1e-4, input_dim=len(vocab), output_dim=len(emotions), embedding_dim=EMBEDDING_DIM, pooling="avg", load_pretrained_embeddings=True)
trainer = pl.Trainer(default_root_dir="models", max_epochs=NUM_EPOCHS, logger=wandb_logger)
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
trainer.test(model, dataloaders=test_dataloader)
wandb.finish()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | CBoW             | 40.0 M
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
40.0 M    Trainable params
0         Non-trainable params
40.0 M    Total params
160.003   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  probs = F.softmax(outputs)
  rank_zero_warn(


Epoch 0:  60%|██████    | 85/141 [00:34<00:22,  2.46it/s, loss=1.77, v_num=8v3f]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 16/16 [00:00<00:00, 76.81it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.2936764657497406
        test_loss           1.7614895105361938
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
epoch,▁▁
test_accuracy,▁
test_loss,▁
train_accuracy_step,▁
train_loss_step,▁
trainer/global_step,▁█

0,1
epoch,0.0
test_accuracy,0.29368
test_loss,1.76149
train_accuracy_step,0.28531
train_loss_step,1.78181
trainer/global_step,85.0
