In [1]:
!mkdir -p data

# download datasets
!wget https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt -P data
!wget https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt -P data
!wget https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt -P data

# download glove embeddings
!wget https://nlp.stanford.edu/data/glove.6B.zip -P data
!unzip data/glove.6B.zip -d data

--2022-06-09 15:58:54--  https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.69.18, 2620:100:6025:18::a27d:4512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.69.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/ikkqxfdbdec3fuj/test.txt [following]
--2022-06-09 15:58:55--  https://www.dropbox.com/s/raw/ikkqxfdbdec3fuj/test.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4945555e01cb99a5a3353ba4d6.dl.dropboxusercontent.com/cd/0/inline/Bm4QsMuncbxoCRrfC4nITtzYZ9Kmzycp6T1uC7ylUala-Y0rNavvCkV_j1B-KA3QHcpo1-LkOL4oyZYNW67-Z5rSpYLBOhAtlopOi5LDupae1wy7fOPjSdnOtpIpkRDp8fCJiAKbrh6K5a0CS2NQ_uqCNOBXGY2aj5eMp2mh_bDNOQ/file# [following]
--2022-06-09 15:58:55--  https://uc4945555e01cb99a5a3353ba4d6.dl.dropboxusercontent.com/cd/0/inline/Bm4QsMuncbxoCRrfC4nITtzYZ9Kmzycp6T1uC7ylUala-Y0rNavvCkV_j1B-KA3QHcpo1-LkOL4oy

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
import torch

UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"

train_path = "data/train.txt"
test_path = "data/test.txt"
val_path = "data/val.txt"

## emotion labels
emotion2int = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}

emotions = list(emotion2int.keys())

def read_data(path):
    return pd.read_csv(path, sep=";", header=None, names=["text", "emotion"],
                       engine="python")

def create_vocab(data):
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}

    for text in tqdm(data, total=len(data)):
        for token in text.split(" "):
            if token not in vocab:
                vocab[token] = len(vocab)
    
    return vocab

def create_embedding_matrix(embedding_dim=100):
    glove = pd.read_csv(f'data/glove.6B.{embedding_dim}d.txt', sep=" ", quoting=3, header=None, index_col=0)
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    embeddings = np.zeros((len(glove) + 2, embedding_dim))
    embeddings[0] = np.zeros(embedding_dim)
    embeddings[1] = np.zeros(embedding_dim)

    for index, (key, val) in tqdm(enumerate(glove.T.items()), total=len(glove)):
        vocab[key] = index + 2
        embeddings[index+2] = val.to_numpy()

    return vocab, embeddings

class EmotionDataset(Dataset):
    def __init__(self, data, vocab):
        super().__init__()
        self.labels = data["emotion"].map(emotion2int)
        self.features = pad_sequence([torch.tensor([vocab.get(token, vocab.get(UNK_TOKEN)) for token in text.split(" ")], dtype=torch.int) for text in data['text']],
                                     batch_first=True, padding_value=vocab.get(PAD_TOKEN))
        

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

    def __len__(self):
        return len(self.features)

In [2]:
EMBEDDING_DIM = 100

train_data = read_data(train_path)
test_data = read_data(test_path)
val_data = read_data(val_path)

# vocab = create_vocab(train_data['text'].to_list() + test_data['text'].to_list() + val_data['text'].to_list())
vocab, embeddings = create_embedding_matrix()

with open(f"data/embedding_matrix_glove_{EMBEDDING_DIM}d.npy", "wb") as f:
    np.save(f, embeddings)

100%|██████████| 400000/400000 [00:05<00:00, 67480.10it/s]


In [3]:
train_dataset = EmotionDataset(train_data, vocab)
test_dataset = EmotionDataset(test_data, vocab)
val_dataset = EmotionDataset(val_data, vocab)

In [4]:
BATCH_SIZE = 128

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [9]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from pytorch_lightning.loggers import WandbLogger


class MaxPool(nn.Module):
    def forward(self, X):
        values, _ = torch.max(X, dim=1)
        return values


class AvgPool(nn.Module):
    def forward(self, X):
        return torch.mean(X, dim=1)


class SumPool(nn.Module):
    def forward(self, X):
        return torch.sum(X, dim=1)


pool_map = {
    "max": MaxPool,
    "avg": AvgPool,
    "sum": SumPool
}

class CBoW(torch.nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim=128, pooling="max", load_pretrained_embeddings=True, freeze_embeddings=False):
        super(CBoW, self).__init__()

        if load_pretrained_embeddings:
            embedding_matrix = np.load(
                f"data/embedding_matrix_glove_{embedding_dim}d.npy", allow_pickle=True
            )
            input_dim, embedding_dim = embedding_matrix.shape
            self.embedding = nn.Embedding(
                input_dim,
                embedding_dim
            ).from_pretrained(
                torch.tensor(embedding_matrix, dtype=torch.float32),
                freeze=freeze_embeddings
            )
        else:
            self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
            nn.init.xavier_uniform_(self.embedding.weight)

        self.pool = pool_map[pooling]()
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(hidden_dim, output_dim)

        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.xavier_uniform_(self.linear2.weight)

        self.model = nn.Sequential(self.embedding, self.pool, self.linear1, self.activation1, self.linear2,
                                    self.activation2, self.linear3)

    def forward(self, x):
        return self.model(x)


class EmotionClassifier(pl.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        self.model = CBoW(input_dim=self.hparams.input_dim, output_dim=self.hparams.output_dim, embedding_dim=self.hparams.embedding_dim,
                          pooling=self.hparams.pooling, load_pretrained_embeddings=self.hparams.load_pretrained_embeddings, freeze_embeddings=self.hparams.freeze_embeddings)
        self.criterion = nn.CrossEntropyLoss()
        self.metrics = {
            "accuracy": {
                "train": torchmetrics.Accuracy(),
                "val": torchmetrics.Accuracy(),
                "test": torchmetrics.Accuracy()
            }
        }

    def step(self, batch, step_name="train"):
        X, y = batch
        outputs = self.model(X)
        loss = self.criterion(outputs, y)
        preds = self.forward(X)
        metric = self.metrics["accuracy"][step_name]
        metric.update(preds.cpu(), y.cpu())
        metric_val = metric.compute()
        self.log(f"{step_name}_loss", loss, on_epoch=True)
        self.log(f"{step_name}_accuracy", metric_val, on_epoch=True)
        return loss

    def forward(self, X, *args):
        outputs = self.model(X)
        probs = F.softmax(outputs, dim=1)
        return probs

    def training_step(self, batch, batch_idx):
        return self.step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch, "val")
        
    def test_step(self, batch, batch_idx):
        return self.step(batch, "test")

    def configure_optimizers(self):
        optimizer = optim.Adam(self.model.parameters(), lr=self.hparams.lr)
        return optimizer

In [8]:
import wandb

NUM_EPOCHS = 100

wandb_logger = WandbLogger(project="optml-project", name=f"emotion-classifier-glove-frozen-deep2")

model = EmotionClassifier(lr=1e-4, input_dim=len(vocab), output_dim=len(emotions), embedding_dim=EMBEDDING_DIM, pooling="sum", load_pretrained_embeddings=True, freeze_embeddings=True)
trainer = pl.Trainer(default_root_dir="models", max_epochs=NUM_EPOCHS, logger=wandb_logger, accelerator="gpu")
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
trainer.test(model, dataloaders=test_dataloader)
wandb.finish()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | CBoW             | 40.0 M
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
13.7 K    Trainable params
40.0 M    Non-trainable params
40.0 M    Total params
160.056   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  probs = F.softmax(outputs)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy          0.54598468542099
        test_loss           1.1889243125915527
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_accuracy,▁
test_loss,▁
train_accuracy_epoch,▁▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
train_accuracy_step,▁▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
train_loss_epoch,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▄▂▃▃▃▂▁▂▂▃▂▂▂▃▂▂▂▂▂▂▂▂▂▁▁▁▂▂▂▂▁▁▂▂▂▁▃▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▁▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,100.0
test_accuracy,0.54598
test_loss,1.18892
train_accuracy_epoch,0.55954
train_accuracy_step,0.55978
train_loss_epoch,1.06586
train_loss_step,1.10218
trainer/global_step,12500.0
val_accuracy,0.55015
val_loss,1.19539
