In [1]:
from learner import Trainer, EvaluateOnTest
from model import SpanEmo
from data_loader import DataClass
from data_selector import DataSelector
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
import json

seed = 12345678

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if str(device) == 'cuda:0':
    print("Currently using GPU: {}".format(device))
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    print("WARNING: USING CPU")

Currently using GPU: cuda:0


In [3]:
def make_loaders(args, data, batch_size, shuffle=True):
    dataset = DataClass(args, data)
    data_loader = DataLoader(dataset,
                             batch_size=int(batch_size),
                             shuffle=shuffle)

    print('The number of batches: ', len(data_loader))

    return data_loader

In [4]:
def make_model(args):
    model = SpanEmo(output_dropout=args['output_dropout'],
                    backbone=args['backbone'])
    
    return model


In [5]:
def pipeline(args, loaders=None):
    now = datetime.datetime.now()
    filename = now.strftime("%Y-%m-%d-%H:%M:%S")
    fw = open('configs/' + filename + '.json', 'a')
    json.dump(args, fw, sort_keys=True, indent=2)

    train_data_loader, val_data_loader = loaders
    model = make_model(args)

    learn = Trainer(model, train_data_loader, val_data_loader, filename=filename)
    learn.fit(
        num_epochs=int(args['max_epoch']),
        args=args,
        device=device
    )

In [35]:
hyperparams = {
    'train_path':'data/train.csv', 
    'val_path':'data/val.csv',
    'backbone':'bert-base-uncased',
    'train_batch_size': 32,
    'val_batch_size': 32,
    'output_dropout': 0.1,
    'max_epoch': 20,
    'max_length': 512,
    'ffn_lr': 0.001,
    'bert_lr': 2e-5
}

In [7]:
assert False

AssertionError: 

## Train

In [8]:
def filter_neutrals(df):
    df["filter"] = df['27'] < 2
    return df

In [9]:
ds = DataSelector("data.csv")
train, val, tests = ds.select_data(ratio={"Cornell": 1}, filter=filter_neutrals)

Filtering...
Calculating counts...
Grabbing data...


45258
Cornell    45258
dtype: int64


In [31]:
m = 0
c = 0
tc = 0
for i in train_loader.dataset:
    tc += 1
    ma = i[0]['attention_mask'].sum()
    if ma > 350:
        # print(i[0]['input_ids'])
        print(ma)
        if ma > m: m = ma
        c += 1

print(m)
print(c)
print(tc)

tensor(288)
tensor(275)
tensor(267)
tensor(307)
tensor(290)
tensor(271)
tensor(271)
tensor(310)
tensor(328)
tensor(271)
tensor(327)
tensor(298)
tensor(258)
tensor(257)
tensor(367)
tensor(262)
tensor(265)
tensor(380)
tensor(324)
tensor(287)
tensor(289)
tensor(348)
tensor(262)
tensor(265)
tensor(327)
tensor(277)
tensor(277)
tensor(260)
tensor(340)
tensor(304)
tensor(266)
tensor(260)
tensor(266)
tensor(261)
tensor(282)
tensor(277)
tensor(265)
tensor(290)
tensor(262)
tensor(268)
tensor(267)
tensor(281)
tensor(274)
tensor(349)
tensor(316)
tensor(285)
tensor(304)
tensor(341)
tensor(260)
tensor(266)
tensor(307)
tensor(294)
tensor(286)
tensor(258)
tensor(260)
tensor(293)
tensor(294)
tensor(318)
tensor(289)
tensor(323)
tensor(435)
tensor(272)
tensor(287)
tensor(299)
tensor(278)
tensor(271)
tensor(312)
tensor(282)
tensor(353)
tensor(282)
tensor(285)
tensor(260)
tensor(330)
tensor(285)
tensor(257)
tensor(328)
tensor(284)
tensor(313)
tensor(334)
tensor(308)
tensor(283)
tensor(266)
tensor(270)
tens

In [36]:
train_loader = make_loaders(hyperparams, train, hyperparams['train_batch_size'])
val_loader = make_loaders(hyperparams, val, hyperparams['val_batch_size'])
loaders = (train_loader, val_loader)

Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...


PreProcessing dataset ...: 100%|██████████| 36206/36206 [01:17<00:00, 464.96it/s]


The number of batches:  566
Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...


PreProcessing dataset ...: 100%|██████████| 4526/4526 [00:10<00:00, 450.78it/s]


The number of batches:  71


In [37]:
#wooooooooooooooooooooo
pipeline(hyperparams, loaders=loaders)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train_Loss,Val_Loss,RMSE-Macro,RMSE-Micro,MSE-Micro,Time


OutOfMemoryError: CUDA out of memory. Tried to allocate 360.00 MiB (GPU 0; 23.68 GiB total capacity; 21.58 GiB already allocated; 344.75 MiB free; 21.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Test

In [None]:
def test(args, model_path, loader=None):

    if loader is None:
        test_data_loader = make_loaders(args, test=True)
    else:
        test_data_loader = loader

    model = make_model(args)
    
    learn = EvaluateOnTest(model, test_data_loader, model_path='models/' + model_path)
    return learn.predict(device=device), model

In [None]:
hyperparams['test_path'] = 'data/test.csv'
hyperparams['test_batch_size'] = 32

In [None]:
test_loader = make_loaders(hyperparams, tests, hyperparams['test_batch_size'], shuffle=False)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter_2018 - 1grams ...


PreProcessing dataset ...: 100%|██████████| 6584/6584 [00:18<00:00, 346.62it/s]

The number of batches:  206





In [None]:
preds, model = test(hyperparams, "2023-11-25-15:31:26_checkpoint.pt", loader=test_loader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RMSE-Macro: 0.1302 RMSE-Micro: 0.1066 MSE-Micro: 0.0170 Time: 00:43


In [None]:
dc = DataClass(hyperparams, [
    [
        "i just lost my job today. it's devastating", 
        "i just found a job today, i'm so happy", 
        "this is miserable. i feel like a failure.", 
        "i can't believe my good fortune!"
    ], [
        "I just got a promotion at work! I'm so excited and proud of myself!",
        "Wow, that's fantastic news! Congratulations! 🎉",
        "Thanks! It's been a long journey, but I finally feel recognized for my efforts.",
        "I can imagine the hard work you've put in. Your dedication has paid off. How do you feel now?",
    ]], pred_mode=True)
dl = DataLoader(dc, batch_size=100, shuffle=False)

Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...


PreProcessing dataset ...: 100%|██████████| 2/2 [00:00<00:00, 344.49it/s]


In [None]:
logits = model.predict(next(iter(dl)), device)[2].cpu().detach().numpy()
probs = 1/(1+np.exp(-logits))[0]


In [None]:
label_names = ["admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love", "nervous", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]

for i in range(28):
    print(f"{probs[i]:.3f}", label_names[i])


0.070 admiration
0.012 amusement
0.038 anger
0.054 annoyance
0.103 approval
0.055 caring
0.069 confusion
0.130 curiosity
0.023 desire
0.065 disappointment
0.109 disapproval
0.009 disgust
0.003 embarrassment
0.027 excitement
0.024 fear
0.014 gratitude
0.008 grief
0.044 joy
0.023 love
0.006 nervous
0.074 optimism
0.003 pride
0.036 realization
0.004 relief
0.029 remorse
0.078 sadness
0.036 surprise
0.470 neutral


In [None]:
label_names = ["admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love", "nervous", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]

y_true = pd.DataFrame(preds['y_true'], columns=label_names)
y_pred = pd.DataFrame(preds['y_pred'], columns=label_names)
logits = pd.DataFrame(preds['logits'], columns=label_names)

correlation = "pearson"

y_true_corr = y_true.corr(correlation)
y_pred_corr = y_pred.corr(correlation)
logits_corr = logits.corr(correlation)

In [None]:
# _ = sns.clustermap(logits_corr, annot=False)

In [None]:
# data = pd.read_csv("data/test.csv")

# wrongs = [(i[0], i[1].sum(), [(j, label_names[j]) for j, x in enumerate(i[1]) if x]) for i in enumerate(preds['y_true'] != preds['y_pred']) if i[1].any()]
# test_data = [(i, data['text'][i], [(j, label_names[j]) for j in range(28) if data[str(j)][i]]) for i in data.index]
# test_data = [test_data[i[0]] for i in wrongs]