In [1]:
import torch
import torch.nn as nn
import math
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchtext.data import Field, BucketIterator
from sklearn.model_selection import train_test_split, KFold
from torchtext.vocab import build_vocab_from_iterator
import re
from transformers import BertTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import F1Score
import optuna
import warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
from utils import *

In [30]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

# Import data

In [87]:
# Single seed shared by the random number generators this notebook relies on.
SEED = 2020

# Apply the seed to Python, PyTorch and NumPy. No other visible cell consumes SEED,
# so without these calls the shuffled DataLoader batches differ from run to run.
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [88]:
# Read the two training splits and the held-out split, in that order.
# All three CSV files are decoded as ISO-8859-1.
train_setA, train_setB, test_set = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        "./Datasets/sentiment140_processed_21_30000.csv",
        "./Datasets/sentiment140_processed_22_30000.csv",
        "./Datasets/sentiment140_processed_26_60000.csv",
    )
)

In [89]:
%%time
# Preprocess split A with `preproc` and rebind the name to the result. The original
# note lists tokenization, lowercasing and dropping long sentences; `preproc` itself
# is not defined in this notebook (presumably it arrives via `from utils import *`),
# so confirm the exact steps there.
# NOTE(review): because the input name is overwritten, re-running this cell feeds
# already-processed data back into `preproc`.
train_setA = preproc(train_setA)

Tokenizing the data...
Length of the data :  30000
CPU times: total: 28.7 s
Wall time: 32.8 s


In [90]:
%%time
# Preprocess split B with `preproc` and rebind the name to the result. The original
# note lists tokenization, lowercasing and dropping long sentences; `preproc` itself
# is not defined in this notebook (presumably it arrives via `from utils import *`),
# so confirm the exact steps there.
# NOTE(review): because the input name is overwritten, re-running this cell feeds
# already-processed data back into `preproc`.
train_setB = preproc(train_setB)

Tokenizing the data...
Length of the data :  29999
CPU times: total: 26.6 s
Wall time: 28.3 s


In [91]:
%%time
# Preprocess the held-out split with `preproc` and rebind the name to the result. The
# original note lists tokenization, lowercasing and dropping long sentences; `preproc`
# itself is not defined in this notebook (presumably it arrives via
# `from utils import *`), so confirm the exact steps there.
# NOTE(review): because the input name is overwritten, re-running this cell feeds
# already-processed data back into `preproc`.
test_set = preproc(test_set)

Tokenizing the data...
Length of the data :  59999
CPU times: total: 47.5 s
Wall time: 49.3 s


In [92]:
# Token -> id table for split A. Ids 0-2 are reserved for the special markers;
# every other token receives the next free id the first time it is seen.
vocabA = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
for row_tokens in train_setA.iloc[:, 0]:
    for tok in row_tokens:
        # setdefault only inserts when the token is new, so existing ids never change.
        vocabA.setdefault(tok, len(vocabA))
voc_sizeA = len(vocabA)
pad_idxA = vocabA['__PAD__']
print("Vocabulary Size A: ", voc_sizeA)

Vocabulary Size A:  13911


In [93]:
# Token -> id table for split B. Ids 0-2 are reserved for the special markers;
# every other token receives the next free id the first time it is seen.
vocabB = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
for row_tokens in train_setB.iloc[:, 0]:
    for tok in row_tokens:
        # setdefault only inserts when the token is new, so existing ids never change.
        vocabB.setdefault(tok, len(vocabB))
voc_sizeB = len(vocabB)
pad_idxB = vocabB['__PAD__']
print("Vocabulary Size B: ", voc_sizeB)

Vocabulary Size B:  13971


In [94]:
# build vocab
# Token -> id table for the held-out split. Ids 0-2 are reserved for the special
# markers; every other token receives the next free id the first time it is seen.
# NOTE(review): this table is built independently of vocabA / vocabB, so the same
# token can map to different ids across the three tables — confirm that is intended.
vocab_test = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
for item in test_set.iloc[:, 0]:
    for word in item:
        # setdefault only inserts when the token is new, so existing ids never change.
        vocab_test.setdefault(word, len(vocab_test))
pad_idx_test = vocab_test['__PAD__']
voc_size_test = len(vocab_test)
# Label corrected: this cell reports the test vocabulary, not vocabulary B.
print("Vocabulary Size test: ", voc_size_test)

Vocabulary Size B:  17290


# Import models

In [95]:
# Load the two saved parent state dicts. `map_location=device` remaps every stored
# tensor onto the device selected earlier, so a checkpoint written on a GPU machine
# still loads when this notebook falls back to the CPU.
# NOTE(review): both files carry the "modelA_" prefix and are tagged 1/2 while the
# CSVs loaded above are tagged 21/22 — confirm these are the intended A and B checkpoints.
paramsA = torch.load('./Models/modelA_sentiment140_processed_1_256', map_location=device)
paramsB = torch.load('./Models/modelA_sentiment140_processed_2_256', map_location=device)

In [96]:
# Take the entry stored under the token-embedding weight key out of each state dict:
# `pop` returns the tensor and removes the key in one step. The tensors are handed to
# `new_model` separately, and the trimmed dicts are what `load_state_dict` receives.
embeddingA = paramsA.pop('encoder.emb.tok_emb.embedding.weight')
embeddingB = paramsB.pop('encoder.emb.tok_emb.embedding.weight')

In [97]:
# create models
# One model per parent, built by `new_model` from that parent's own embedding tensor,
# padding index and vocabulary size. The saved weights are loaded in the next cell.
modelA = new_model(embeddingA, pad_idxA, voc_sizeA, device) # init model
modelB = new_model(embeddingB, pad_idxB, voc_sizeB, device) # init model

In [98]:
# load weights
# Restore the saved parameters into each model. The embedding entry was removed from
# both dicts in an earlier cell, so only the remaining keys are loaded here.
modelA.load_state_dict(paramsA)
modelB.load_state_dict(paramsB)

<All keys matched successfully>

# Cross-validation (CV)

### Definition

In [99]:
# Learning rate handed to `train_early_stopping` (as `lr`) and to the SGD optimizer
# that the evaluation loop creates for retraining.
LEARNING_RATE = 2e-4 # starting learning rate for scheduler

### Evaluation

In [100]:
# Keys under which each evaluated variant is recorded.
model_names = ['A', 'B', 'vanilla_pre', 'vanilla_post', 'OT_pre', 'OT_post', 'random']

# scores[metric][model_name] -> list of recorded values; every (metric, model) pair
# owns its own independent list.
scores = {
    metric: {model_name: [] for model_name in model_names}
    for metric in ('loss', 'accuracy', 'f1')
}

In [101]:
# NOTE(review): this cell is a single triple-quoted string, i.e. a disabled earlier
# draft of `get_generator` that took a train and a test frame together. It defines
# nothing at runtime. The draft's body reads `voc`, which is not among its parameters
# (`voc_train`, `voc_test`); the one-dataset `get_generator` defined later replaces it.
"""# data generators
def get_generator(train_set, test_set, voc_train, voc_test, batch_size=512):
    # dataframe to tensor
    train_y = torch.tensor(train_set.iloc[:, 1].values.astype(np.float32), device=device)
    test_y = torch.tensor(test_set.iloc[:, 1].values.astype(np.float32), device=device)

    unk_ID = voc["__UNK__"]

    train_x_tensor = []
    for idx, text_corpus in enumerate(tqdm(train_set.iloc[:, 0])):
        foo = []
        for token in text_corpus:
            word_ID = voc.get(token, unk_ID)
            foo.append(word_ID)
        while len(foo) < 256:
            foo.append(voc["__PAD__"])
        train_x_tensor.append(foo)

    test_x_tensor = []
    for idx, text_corpus in enumerate(tqdm(test_set.iloc[:, 0])):
        foo = []
        for token in text_corpus:
            word_ID = voc.get(token, unk_ID)
            foo.append(word_ID)
        while len(foo) < 256:
            foo.append(voc["__PAD__"])
        test_x_tensor.append(foo)

    train_x = torch.tensor(train_x_tensor, device=device)
    test_x = torch.tensor(test_x_tensor, device=device)

    train = torch.utils.data.TensorDataset(train_x, train_y)
    test = torch.utils.data.TensorDataset(test_x, test_y)

    train_iterator = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
    test_iterator = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size, shuffle=True)

    # check imbalance
    check_imbalance(train_iterator, name='train set')
    check_imbalance(test_iterator, name='test set')

    print('Dataset initializing done')

    return train_iterator, test_iterator"""

'# data generators\ndef get_generator(train_set, test_set, voc_train, voc_test, batch_size=512):\n    # dataframe to tensor\n    train_y = torch.tensor(train_set.iloc[:, 1].values.astype(np.float32), device=device)\n    test_y = torch.tensor(test_set.iloc[:, 1].values.astype(np.float32), device=device)\n\n    unk_ID = voc["__UNK__"]\n\n    train_x_tensor = []\n    for idx, text_corpus in enumerate(tqdm(train_set.iloc[:, 0])):\n        foo = []\n        for token in text_corpus:\n            word_ID = voc.get(token, unk_ID)\n            foo.append(word_ID)\n        while len(foo) < 256:\n            foo.append(voc["__PAD__"])\n        train_x_tensor.append(foo)\n\n    test_x_tensor = []\n    for idx, text_corpus in enumerate(tqdm(test_set.iloc[:, 0])):\n        foo = []\n        for token in text_corpus:\n            word_ID = voc.get(token, unk_ID)\n            foo.append(word_ID)\n        while len(foo) < 256:\n            foo.append(voc["__PAD__"])\n        test_x_tensor.append(foo)\

In [104]:
# data generators
def get_generator(dataset, voc, batch_size=512, name='', max_len=256, shuffle=True):
    """Encode a tokenized dataset with `voc` and wrap it in a DataLoader.

    Args:
        dataset: DataFrame whose first column holds an iterable of tokens per row
            and whose second column holds the label (cast to float32).
        voc: dict mapping token -> integer id; must contain "__UNK__" and "__PAD__".
        batch_size: number of rows per batch.
        name: label forwarded to `check_imbalance`.
        max_len: fixed width of every encoded row. Shorter rows are right-padded
            with the "__PAD__" id; longer rows are cut to `max_len` (previously a
            longer row left the list ragged and tensor construction failed).
        shuffle: whether the DataLoader reshuffles the rows on each pass.

    Returns:
        torch.utils.data.DataLoader yielding `(token_ids, label)` batches; both
        tensors are created on the notebook-level `device`.
    """
    unk_id = voc["__UNK__"]
    pad_id = voc["__PAD__"]

    encoded_rows = []
    for tokens in tqdm(dataset.iloc[:, 0]):
        # Tokens missing from the vocabulary collapse onto the "__UNK__" id.
        ids = [voc.get(token, unk_id) for token in tokens][:max_len]
        # Right-pad so that every row is exactly `max_len` ids wide.
        ids.extend([pad_id] * (max_len - len(ids)))
        encoded_rows.append(ids)

    x_tensor = torch.tensor(encoded_rows, device=device)
    y = torch.tensor(dataset.iloc[:, 1].values.astype(np.float32), device=device)

    tensor_dataset = torch.utils.data.TensorDataset(x_tensor, y)
    iterator = torch.utils.data.DataLoader(dataset=tensor_dataset, batch_size=batch_size, shuffle=shuffle)

    # check imbalance
    check_imbalance(iterator, name=name)

    print('Dataset initializing done')
    return iterator

In [107]:
%%time
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')

    # One DataLoader per split, built in the order A, B, test; each split is encoded
    # with the vocabulary paired with it below.
    # NOTE(review): the test split is encoded with `vocab_test`, which is built
    # independently of vocabA / vocabB — confirm these ids are compatible with the
    # embeddings of the models that will consume `test_iterator`.
    train_iteratorA, train_iteratorB, test_iterator = (
        get_generator(split, split_vocab, batch_size=512, name=label)
        for split, split_vocab, label in (
            (train_setA, vocabA, 'A'),
            (train_setB, vocabB, 'B'),
            (test_set, vocab_test, 'test'),
        )
    )

100%|██████████| 30000/30000 [00:01<00:00, 19893.91it/s]


Positive labels ratio (A): 0.0
Dataset initializing done


100%|██████████| 29999/29999 [00:01<00:00, 24731.18it/s]


Positive labels ratio (B): 0.0
Dataset initializing done


100%|██████████| 59999/59999 [00:02<00:00, 24212.66it/s]


Positive labels ratio (test): 1.0
Dataset initializing done
CPU times: total: 8.91 s
Wall time: 8.52 s


In [106]:
%%time
# Per-fold pipeline: build iterators, train two parent models, fuse them (vanilla and
# optimal-transport), score every variant, retrain both fused models and score them
# again. All results are appended to the notebook-level `scores` dict.
# NOTE(review): `FOLDS` and `datasets` are not defined in the cells shown in this
# notebook; the recorded output of this cell is `NameError: name 'FOLDS' is not defined`.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    for i in range(FOLDS):
        print(f'Fold {i + 1}/{FOLDS}')

        # get training, test set
        # NOTE(review): this rebinds `test_set` (and, below, `test_iterator`, `modelA`
        # and `modelB`), replacing the notebook-level objects created in earlier cells.
        train_set, test_set = datasets.pop()

        # build generators
        train_iterator, test_iterator, voc_size, pad_idx, embedding = build_generators(train_set, test_set, device, batch_size=512)

        # train parent models
        # Both parents share every argument except `model_name`.
        # NOTE(review): the test iterator is also passed as the validation iterator —
        # confirm that is intended.
        train_parent = lambda x: train_early_stopping(model_name=x,
                                                      train_iter=train_iterator,
                                                      valid_iter=test_iterator,
                                                      embedding=embedding,
                                                      pad_idx=pad_idx,
                                                      voc_size=voc_size,
                                                      device=device,
                                                      lr=LEARNING_RATE,
                                                      save=False)
        print('Starting training for model A')
        modelA = train_parent('A')
        print('Starting training for model B')
        modelB = train_parent('B')

        # model fusion
        # 1) vanilla
        model_fusion_vanilla = vanilla_fusion(modelA=modelA, modelB=modelB, pad_idx=pad_idx, voc_size=voc_size, embedding=embedding, device=device)
        # 2) optimal transport
        # An Optuna study runs N_TRIALS trials of the objective returned by
        # `weighted_fusion`; the best `weighting_factor` becomes the fusion ratio.
        N_TRIALS = 100
        study = optuna.create_study()
        study.optimize(weighted_fusion(modelA, modelB, train_iterator, test_iterator, embedding, pad_idx, voc_size, device), n_trials=N_TRIALS)
        best_weighting_factor = study.best_params['weighting_factor']
        print('Best fusion weight:', best_weighting_factor)
        model_fusion = ot_fusion(modelA, modelB, train_iterator, embedding, pad_idx, voc_size, device, fusion_ratio=best_weighting_factor)

        # evaluate
        # ensure all models on same device
        # NOTE(review): despite its name, `model_to_cpu` moves a model to `device`,
        # not to the CPU.
        model_to_cpu = lambda x: x.to(device)
        modelA = model_to_cpu(modelA)
        modelB = model_to_cpu(modelB)
        # Freshly initialised model, recorded under the name 'random'.
        model_random = new_model(embedding, pad_idx, voc_size, device)
        model_fusion = model_to_cpu(model_fusion)
        model_fusion_vanilla = model_to_cpu(model_fusion_vanilla)

        # test models
        # Names and models are paired positionally: `model_fusion` is recorded as
        # 'OT_pre' and `model_fusion_vanilla` as 'vanilla_pre'.
        for name, model in zip(('A', 'B', 'random', 'OT_pre', 'vanilla_pre'), (modelA, modelB, model_random, model_fusion, model_fusion_vanilla)):
            loss, acc, f1 = validation(model, test_iterator, nn.CrossEntropyLoss(), device) # (loss, accuracy, f1)

            # put into cpu
            # Tensors are moved to the CPU; any other value is passed through unchanged.
            to_cpu = lambda x: x.to('cpu') if isinstance(x, torch.Tensor) else x
            loss = to_cpu(loss)
            acc = to_cpu(acc)
            f1 = to_cpu(f1)

            print(name, f'loss: {loss} - accuracy: {acc} - f1: {f1}')
            # The three appends form one tuple expression; the tuple itself is discarded.
            scores['loss'][name].append(loss), scores['accuracy'][name].append(acc), scores['f1'][name].append(f1)

        # retraining
        # The SGD optimizer is constructed inside the lambda, so each call to
        # `retrain` gets a new optimizer bound to that model's parameters.
        retrain = lambda x: train(model=x,
                                  iterator=train_iterator,
                                  valid_iter=test_iterator,
                                  optimizer=torch.optim.SGD(x.parameters(), lr=LEARNING_RATE),
                                  criterion=nn.CrossEntropyLoss(),
                                  epoch='unrestricted',
                                  clip=1,
                                  device=device)
        # 1) vanilla
        # train with early stopping
        print('Starting retraining for model vanilla fusion')
        retrain(model_fusion_vanilla)

        # 2) optimal transport
        # train with early stopping
        print('Starting retraining for model OT fusion')
        retrain(model_fusion)

        # evaluate
        # ensure all models on same device
        model_fusion = model_to_cpu(model_fusion)
        model_fusion_vanilla = model_to_cpu(model_fusion_vanilla)

        # test models
        # Same scoring as before retraining, recorded under the '*_post' names.
        for name, model in zip(('OT_post', 'vanilla_post'), (model_fusion, model_fusion_vanilla)):
            loss, acc, f1 = validation(model, test_iterator, nn.CrossEntropyLoss(), device) # (loss, accuracy, f1)

            # put into cpu
            to_cpu = lambda x: x.to('cpu') if isinstance(x, torch.Tensor) else x
            loss = to_cpu(loss)
            acc = to_cpu(acc)
            f1 = to_cpu(f1)

            print(name, f'loss: {loss} - accuracy: {acc} - f1: {f1}')
            scores['loss'][name].append(loss), scores['accuracy'][name].append(acc), scores['f1'][name].append(f1)

NameError: name 'FOLDS' is not defined

### Export as LaTeX

In [13]:
# Human-readable row labels for the exported table, listed in the same order as
# the keys in `model_names`.
model_names_latex = [
    'Model A',
    'Model B',
    'Vanilla',
    'Vanilla (retraining)',
    'Optimal transport',
    'Optimal transport (retraining)',
    'Untrained model (baseline)',
]

In [14]:
# Turn the accumulated `scores` into LaTeX with `scores_to_latex`, using the display
# labels defined in the previous cell. The helper is not defined in this notebook; per
# the recorded output it prints the raw score table and a LaTeX `table` environment.
# NOTE(review): in the recorded run every score list was empty, so the exported table
# contains only "nan ± nan" entries.
latex = scores_to_latex(scores, model_names_latex)

             loss accuracy  f1
A              []       []  []
B              []       []  []
vanilla_pre    []       []  []
vanilla_post   []       []  []
OT_pre         []       []  []
OT_post        []       []  []
random         []       []  []
\begin{table}[H]
\centering
\caption{Model performance (5-fold CV)}
\begin{tabular}{llll}
\toprule
{} &             Loss &         Accuracy &         F1 score \\
\midrule
\textbf{Model A                       } &  \textbf{nan ± nan} &  \textbf{nan ± nan} &  \textbf{nan ± nan} \\
\textbf{Model B                       } &        nan ± nan &        nan ± nan &        nan ± nan \\
\textbf{Vanilla                       } &        nan ± nan &        nan ± nan &        nan ± nan \\
\textbf{Vanilla (retraining)          } &        nan ± nan &        nan ± nan &        nan ± nan \\
\textbf{Optimal transport             } &        nan ± nan &        nan ± nan &        nan ± nan \\
\textbf{Optimal transport (retraining)} &        nan ± nan &        nan 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  latex = df.to_latex(index=True,


In [15]:
# save as latex (in text format) (optional)
# The table text contains the non-ASCII "±" sign, so the encoding is fixed to UTF-8
# instead of depending on the platform's default text encoding.
with open('./Output/scores_different_seeds (weighted full retraining).txt', 'w', encoding='utf-8') as dat:
    dat.write(str(latex))