In [2]:
import torch
import torch.nn as nn
import math
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchtext.data import Field, BucketIterator
from sklearn.model_selection import train_test_split
from torchtext.vocab import build_vocab_from_iterator
import re
from transformers import BertTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [1]:
from utils import *

# Import data

In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [4]:
# Load the raw dataset through the project helper.
data = import_data()

# Split into train / validation / test iterators (batches of 512); the helper
# also returns the vocabulary size and the padding index.
(train_iter, valid_iter, test_iter,
 voc_size, pad_idx) = train_test_val_split(data, device, batch_size=512)

# Token embedding table: one 16-dimensional vector per vocabulary entry.
embedding = torch.nn.Embedding(num_embeddings=voc_size, embedding_dim=16)

dataset initializing start
Tokenizing the data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["len"] = data.iloc[:, 0].apply(lambda x : len(self.tokenize(x)))


Length of the data :  29544
0
review       [[CLS], one, of, the, many, silent, comedies, ...
sentiment                                                    0
len                                                        186
Name: 46539, dtype: object


100%|██████████| 23635/23635 [00:00<00:00, 38358.18it/s]
100%|██████████| 2954/2954 [00:00<00:00, 35953.82it/s]
100%|██████████| 2955/2955 [00:00<00:00, 40028.19it/s]


dataset initializing done
Vocabulary Size :  23050


# Training parent models

Idea: We train model A and model B long enough that they start overfitting. We take each parent's best model w.r.t. the validation set (i.e., not the final model after all training epochs) and fuse the two. The fused model is then also trained long enough, saving its best model w.r.t. the same validation set. Finally, the fused model is compared with its parent models on the separate test set.


Note that the dataset is imbalanced, so accuracy is not a good metric.

In [None]:
# template for training parent models (as we train them the same way)
def train_early_stopping(model_name: str, train_iter, valid_iter, epochs, device, lr=2e-4):
    """Train one parent transformer and persist its best weights and history.

    A fresh ``TransformerClassifier`` is built around the notebook-level
    ``embedding``, ``pad_idx`` and ``voc_size``, trained with Adam and
    cross-entropy through ``train_save_best``, and then written to disk
    together with its training history under ``parallel_training/``.

    Args:
        model_name: Suffix identifying the parent (e.g. ``'A'``); it is
            embedded in the names of the saved model and history.
        train_iter: Training batches, forwarded to ``train_save_best`` as
            ``iterator``.
        valid_iter: Validation batches, forwarded to ``train_save_best``.
        epochs: Number of epochs, forwarded to ``train_save_best`` as ``epoch``.
        device: Device the model is moved to and that ``train_save_best`` uses.
        lr: Learning rate of the Adam optimizer.

    Returns:
        None. The results are persisted via ``save_model`` and ``save_history``.
    """
    # init
    model = TransformerClassifier(embedding=embedding,
                                  src_pad_idx=pad_idx,
                                  enc_voc_size=voc_size,
                                  max_len=256,
                                  d_model=16,
                                  ffn_hidden=32,
                                  n_head=1,
                                  n_layers=1,
                                  drop_prob=0.5,
                                  device=device)
    model = model.to(device)  # put on CPU/GPU

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # train for all epochs while tracking the best weights seen so far
    history, best_model, _ = train_save_best(model=model,
                                             iterator=train_iter,
                                             valid_iter=valid_iter,
                                             optimizer=opt,
                                             criterion=loss_fn,
                                             epoch=epochs,
                                             clip=1,
                                             device=device)

    # After training, `model` holds the last-epoch weights, which may already
    # be overfitted. Restore the best weights before saving so the checkpoint
    # on disk is the one the fusion step is meant to use. `best_model` is
    # treated as a state dict, as in the retraining section of this notebook,
    # where the same return value is passed to `load_state_dict`.
    if best_model is not None:
        model.load_state_dict(best_model)

    # save model
    save_model(model, name=f'parallel_training/model{model_name}_IMDB_256')

    # save history
    save_history(history, name=f'parallel_training/history_model{model_name}_IMDB_256')

In [None]:
%%time
# Parent A: 100 epochs at lr=2e-4; train_early_stopping writes the model and
# its history to disk.
train_early_stopping('A', train_iter, valid_iter, epochs=100, device=device, lr=2e-4)

In [None]:
%%time
# Parent B: same recipe as parent A (100 epochs, lr=2e-4), saved under its own
# name.
train_early_stopping('B', train_iter, valid_iter, epochs=100, device=device, lr=2e-4)

# Model fusion

### (Optional) load models

In [None]:
# load parent models
# NOTE(review): the hyperparameters here (d_model=512, ffn_hidden=2048,
# drop_prob=0.1, no `embedding` argument) and the ./Models/ paths differ from
# what train_early_stopping uses and saves (d_model=16, ffn_hidden=32,
# drop_prob=0.5, parallel_training/...). Confirm that the checkpoints loaded
# below come from a run with this architecture.
parent_kwargs = dict(src_pad_idx=pad_idx,
                     enc_voc_size=voc_size,
                     max_len=256,
                     d_model=512,
                     ffn_hidden=2048,
                     n_head=1,
                     n_layers=1,
                     drop_prob=0.1,
                     device=device)

# Both parents share one architecture, so build them from the same settings.
modelA = TransformerClassifier(**parent_kwargs)
modelB = TransformerClassifier(**parent_kwargs)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
modelA.load_state_dict(torch.load('./Models/modelA', map_location=device))
modelB.load_state_dict(torch.load('./Models/modelB', map_location=device))

modelA = modelA.to(device)  # put on CPU/GPU
modelB = modelB.to(device)  # put on CPU/GPU

### Fusion

Vanilla fusion

In [None]:
# Fuse the two parents with the vanilla scheme and place the result on the
# active device.
model_fusion = vanilla_fusion(modelA, modelB).to(device)

Optimal transport

In [None]:
# Build an empty classifier and fill it with the stored optimal-transport
# fusion weights (the state before any retraining, per the checkpoint name).
# Running this cell replaces any `model_fusion` created by the vanilla-fusion
# cell.
model_fusion = TransformerClassifier(src_pad_idx=pad_idx,
                                     enc_voc_size=voc_size,
                                     max_len=256,
                                     d_model=512,
                                     ffn_hidden=2048,
                                     n_head=1,
                                     n_layers=1,
                                     drop_prob=0.1,
                                     device=device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model_fusion.load_state_dict(
    torch.load('./Models/model_fusion_OT_pre_retraining', map_location=device)
)

model_fusion = model_fusion.to(device)

### Test fusion

Randomly initialized model for comparison

In [None]:
# Baseline: run the fusion test with an untrained, randomly initialized
# transformer in place of the fused model.
test_fusion(
    modelA,
    modelB,
    TransformerClassifier(
        src_pad_idx=pad_idx,
        enc_voc_size=voc_size,
        max_len=256,
        d_model=512,
        ffn_hidden=2048,
        n_head=1,
        n_layers=1,
        drop_prob=0.1,
        device=device,
    ).to(device),
)

Fusion model

In [None]:
# Test the current fused model together with its two parents. `model_fusion`
# is whichever fusion cell above ran last (vanilla fusion or the loaded
# optimal-transport checkpoint).
test_fusion(modelA, modelB, model_fusion)

### Retraining

In [None]:
# Base name for the retrained fusion artifacts written under ./Models/ by the
# save cell below (weights file and history_<name>.txt).
model_name = 'model_fusion_OT_post_retraining'

In [None]:
%%time
# Retrain the fused model with plain SGD and cross-entropy.
opt_fusion = torch.optim.SGD(model_fusion.parameters(), lr=0.001)
loss_fn_fusion = nn.CrossEntropyLoss()

epochs = 200

# Pass the validation iterator explicitly, exactly as the parent models do, so
# the best fused model is selected on the same validation set as the parents.
history_fusion, best_model_fusion, best_model_score_fusion = train_save_best(model=model_fusion,
                                                                             iterator=train_iter,
                                                                             valid_iter=valid_iter,
                                                                             optimizer=opt_fusion,
                                                                             criterion=loss_fn_fusion,
                                                                             epoch=epochs,
                                                                             clip=1,
                                                                             device=device)

In [None]:
# Persist the best fused weights returned by train_save_best under ./Models/.
torch.save(best_model_fusion, f'./Models/{model_name}')

# Write the training history in its plain str() form to the same directory.
with open(f'./Models/history_{model_name}.txt', 'w') as dat:
    print(history_fusion, file=dat, end='')

In [None]:
# Restore the best weights returned by train_save_best. Without this,
# `model_fusion` keeps the weights from the end of the full training run.
model_fusion.load_state_dict(best_model_fusion)

### Test fusion (after retraining)

In [None]:
# Re-run the fusion test with the retrained fused model (best weights
# restored above) and its two parents.
test_fusion(modelA, modelB, model_fusion)