In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 1

In [5]:
import pickle
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
test_index = [i for i, _ in enumerate(X_test)]

In [8]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [9]:
import itertools
features_index = {w:ix for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}
inv_features_index = {ix:w for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}

In [10]:
X_train_matrix = np.zeros((len(X_train), len(features_index)))
X_test_matrix = np.zeros((len(X_test), len(features_index)))

In [11]:
for i, x in enumerate(X_train):
  for w in x:
    if w in features_index:
      X_train_matrix[i,features_index[w]] += 1

for i, x in enumerate(X_test):
  for w in x:
    if w in features_index:
      X_test_matrix[i,features_index[w]] += 1

In [12]:
data_train_doc_ids = [(i, d) for i, d in enumerate(X_train)]
data_test_doc_ids = [(i, d) for i, d in enumerate(X_test)]

In [13]:
target_train_doc_ids = [(i, y) for i, y in enumerate(Y_train)]
target_test_doc_ids = [(i, y) for i, y in enumerate(Y_test)]

In [14]:
doc_train_ids = np.array([i for i, _ in data_train_doc_ids])
doc_test_ids = np.array([i for i, _ in data_test_doc_ids])
doc_train_dict = {k:v for k, v in [(i, d) for i, d in data_train_doc_ids]}
doc_test_dict = {k:v for k, v in [(i, d) for i, d in data_test_doc_ids]}
target_train_dict = {k:v for k, v in [(i, y) for i, y in target_train_doc_ids]}
target_test_dict = {k:v for k, v in [(i, y) for i, y in target_test_doc_ids]}

In [15]:
doc_train_ids[train_index]

array([  1,   2,   3,   4,   6,  10,  11,  12,  15,  16,  17,  18,  19,
        20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  44,  45,  46,  47,
        48,  49,  50,  51,  52,  54,  56,  57,  58,  59,  60,  61,  62,
        63,  67,  68,  69,  70,  71,  72,  74,  75,  77,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,
        98, 100, 101, 102, 104, 105, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 123, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 136, 138, 139, 141, 142, 143, 145, 146,
       148, 150, 151, 152, 153, 154, 156, 157, 160, 162, 163, 164, 166,
       167, 170, 171, 172, 173, 174, 177, 178, 179, 180, 181, 182, 184,
       185, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199,
       200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
       214, 215, 219, 222, 223, 224, 225, 226, 227, 229, 233, 23

In [16]:
doc_train_ids[valid_index]

array([  0,   5,   7,   8,   9,  13,  14,  30,  43,  53,  55,  64,  65,
        66,  73,  76,  78,  79,  80,  97,  99, 103, 106, 121, 122, 124,
       135, 137, 140, 144, 147, 149, 155, 158, 159, 161, 165, 168, 169,
       175, 176, 183, 186, 192, 201, 216, 217, 218, 220, 221, 228, 230,
       231, 232, 235, 245, 248, 254, 274, 283, 284, 288, 294, 295, 301,
       308, 316, 319, 324, 330, 336, 337, 351, 355, 356, 358, 359, 360,
       369, 371, 372, 376, 389, 393, 411, 413, 418, 427, 428, 430, 434,
       436, 437, 438, 453, 461, 463, 464, 467, 469, 485, 491, 502, 517,
       518, 520, 522, 527, 552, 559, 560, 568, 575, 578, 583, 592, 595,
       598, 599, 603, 616, 618, 619, 623, 626, 630, 631, 632, 635, 637,
       645, 646, 647, 649, 654, 662, 664, 666, 667, 669, 670, 674, 675,
       679, 680, 682, 691, 692, 703, 706, 712, 713, 724, 726, 735, 736,
       744, 754, 755, 757, 758, 765, 768, 777, 790, 794, 798, 799, 803,
       810, 824, 829, 835, 850, 852, 860, 874, 885])

In [17]:
doc_test_ids[test_index]

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [18]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 8.1MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 23.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=80c8d5ab2c1126d25a8

In [19]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import torch.utils.data as tdata
import torch.optim as optim

import tqdm

torch.manual_seed(0)
np.random.seed(0)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
X0_train_tensor = torch.from_numpy(X_train_matrix[train_index]).float().to(device)
X0_valid_tensor = torch.from_numpy(X_train_matrix[valid_index]).float().to(device)
X0_test_tensor = torch.from_numpy(X_test_matrix).float().to(device)

In [22]:
np.max([len(clss) for doc, clss in train_cls_token_embeddings.items()])

285

In [23]:
max_len = 50

In [24]:
def manual_padding(sent, max_len = 200):
  pad_tensor = [torch.from_numpy(np.zeros((1, 768))).float()]
  if len(sent) > max_len:
    res = sent[-max_len:]
  else:
    res = (pad_tensor * (max_len - len(sent))) + sent
  return res

In [25]:
X1_train_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in train_index]).squeeze(2).to(device)
X2_train_tensor = torch.stack([train_last_layer_embeddings[i] for i in train_index]).squeeze(1).to(device)
X3_train_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in train_index]).squeeze(1).to(device)

In [26]:
X1_valid_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in valid_index]).squeeze(2).to(device)
X2_valid_tensor = torch.stack([train_last_layer_embeddings[i] for i in valid_index]).squeeze(1).to(device)
X3_valid_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in valid_index]).squeeze(1).to(device)

In [27]:
X1_test_tensor = torch.stack([torch.stack(manual_padding(sent=test_cls_token_embeddings[i], max_len = max_len)) for i in test_index]).squeeze(2).to(device)
X2_test_tensor = torch.stack([test_last_layer_embeddings[i] for i in test_index]).squeeze(1).to(device)
X3_test_tensor = torch.stack([torch.flatten(test_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in test_index]).squeeze(1).to(device)

In [28]:
Y_train_tensor = torch.LongTensor(np.array(Y_train)[train_index]).to(device)
Y_valid_tensor = torch.LongTensor(np.array(Y_train)[valid_index]).to(device)
Y_test_tensor = torch.LongTensor(np.array(Y_test)).to(device)

In [29]:
X0_train_tensor.shape

torch.Size([710, 246510])

In [30]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [31]:
X2_train_tensor.shape

torch.Size([710, 768])

In [32]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [33]:
input_dim_0 = X0_train_tensor.shape[1]
input_dim_2 = X2_train_tensor.shape[1]
input_dim_3 = X3_train_tensor.shape[1]
EMBEDDING_DIM = X1_train_tensor.shape[2]

In [34]:
input_dim_0

246510

In [35]:
input_dim_2

768

In [36]:
input_dim_3

9984

In [37]:
EMBEDDING_DIM

768

In [38]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

        hidden_dim = 50
        num_layers = 2

        self.fc1 = nn.Linear(input_dim_0, 50)

        #self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, num_layers)
        self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, num_layers)
        
        self.fc_out = nn.Linear(50 + hidden_dim, 2)

        self.softmax = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x1, x2):
        x1 = F.normalize(x1)
        h1 = self.dropout(self.fc1(x1))

        x2 = F.normalize(x2)
        x2 = x2.permute(1,0,2)
        lstm_out, hidden = self.lstm(x2)
        h2 = self.dropout(lstm_out[-1])

        # Concatenate in dim1 (feature dimension)
        x = torch.cat((h1, h2), 1)

        y = self.softmax(self.fc_out(x))
        return y

In [39]:
model = MyModel()
model.to(device)

MyModel(
  (fc1): Linear(in_features=246510, out_features=50, bias=True)
  (lstm): LSTM(768, 50, num_layers=2)
  (fc_out): Linear(in_features=100, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [40]:
X0_train_tensor.shape

torch.Size([710, 246510])

In [41]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [42]:
X2_train_tensor.shape

torch.Size([710, 768])

In [43]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [44]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [45]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [46]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [47]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [48]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)


for i in range(100):
  model.train()
  optimizer.zero_grad()
  prediction = model(X0_train_tensor, X1_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X0_valid_tensor, X1_valid_tensor)
  test_prediction = model(X0_test_tensor, X1_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.674205).  Saving model ...
0 0.6917068362236023 0.202816903591156 0.6742053627967834 0.8258426785469055 0.6751478910446167 0.834645688533783
Validation loss decreased (0.674205 --> 0.649745).  Saving model ...
1 0.6606553792953491 0.8267605900764465 0.6497449278831482 0.3764044940471649 0.6742716431617737 0.3044619560241699
Validation loss decreased (0.649745 --> 0.598487).  Saving model ...
2 0.6372442245483398 0.3746478855609894 0.5984874367713928 0.8258426785469055 0.6261024475097656 0.7769029140472412
EarlyStopping counter: 1 out of 20
3 0.5583387613296509 0.8732394576072693 0.6351451277732849 0.882022500038147 0.6410955190658569 0.874015748500824
Validation loss decreased (0.598487 --> 0.559377).  Saving model ...
4 0.576991856098175 0.8999999761581421 0.5593774318695068 0.7359550595283508 0.6026454567909241 0.6850393414497375
EarlyStopping counter: 1 out of 20
5 0.4991770088672638 0.8028169274330139 0.5619143843650818 0.6348314881324768 0.6134

<All keys matched successfully>

In [49]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[1.0, 0.9919354915618896]
[0.9407894611358643, 0.8461538553237915]
[0.9345238208770752, 0.9111111164093018]


In [50]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[0.9982964396476746, 1.0]
[0.9727891087532043, 0.7096773982048035]
[0.9874213933944702, 0.6507936716079712]


In [51]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

0.9985915422439575
0.9269663095474243
0.9317585229873657


In [52]:
with open('/content/drive/My Drive/Data Master/Prediction_BOW_BERT_LSTM/test_prediction_split' + str(n_split), 'wb') as file:
    pickle.dump(test_prediction.detach().numpy(), file)