In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 3

In [5]:
import pickle
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
test_index = [i for i, _ in enumerate(X_test)]

In [8]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [9]:
import itertools
features_index = {w:ix for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}
inv_features_index = {ix:w for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}

In [10]:
X_train_matrix = np.zeros((len(X_train), len(features_index)))
X_test_matrix = np.zeros((len(X_test), len(features_index)))

In [11]:
for i, x in enumerate(X_train):
  for w in x:
    if w in features_index:
      X_train_matrix[i,features_index[w]] += 1

for i, x in enumerate(X_test):
  for w in x:
    if w in features_index:
      X_test_matrix[i,features_index[w]] += 1

In [12]:
data_train_doc_ids = [(i, d) for i, d in enumerate(X_train)]
data_test_doc_ids = [(i, d) for i, d in enumerate(X_test)]

In [13]:
target_train_doc_ids = [(i, y) for i, y in enumerate(Y_train)]
target_test_doc_ids = [(i, y) for i, y in enumerate(Y_test)]

In [14]:
doc_train_ids = np.array([i for i, _ in data_train_doc_ids])
doc_test_ids = np.array([i for i, _ in data_test_doc_ids])
doc_train_dict = {k:v for k, v in [(i, d) for i, d in data_train_doc_ids]}
doc_test_dict = {k:v for k, v in [(i, d) for i, d in data_test_doc_ids]}
target_train_dict = {k:v for k, v in [(i, y) for i, y in target_train_doc_ids]}
target_test_dict = {k:v for k, v in [(i, y) for i, y in target_test_doc_ids]}

In [15]:
doc_train_ids[train_index]

array([  0,   1,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  17,  18,  19,  23,  25,  26,  27,  28,  29,  30,  31,  33,
        34,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  47,  48,
        50,  51,  52,  53,  54,  55,  56,  57,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  70,  71,  72,  73,  74,  76,  77,  78,  79,
        80,  81,  83,  84,  85,  86,  87,  89,  90,  91,  92,  93,  94,
        95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
       108, 110, 112, 113, 114, 115, 116, 117, 118, 121, 122, 123, 124,
       125, 126, 127, 128, 129, 131, 133, 134, 135, 136, 137, 138, 139,
       140, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154,
       155, 156, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 183,
       186, 189, 190, 191, 192, 193, 194, 195, 196, 198, 200, 201, 202,
       203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 21

In [16]:
doc_train_ids[valid_index]

array([  2,  15,  16,  20,  21,  22,  24,  32,  35,  46,  49,  58,  59,
        69,  75,  82,  88, 109, 111, 119, 120, 130, 132, 141, 150, 157,
       160, 178, 184, 185, 187, 188, 197, 199, 208, 225, 233, 236, 237,
       239, 247, 250, 253, 258, 267, 271, 272, 278, 281, 290, 291, 296,
       299, 300, 304, 311, 325, 326, 327, 329, 331, 332, 333, 334, 338,
       343, 345, 348, 363, 366, 367, 379, 382, 387, 390, 398, 401, 402,
       405, 407, 414, 415, 419, 423, 424, 446, 447, 452, 457, 458, 466,
       473, 474, 480, 494, 495, 505, 512, 513, 521, 526, 531, 533, 535,
       541, 543, 544, 545, 547, 548, 551, 554, 556, 558, 561, 565, 566,
       572, 574, 576, 577, 579, 580, 582, 585, 587, 600, 602, 605, 606,
       607, 611, 613, 614, 615, 621, 640, 643, 660, 661, 665, 671, 672,
       685, 698, 730, 734, 742, 743, 748, 761, 784, 785, 788, 796, 800,
       806, 811, 814, 816, 818, 826, 827, 830, 839, 840, 841, 844, 851,
       856, 859, 861, 863, 870, 875, 880, 881, 882])

In [17]:
doc_test_ids[test_index]

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [18]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 2.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 9.9MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 17.4MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)


In [19]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import torch.utils.data as tdata
import torch.optim as optim

import tqdm

torch.manual_seed(0)
np.random.seed(0)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
X0_train_tensor = torch.from_numpy(X_train_matrix[train_index]).float().to(device)
X0_valid_tensor = torch.from_numpy(X_train_matrix[valid_index]).float().to(device)
X0_test_tensor = torch.from_numpy(X_test_matrix).float().to(device)

In [22]:
np.max([len(clss) for doc, clss in train_cls_token_embeddings.items()])

285

In [23]:
max_len = 50

In [24]:
def manual_padding(sent, max_len = 200):
  pad_tensor = [torch.from_numpy(np.zeros((1, 768))).float()]
  if len(sent) > max_len:
    res = sent[-max_len:]
  else:
    res = (pad_tensor * (max_len - len(sent))) + sent
  return res

In [25]:
X1_train_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in train_index]).squeeze(2).to(device)
X2_train_tensor = torch.stack([train_last_layer_embeddings[i] for i in train_index]).squeeze(1).to(device)
X3_train_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in train_index]).squeeze(1).to(device)

In [26]:
X1_valid_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in valid_index]).squeeze(2).to(device)
X2_valid_tensor = torch.stack([train_last_layer_embeddings[i] for i in valid_index]).squeeze(1).to(device)
X3_valid_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in valid_index]).squeeze(1).to(device)

In [27]:
X1_test_tensor = torch.stack([torch.stack(manual_padding(sent=test_cls_token_embeddings[i], max_len = max_len)) for i in test_index]).squeeze(2).to(device)
X2_test_tensor = torch.stack([test_last_layer_embeddings[i] for i in test_index]).squeeze(1).to(device)
X3_test_tensor = torch.stack([torch.flatten(test_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in test_index]).squeeze(1).to(device)

In [28]:
Y_train_tensor = torch.LongTensor(np.array(Y_train)[train_index]).to(device)
Y_valid_tensor = torch.LongTensor(np.array(Y_train)[valid_index]).to(device)
Y_test_tensor = torch.LongTensor(np.array(Y_test)).to(device)

In [29]:
X0_train_tensor.shape

torch.Size([710, 245166])

In [30]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [31]:
X2_train_tensor.shape

torch.Size([710, 768])

In [32]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [33]:
input_dim_0 = X0_train_tensor.shape[1]
input_dim_2 = X2_train_tensor.shape[1]
input_dim_3 = X3_train_tensor.shape[1]
EMBEDDING_DIM = X1_train_tensor.shape[2]

In [34]:
input_dim_0

245166

In [35]:
input_dim_2

768

In [36]:
input_dim_3

9984

In [37]:
EMBEDDING_DIM

768

In [38]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

        hidden_dim = 50
        num_layers = 2

        #self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, num_layers)
        self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, num_layers)
        
        self.fc_out = nn.Linear(hidden_dim, 2)

        self.softmax = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x2):
        x2 = F.normalize(x2)
        x2 = x2.permute(1,0,2)
        lstm_out, hidden = self.lstm(x2)
        h2 = self.dropout(lstm_out[-1])

        # Concatenate in dim1 (feature dimension)

        y = self.softmax(self.fc_out(h2))
        return y

In [39]:
model = MyModel()
model.to(device)

MyModel(
  (lstm): LSTM(768, 50, num_layers=2)
  (fc_out): Linear(in_features=50, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [40]:
X0_train_tensor.shape

torch.Size([710, 245166])

In [41]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [42]:
X2_train_tensor.shape

torch.Size([710, 768])

In [43]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [44]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [45]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [46]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [47]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [48]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)


for i in range(200):
  model.train()
  optimizer.zero_grad()
  prediction = model(X1_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X1_valid_tensor)
  test_prediction = model(X1_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.661601).  Saving model ...
0 0.6919524669647217 0.8267605900764465 0.6616005301475525 0.8426966071128845 0.6667419075965881 0.8582677245140076
Validation loss decreased (0.661601 --> 0.611015).  Saving model ...
1 0.6536290049552917 0.8577464818954468 0.611014723777771 0.6123595237731934 0.6415847539901733 0.5669291615486145
Validation loss decreased (0.611015 --> 0.583168).  Saving model ...
2 0.5945330262184143 0.6352112889289856 0.583167552947998 0.8483145833015442 0.5988492965698242 0.8635170459747314
Validation loss decreased (0.583168 --> 0.552978).  Saving model ...
3 0.5367887616157532 0.8788732290267944 0.5529783964157104 0.8876404762268066 0.562003493309021 0.8976377844810486
Validation loss decreased (0.552978 --> 0.527669).  Saving model ...
4 0.5076143741607666 0.8971831202507019 0.5276694893836975 0.8089887499809265 0.5209231376647949 0.808398962020874
Validation loss decreased (0.527669 --> 0.523007).  Saving model ...
5 0.49461874365

<All keys matched successfully>

In [49]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[0.9792746305465698, 0.847328245639801]
[0.9399999976158142, 0.7857142686843872]
[0.9335442781448364, 0.6461538672447205]


In [50]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[0.9659284353256226, 0.9024389982223511]
[0.9591836929321289, 0.7096773982048035]
[0.9276729822158813, 0.6666666865348816]


In [51]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

0.9549295902252197
0.915730357170105
0.8845144510269165
