In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 2

In [5]:
import pickle
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    train_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_last_layer_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_all_layers_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/test_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl', 'rb') as file:
    test_cls_token_embeddings = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
test_index = [i for i, _ in enumerate(X_test)]

In [8]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [9]:
import itertools
features_index = {w:ix for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}
inv_features_index = {ix:w for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}

In [10]:
X_train_matrix = np.zeros((len(X_train), len(features_index)))
X_test_matrix = np.zeros((len(X_test), len(features_index)))

In [11]:
for i, x in enumerate(X_train):
  for w in x:
    if w in features_index:
      X_train_matrix[i,features_index[w]] += 1

for i, x in enumerate(X_test):
  for w in x:
    if w in features_index:
      X_test_matrix[i,features_index[w]] += 1

In [12]:
data_train_doc_ids = [(i, d) for i, d in enumerate(X_train)]
data_test_doc_ids = [(i, d) for i, d in enumerate(X_test)]

In [13]:
target_train_doc_ids = [(i, y) for i, y in enumerate(Y_train)]
target_test_doc_ids = [(i, y) for i, y in enumerate(Y_test)]

In [14]:
doc_train_ids = np.array([i for i, _ in data_train_doc_ids])
doc_test_ids = np.array([i for i, _ in data_test_doc_ids])
doc_train_dict = {k:v for k, v in [(i, d) for i, d in data_train_doc_ids]}
doc_test_dict = {k:v for k, v in [(i, d) for i, d in data_test_doc_ids]}
target_train_dict = {k:v for k, v in [(i, y) for i, y in target_train_doc_ids]}
target_test_dict = {k:v for k, v in [(i, y) for i, y in target_test_doc_ids]}

In [15]:
doc_train_ids[train_index]

array([  0,   1,   2,   5,   6,   7,   8,   9,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  24,  26,  29,  30,  31,  32,
        33,  34,  35,  36,  37,  39,  40,  41,  43,  44,  45,  46,  47,
        49,  50,  51,  52,  53,  54,  55,  56,  58,  59,  60,  64,  65,
        66,  67,  68,  69,  70,  72,  73,  74,  75,  76,  78,  79,  80,
        81,  82,  85,  86,  88,  91,  92,  93,  94,  95,  96,  97,  98,
        99, 102, 103, 104, 106, 107, 108, 109, 110, 111, 112, 114, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 128, 130, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       148, 149, 150, 151, 153, 154, 155, 157, 158, 159, 160, 161, 163,
       164, 165, 166, 167, 168, 169, 170, 171, 172, 175, 176, 177, 178,
       181, 182, 183, 184, 185, 186, 187, 188, 189, 192, 193, 194, 195,
       197, 198, 199, 200, 201, 202, 204, 205, 208, 209, 212, 215, 216,
       217, 218, 219, 220, 221, 224, 225, 228, 229, 230, 231, 23

In [16]:
doc_train_ids[valid_index]

array([  3,   4,  10,  23,  25,  27,  28,  38,  42,  48,  57,  61,  62,
        63,  71,  77,  83,  84,  87,  89,  90, 100, 101, 105, 113, 115,
       116, 117, 127, 129, 131, 152, 156, 162, 173, 174, 179, 180, 190,
       191, 196, 203, 206, 207, 210, 211, 213, 214, 222, 223, 226, 227,
       234, 238, 252, 255, 257, 264, 279, 280, 282, 297, 302, 303, 306,
       307, 322, 339, 340, 342, 350, 364, 385, 391, 392, 395, 397, 406,
       408, 433, 435, 441, 442, 445, 449, 450, 456, 462, 470, 481, 484,
       486, 488, 496, 499, 500, 504, 509, 511, 524, 525, 528, 530, 532,
       538, 540, 555, 563, 567, 573, 586, 588, 591, 601, 610, 629, 636,
       642, 655, 656, 676, 678, 684, 686, 688, 689, 690, 697, 702, 705,
       707, 708, 709, 710, 720, 728, 729, 731, 733, 737, 738, 740, 745,
       750, 759, 760, 769, 770, 773, 774, 776, 779, 781, 786, 789, 795,
       797, 802, 807, 809, 813, 821, 832, 838, 842, 847, 848, 855, 857,
       865, 866, 867, 868, 869, 876, 877, 879, 883])

In [17]:
doc_test_ids[test_index]

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [18]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |▍                               | 10kB 18.3MB/s eta 0:00:01[K     |▉                               | 20kB 1.7MB/s eta 0:00:01[K     |█▎                              | 30kB 2.3MB/s eta 0:00:01[K     |█▊                              | 40kB 2.5MB/s eta 0:00:01[K     |██▏                             | 51kB 2.0MB/s eta 0:00:01[K     |██▋                             | 61kB 2.3MB/s eta 0:00:01[K     |███                             | 71kB 2.5MB/s eta 0:00:01[K     |███▌                            | 81kB 2.7MB/s eta 0:00:01[K     |████                            | 92kB 2.9MB/s eta 0:00:01[K     |████▍                           | 102kB 2.8MB/s eta 0:00:01[K     |████▊                           | 112kB 2.8MB/s eta 0:00:01[K     |█████▏                          | 122kB 2.8M

In [19]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import torch.utils.data as tdata
import torch.optim as optim

import tqdm

torch.manual_seed(0)
np.random.seed(0)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
X0_train_tensor = torch.from_numpy(X_train_matrix[train_index]).float().to(device)
X0_valid_tensor = torch.from_numpy(X_train_matrix[valid_index]).float().to(device)
X0_test_tensor = torch.from_numpy(X_test_matrix).float().to(device)

In [22]:
np.max([len(clss) for doc, clss in train_cls_token_embeddings.items()])

285

In [23]:
max_len = 50

In [24]:
def manual_padding(sent, max_len = 200):
  pad_tensor = [torch.from_numpy(np.zeros((1, 768))).float()]
  if len(sent) > max_len:
    res = sent[-max_len:]
  else:
    res = (pad_tensor * (max_len - len(sent))) + sent
  return res

In [25]:
X1_train_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in train_index]).squeeze(2).to(device)
X2_train_tensor = torch.stack([train_last_layer_embeddings[i] for i in train_index]).squeeze(1).to(device)
X3_train_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in train_index]).squeeze(1).to(device)

In [26]:
X1_valid_tensor = torch.stack([torch.stack(manual_padding(sent=train_cls_token_embeddings[i], max_len = max_len)) for i in valid_index]).squeeze(2).to(device)
X2_valid_tensor = torch.stack([train_last_layer_embeddings[i] for i in valid_index]).squeeze(1).to(device)
X3_valid_tensor = torch.stack([torch.flatten(train_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in valid_index]).squeeze(1).to(device)

In [27]:
X1_test_tensor = torch.stack([torch.stack(manual_padding(sent=test_cls_token_embeddings[i], max_len = max_len)) for i in test_index]).squeeze(2).to(device)
X2_test_tensor = torch.stack([test_last_layer_embeddings[i] for i in test_index]).squeeze(1).to(device)
X3_test_tensor = torch.stack([torch.flatten(test_all_layers_embeddings[i].permute(1, 0, 2), start_dim=1) for i in test_index]).squeeze(1).to(device)

In [28]:
Y_train_tensor = torch.LongTensor(np.array(Y_train)[train_index]).to(device)
Y_valid_tensor = torch.LongTensor(np.array(Y_train)[valid_index]).to(device)
Y_test_tensor = torch.LongTensor(np.array(Y_test)).to(device)

In [29]:
X0_train_tensor.shape

torch.Size([710, 248563])

In [30]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [31]:
X2_train_tensor.shape

torch.Size([710, 768])

In [32]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [33]:
input_dim_0 = X0_train_tensor.shape[1]
input_dim_2 = X2_train_tensor.shape[1]
input_dim_3 = X3_train_tensor.shape[1]
EMBEDDING_DIM = X1_train_tensor.shape[2]

In [34]:
input_dim_0

248563

In [35]:
input_dim_2

768

In [36]:
input_dim_3

9984

In [37]:
EMBEDDING_DIM

768

In [38]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_dim_0, 50)
        self.conv2 = nn.Sequential(
          nn.Conv1d(in_channels=EMBEDDING_DIM, out_channels=64, kernel_size=2),
          nn.ReLU(),
          #nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3),
          #nn.ReLU(),
          #nn.Conv1d(in_channels=32, out_channels=16, kernel_size=4),
          #nn.ReLU(),
          nn.AdaptiveMaxPool1d(1),
          nn.Dropout(0.3),
      )
        
        self.fc_out = nn.Linear(64 + 50, 2)

        self.softmax = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x1, x2):
        x1 = F.normalize(x1)
        x2 = x2.permute(0,2,1)
        x2 = F.normalize(x2)

        h1 = self.dropout(self.fc1(x1))
        h2 = self.conv2(x2).squeeze(2)

        # Concatenate in dim1 (feature dimension)
        x = torch.cat((h1, h2), 1)
        y = self.softmax(self.fc_out(x))
        return y

In [39]:
model = MyModel()
model.to(device)

MyModel(
  (fc1): Linear(in_features=248563, out_features=50, bias=True)
  (conv2): Sequential(
    (0): Conv1d(768, 64, kernel_size=(2,), stride=(1,))
    (1): ReLU()
    (2): AdaptiveMaxPool1d(output_size=1)
    (3): Dropout(p=0.3, inplace=False)
  )
  (fc_out): Linear(in_features=114, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [40]:
X0_train_tensor.shape

torch.Size([710, 248563])

In [41]:
X1_train_tensor.shape

torch.Size([710, 50, 768])

In [42]:
X2_train_tensor.shape

torch.Size([710, 768])

In [43]:
X3_train_tensor.shape

torch.Size([710, 9984])

In [44]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [45]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [46]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [47]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [48]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)


for i in range(100):
  model.train()
  optimizer.zero_grad()
  prediction = model(X0_train_tensor, X1_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X0_valid_tensor, X1_valid_tensor)
  test_prediction = model(X0_test_tensor, X1_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.654686).  Saving model ...
0 0.6927529573440552 0.8267605900764465 0.6546862125396729 0.8876404762268066 0.6610687971115112 0.8923884630203247
Validation loss decreased (0.654686 --> 0.601335).  Saving model ...
1 0.6511163115501404 0.902816891670227 0.6013347506523132 0.9101123809814453 0.6183273792266846 0.8976377844810486
Validation loss decreased (0.601335 --> 0.553407).  Saving model ...
2 0.5905457139015198 0.908450722694397 0.5534073710441589 0.915730357170105 0.5784168243408203 0.8792650699615479
Validation loss decreased (0.553407 --> 0.521757).  Saving model ...
3 0.544152557849884 0.8943662047386169 0.5217569470405579 0.8764045238494873 0.5505358576774597 0.8582677245140076
Validation loss decreased (0.521757 --> 0.501662).  Saving model ...
4 0.5195237398147583 0.8408450484275818 0.5016621947288513 0.8764045238494873 0.5327299237251282 0.8687664270401001
Validation loss decreased (0.501662 --> 0.492484).  Saving model ...
5 0.49151745438

<All keys matched successfully>

In [49]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[1.0, 0.9919354915618896]
[0.9477124214172363, 0.9200000166893005]
[0.9341317415237427, 0.8723404407501221]


In [50]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[0.9982964396476746, 1.0]
[0.9863945841789246, 0.7419354915618896]
[0.9811320900917053, 0.6507936716079712]


In [51]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

0.9985915422439575
0.9438202381134033
0.9265092015266418
