In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 4

In [5]:
import pickle
import pandas as pd
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
test_index = [i for i, _ in enumerate(X_test)]

In [8]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [9]:
import itertools
features_index = {w:ix for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}
inv_features_index = {ix:w for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}

In [10]:
errors = []
embeddings_index = {}
f = open('/content/drive/My Drive/Data Master/skip_s50.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
      coefs = np.asarray(values[1:], dtype='float32')
    except:
      errors.append(line)
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 929595 word vectors.


In [11]:
print(len(embeddings_index))
print(len(errors))

929595
4


In [12]:
EMBEDDING_DIM = 50
errors_2 = []
nomatchs = []

X_train_matrix_1 = np.zeros((len(X_train), len(features_index)))
X_test_matrix_1 = np.zeros((len(X_test), len(features_index)))

X_train_matrix_2 = np.zeros((len(X_train), EMBEDDING_DIM))
X_test_matrix_2 = np.zeros((len(X_test), EMBEDDING_DIM))

In [13]:
for i, x in enumerate(X_train):
  for w in x:
    if w in features_index:
      X_train_matrix_1[i,features_index[w]] += 1
      
    embedding_vector = embeddings_index.get(inv_word_index[w])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
          X_train_matrix_2[i] += embedding_vector
        except:
          errors_2.append([word, len(embedding_vector), embedding_vector])
    else:
      nomatchs.append(word)

In [14]:
for i, x in enumerate(X_test):
  for w in x:
    if w in features_index:
      X_test_matrix_1[i,features_index[w]] += 1

    embedding_vector = embeddings_index.get(inv_word_index[w])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
          X_test_matrix_2[i] += embedding_vector
        except:
          errors_2.append([word, len(embedding_vector), embedding_vector])
    else:
      nomatchs.append(word)

In [15]:
X_train_matrix_2.shape

(888, 50)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [17]:
input_dim = X_train_matrix_1.shape[1]
input_dim

231280

In [18]:
input_dim_1 = X_train_matrix_1.shape[1]
input_dim_1

231280

In [19]:
input_dim_2 = EMBEDDING_DIM
input_dim_2

50

In [20]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc0 = nn.Linear(input_dim_1 + input_dim_2, 50)
        self.fc_out = nn.Linear(50, 2)

        self.softmax = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x0, x1):
        x0 = F.normalize(x0)
        x1 = F.normalize(x1)

        x01 = torch.cat((x0, x1), 1)

        h0 = self.fc0(x01)

        # Concatenate in dim1 (feature dimension)
        x = self.dropout(h0)
        y = self.softmax(self.fc_out(x))
        return y

In [21]:
X1_train_tensor = torch.from_numpy(X_train_matrix_1[train_index]).float()
X2_train_tensor = torch.from_numpy(X_train_matrix_2[train_index]).float()
Y_train_tensor = torch.LongTensor(np.array(Y_train)[train_index])

X1_valid_tensor = torch.from_numpy(X_train_matrix_1[valid_index]).float()
X2_valid_tensor = torch.from_numpy(X_train_matrix_2[valid_index]).float()
Y_valid_tensor = torch.LongTensor(np.array(Y_train)[valid_index])

X1_test_tensor = torch.from_numpy(X_test_matrix_1).float()
X2_test_tensor = torch.from_numpy(X_test_matrix_2).float()
Y_test_tensor = torch.LongTensor(np.array(Y_test))

In [22]:
model = MyModel()

In [23]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [24]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [25]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [26]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [27]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)


for i in range(100):
  model.train()
  optimizer.zero_grad()
  prediction = model(X1_train_tensor, X2_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X1_valid_tensor, X2_valid_tensor)
  test_prediction = model(X1_test_tensor, X2_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.671758).  Saving model ...
0 0.6931381821632385 0.8267605900764465 0.6717578768730164 0.898876428604126 0.6764016151428223 0.9002624750137329
Validation loss decreased (0.671758 --> 0.645782).  Saving model ...
1 0.6648614406585693 0.9464788436889648 0.6457824110984802 0.898876428604126 0.6563182473182678 0.9028871655464172
Validation loss decreased (0.645782 --> 0.615974).  Saving model ...
2 0.6307007074356079 0.9507042169570923 0.6159735918045044 0.8876404762268066 0.6336736083030701 0.8976377844810486
Validation loss decreased (0.615974 --> 0.584358).  Saving model ...
3 0.5903650522232056 0.9492957592010498 0.5843579173088074 0.882022500038147 0.6097661852836609 0.8792650699615479
Validation loss decreased (0.584358 --> 0.553554).  Saving model ...
4 0.5500233173370361 0.9521126747131348 0.553554117679596 0.882022500038147 0.5865105986595154 0.8792650699615479
Validation loss decreased (0.553554 --> 0.525896).  Saving model ...
5 0.510828614234

<All keys matched successfully>

In [28]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[1.0, 1.0]
[0.9527027010917664, 0.800000011920929]
[0.939393937587738, 0.843137264251709]


In [29]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[1.0, 1.0]
[0.9591836929321289, 0.774193525314331]
[0.9748427867889404, 0.682539701461792]


In [30]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

1.0
0.9269663095474243
0.9265092015266418
