In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 4

In [5]:
import pickle
import pandas as pd
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [8]:
errors = []
embeddings_index = {}
f = open('/content/drive/My Drive/Data Master/skip_s50.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
      coefs = np.asarray(values[1:], dtype='float32')
    except:
      errors.append(line)
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 929595 word vectors.


In [9]:
print(len(embeddings_index))
print(len(errors))

929595
4


In [10]:
EMBEDDING_DIM = 50
errors_2 = []
nomatchs = []

X_train_matrix = np.zeros((len(X_train), EMBEDDING_DIM))
X_test_matrix = np.zeros((len(X_test), EMBEDDING_DIM))

In [11]:
for i, x in enumerate(X_train):
  for w in x:
    embedding_vector = embeddings_index.get(inv_word_index[w])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
          X_train_matrix[i] += embedding_vector
        except:
          errors_2.append([word, len(embedding_vector), embedding_vector])
    else:
      nomatchs.append(word)

In [12]:
for i, x in enumerate(X_test):
  for w in x:
    embedding_vector = embeddings_index.get(inv_word_index[w])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
          X_test_matrix[i] += embedding_vector
        except:
          errors_2.append([word, len(embedding_vector), embedding_vector])
    else:
      nomatchs.append(word)

In [13]:
X_train_matrix.shape

(888, 50)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [15]:
input_dim = X_train_matrix.shape[1]
input_dim

50

In [16]:
class LogisticRegression (nn.Module):

  def __init__(self):
    super(LogisticRegression, self).__init__()

    self.fc1 = nn.Linear(input_dim, 2)
                                
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    x = F.normalize(x)
    y = self.softmax(self.fc1(x))

    return y

In [17]:
X_train_tensor = torch.from_numpy(X_train_matrix[train_index]).float()
Y_train_tensor = torch.LongTensor(np.array(Y_train[train_index]))

X_valid_tensor = torch.from_numpy(X_train_matrix[valid_index]).float()
Y_valid_tensor = torch.LongTensor(np.array(Y_train[valid_index]))

X_test_tensor = torch.from_numpy(X_test_matrix).float()
Y_test_tensor = torch.LongTensor(np.array(Y_test))

In [18]:
model = LogisticRegression()

In [19]:
torch.manual_seed(0)
np.random.seed(0)

In [20]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [21]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [22]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [23]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [24]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)


for i in range(1000):
  model.train()
  optimizer.zero_grad()
  prediction = model(X_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X_valid_tensor)
  test_prediction = model(X_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.692374).  Saving model ...
0 0.6943894624710083 0.17323943972587585 0.6923736333847046 0.7247191071510315 0.6932754516601562 0.682414710521698
Validation loss decreased (0.692374 --> 0.690954).  Saving model ...
1 0.6930996775627136 0.6507042050361633 0.6909543871879578 0.8426966071128845 0.6921858787536621 0.847769021987915
Validation loss decreased (0.690954 --> 0.689523).  Saving model ...
2 0.6918512582778931 0.8295774459838867 0.6895228028297424 0.8651685118675232 0.6911998987197876 0.847769021987915
Validation loss decreased (0.689523 --> 0.688084).  Saving model ...
3 0.6905986070632935 0.8521126508712769 0.6880837678909302 0.8539325594902039 0.6902690529823303 0.8451443314552307
Validation loss decreased (0.688084 --> 0.686652).  Saving model ...
4 0.6893416047096252 0.8450704216957092 0.6866522431373596 0.7808988690376282 0.6893642544746399 0.7716535329818726
Validation loss decreased (0.686652 --> 0.685235).  Saving model ...
5 0.688091158

<All keys matched successfully>

In [25]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[0.9546279311180115, 0.6163522005081177]
[0.9772727489471436, 0.6086956262588501]
[0.9331103563308716, 0.5243902206420898]


In [26]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[0.8960817456245422, 0.7967479825019836]
[0.8775510191917419, 0.9032257795333862]
[0.8773584961891174, 0.682539701461792]


In [27]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

0.8788732290267944
0.882022500038147
0.8451443314552307
