In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
def Accuracy(prediction, observation):
  prediction = prediction[:,1]
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  correct = (prediction_class == observation).float().sum()
  accuracy = correct/prediction_class.shape[0]
  return float(accuracy.cpu())

def Precision(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[prediction_class == label] == observation[prediction_class == label]).float().sum()
    precision = correct/prediction_class[prediction_class == label].shape[0]
    res.append(float(precision.cpu()))
  return res

def Recall(prediction, observation):
  prediction = prediction[:,1]
  res = []
  prediction_class = (torch.reshape(prediction, observation.shape) > 0.5).float()
  for label in [0, 1]:
    correct = (prediction_class[observation == label] == observation[observation == label]).float().sum()
    recall = correct/prediction_class[observation == label].shape[0]
    res.append(float(recall.cpu()))
  return res

In [4]:
n_split = 5

In [5]:
import pickle
import pandas as pd
import numpy as np

In [6]:
with open('/content/drive/My Drive/Data Master/X_train_final', 'rb') as file:
    X_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_train_final', 'rb') as file:
    Y_train = pickle.load(file)

with open('/content/drive/My Drive/Data Master/X_test_final', 'rb') as file:
    X_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/Y_test_final', 'rb') as file:
    Y_test = pickle.load(file)

with open('/content/drive/My Drive/Data Master/word_index_final', 'rb') as file:
    word_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/train_index_final_split_' + str(n_split), 'rb') as file:
    train_index = pickle.load(file)

with open('/content/drive/My Drive/Data Master/valid_index_final_split_' + str(n_split), 'rb') as file:
    valid_index = pickle.load(file)

In [7]:
test_index = [i for i, _ in enumerate(X_test)]

In [8]:
inv_word_index = {ix : w for w, ix in word_index.items()}

In [9]:
import itertools
features_index = {w:ix for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}
inv_features_index = {ix:w for ix, w in enumerate(np.unique(list(itertools.chain.from_iterable(np.array(X_train)[train_index]))))}

In [10]:
X_train_matrix = np.zeros((len(X_train), len(features_index)))
X_test_matrix = np.zeros((len(X_test), len(features_index)))

In [11]:
for i, x in enumerate(X_train):
  for w in x:
    if w in features_index:
      X_train_matrix[i,features_index[w]] += 1

for i, x in enumerate(X_test):
  for w in x:
    if w in features_index:
      X_test_matrix[i,features_index[w]] += 1

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
input_dim = X_train_matrix.shape[1]
input_dim

239262

In [14]:
class LogisticRegression (nn.Module):

  def __init__(self):
    super(LogisticRegression, self).__init__()

    self.fc1 = nn.Linear(input_dim, 2)
                                
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    x = F.normalize(x)
    y = self.softmax(self.fc1(x))

    return y

In [15]:
model = LogisticRegression()

In [16]:
X_train_tensor = torch.from_numpy(X_train_matrix[train_index]).float()
Y_train_tensor = torch.LongTensor(np.array(Y_train[train_index]))

X_valid_tensor = torch.from_numpy(X_train_matrix[valid_index]).float()
Y_valid_tensor = torch.LongTensor(np.array(Y_train[valid_index]))

X_test_tensor = torch.from_numpy(X_test_matrix).float()
Y_test_tensor = torch.LongTensor(np.array(Y_test))

In [17]:
torch.manual_seed(0)
np.random.seed(0)

In [18]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [19]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [20]:
weights = [sum(Y_train)/len(Y_train), 1-sum(Y_train)/len(Y_train)]
class_weights = torch.FloatTensor(weights)
class_weights

tensor([0.1734, 0.8266])

In [21]:
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [22]:
patience = 20
early_stopping = EarlyStopping(patience=patience, verbose=True)

for i in range(100):
  model.train()
  optimizer.zero_grad()
  prediction = model(X_train_tensor)
  loss = criterion(prediction, Y_train_tensor)
  loss.backward()
  optimizer.step()

  accuracy = Accuracy(prediction, Y_train_tensor)

  model.eval()

  val_prediction = model(X_valid_tensor)
  test_prediction = model(X_test_tensor)
  val_loss = criterion(val_prediction, Y_valid_tensor)
  test_loss = criterion(test_prediction, Y_test_tensor)

  val_accuracy = Accuracy(val_prediction, Y_valid_tensor)
  test_accuracy = Accuracy(test_prediction, Y_test_tensor)

  early_stopping(val_loss, model)

  if early_stopping.early_stop:
    print("Early stopping")
    break

  print(i, float(loss.cpu()), accuracy, float(val_loss.cpu()), val_accuracy, float(test_loss.cpu()), test_accuracy)

  scheduler.step(val_loss)

model.load_state_dict(torch.load('checkpoint.pt'))

Validation loss decreased (inf --> 0.684617).  Saving model ...
0 0.6931641101837158 0.7542135119438171 0.6846165657043457 0.9147727489471436 0.6863439679145813 0.8792650699615479
Validation loss decreased (0.684617 --> 0.676210).  Saving model ...
1 0.6808705925941467 0.9578651785850525 0.6762101650238037 0.9147727489471436 0.6796659827232361 0.8792650699615479
Validation loss decreased (0.676210 --> 0.667952).  Saving model ...
2 0.6688099503517151 0.9578651785850525 0.6679520010948181 0.9147727489471436 0.6731255650520325 0.8792650699615479
Validation loss decreased (0.667952 --> 0.659856).  Saving model ...
3 0.6570106148719788 0.9578651785850525 0.6598563194274902 0.9147727489471436 0.6667340993881226 0.8792650699615479
Validation loss decreased (0.659856 --> 0.651937).  Saving model ...
4 0.6454964280128479 0.9578651785850525 0.6519372463226318 0.9147727489471436 0.6605029106140137 0.8792650699615479
Validation loss decreased (0.651937 --> 0.644209).  Saving model ...
5 0.6342886

<All keys matched successfully>

In [23]:
print(Precision(prediction, Y_train_tensor))
print(Precision(val_prediction, Y_valid_tensor))
print(Precision(test_prediction, Y_test_tensor))

[1.0, 0.9841269850730896]
[0.9655172228813171, 0.8064516186714172]
[0.941717803478241, 0.800000011920929]


In [24]:
print(Recall(prediction, Y_train_tensor))
print(Recall(val_prediction, Y_valid_tensor))
print(Recall(test_prediction, Y_test_tensor))

[0.9965986609458923, 1.0]
[0.9589040875434875, 0.8333333134651184]
[0.9654088020324707, 0.6984127163887024]


In [25]:
print(Accuracy(prediction, Y_train_tensor))
print(Accuracy(val_prediction, Y_valid_tensor))
print(Accuracy(test_prediction, Y_test_tensor))

0.9971910119056702
0.9375
0.9212598204612732


In [26]:
with open('/content/drive/My Drive/Data Master/Prediction_BOW_RegressaoLogistica/test_prediction_split' + str(n_split), 'wb') as file:
    pickle.dump(test_prediction.detach().numpy(), file)