In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from corpus import MovieReviewsCorpus
from models import BiLSTM
from preprocess import MRPipelineTokens
from utils import *
from torchtext.vocab import GloVe
from data import MovieReviewsDataset, get_data
import nltk

In [3]:
# Make sure the NLTK packages are available locally; packages that are already
# present are only reported as up-to-date (see the recorded output of this cell).
# NOTE(review): nothing visible in this notebook uses "subjectivity" directly —
# presumably one of the project modules needs it; confirm.
nltk.download("punkt")
nltk.download("movie_reviews")
nltk.download("subjectivity")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alessandrozinni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/alessandrozinni/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/alessandrozinni/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

In [4]:
# Load the pretrained GloVe "840B" vectors with 300 dimensions through torchtext.
# NOTE(review): torchtext fetches and caches the vectors when they are not found
# locally, so the first run of this cell can be a large, slow download.
global_vectors = GloVe(name='840B', dim=300)

In [5]:

# Build the movie-review corpus on top of the MRPipelineTokens preprocessing pipeline.
mr_pipeline = MRPipelineTokens()
corpus = MovieReviewsCorpus(mr_pipeline)

# Embedding matrix built from the GloVe vectors; it is later handed to the BiLSTM.
# NOTE(review): 300 is presumably the embedding dimension and must then agree
# with the `dim` the GloVe vectors were loaded with — confirm in corpus.py.
embedding_matrix = corpus.get_embedding_matrix(global_vectors, 300)

# NOTE(review): presumably the reviews encoded as vocabulary indices — confirm in corpus.py.
ds = corpus.get_indexed_corpus()

In [11]:
# Check how much of the corpus vocabulary is covered by the GloVe vectors; the
# recorded run reports embeddings for 92.61% of the vocab and 97.15% of all text.
# NOTE(review): `check_coverage` is not imported by name — presumably it comes
# from `from utils import *`, and `oov` presumably holds the uncovered words; confirm.
oov = check_coverage(corpus.vocab, global_vectors)

100%|██████████| 39426/39426 [00:00<00:00, 39590.94it/s]


Found embeddings for 92.61% of vocab
Found embeddings for  97.15% of all text





In [12]:
# Wrap the indexed corpus in a dataset and build the train/test loaders from it.
# NOTE(review): 128 is presumably the batch size — confirm in data.get_data.
# NOTE(review): `collate` is not defined in this notebook — presumably it comes
# from `from utils import *`; confirm.
dataset = MovieReviewsDataset(ds)
train_loader, test_loader = get_data(128, dataset, collate_fn=collate)

In [13]:
def training_step(net, data_loader, optimizer, cost_function, device = 'cuda'):
  """Run one training epoch and report per-sample loss and accuracy.

  Args:
    net: model to optimise. It is called as ``net(inputs)`` and its output is
      reduced with ``max(dim=1)`` to obtain the predicted class of each sample.
    data_loader: iterable yielding ``(inputs, targets)`` tensor pairs.
    optimizer: optimizer that updates ``net``'s parameters.
    cost_function: callable ``(outputs, targets) -> scalar loss tensor``. It is
      assumed to average over the batch (the default reduction of
      ``nn.CrossEntropyLoss``); each batch loss is therefore weighted by the
      batch size before being accumulated.
    device: device the batch tensors are moved to before the forward pass.

  Returns:
    Tuple ``(mean_loss, accuracy)``: the loss averaged over every sample seen
    and the percentage (0-100) of correctly classified samples.

  Raises:
    ZeroDivisionError: if ``data_loader`` yields no samples.
  """
  total_loss = 0.0
  correct = 0
  samples = 0

  net.train()

  for inputs, targets in data_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    batch_size = targets.size(dim=0)

    # Clear gradients before the backward pass so that stale gradients left
    # on the parameters by earlier code cannot leak into this update.
    optimizer.zero_grad()

    outputs = net(inputs)
    loss = cost_function(outputs, targets)
    loss.backward()
    optimizer.step()

    samples += batch_size
    # `loss` is a batch mean: weight it by the batch size so that dividing by
    # `samples` below yields a true per-sample average, even when the last
    # batch is smaller than the others.
    total_loss += loss.item() * batch_size
    _, predicted = outputs.max(dim=1)
    correct += predicted.eq(targets).sum().item()

  return total_loss/samples, (correct/samples)*100

In [14]:
def test_step(net, data_loader, cost_function, device = 'cuda'):
  cumulative_loss = 0
  cumulative_accuracy = 0
  samples = 0

  net.eval()

  with torch.no_grad():

    for batch_idx, (inputs, targets) in enumerate(data_loader):
      inputs = inputs.to(device)
      targets = targets.to(device)
      in_size = targets.size(dim=0)

      outputs = net(inputs)

      loss = cost_function(outputs, targets)

      samples += in_size
      cumulative_loss += loss.item()
      _, predicted = outputs.max(dim=1)

      cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_loss/samples, (cumulative_accuracy/samples)*100


In [15]:
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn as nn

def main(train_loader, test_loader, embedding_matrix, device = "cuda", epochs = 10, learning_rate = 0.001):
  """Train a BiLSTM classifier and return its final test accuracy.

  A fresh ``BiLSTM`` is built from ``embedding_matrix`` and optimised with Adam
  (AMSGrad variant) on a cross-entropy loss. After every epoch the training and
  test metrics are printed.

  Args:
    train_loader: iterable of ``(inputs, targets)`` batches used for training.
    test_loader: iterable of ``(inputs, targets)`` batches used for evaluation.
    embedding_matrix: embedding weights handed to the ``BiLSTM`` constructor.
    device: device the model and the batches are moved to.
    epochs: number of passes over ``train_loader``.
    learning_rate: Adam step size; the default keeps the previously
      hard-coded value of 0.001.

  Returns:
    Test accuracy, as a percentage, measured after the last epoch (or on the
    untrained network when ``epochs`` is 0).
  """
  net = BiLSTM(embedding_matrix, device = device).to(device)

  optimizer = Adam(net.parameters(), learning_rate, betas = (0.9, 0.999), amsgrad=True)

  cost_function = nn.CrossEntropyLoss()

  test_accuracy = None

  for e in range(epochs):
    print(f"epoch {e}:")
    train_loss, train_accuracy = training_step(net, train_loader, optimizer, cost_function, device)
    print(f"Training loss: {train_loss} \n Training accuracy: {train_accuracy}")
    test_loss, test_accuracy = test_step(net, test_loader, cost_function, device)
    print(f"Test loss: {test_loss} \n Test accuracy: {test_accuracy}")
    print("------------------------------------------------------------------")

  # The accuracy measured at the end of the last epoch already describes the
  # final model, so a second pass over the test set would only repeat it.
  # Evaluate here only when no epoch ran at all.
  if test_accuracy is None:
    _, test_accuracy = test_step(net, test_loader, cost_function, device)

  return test_accuracy


In [16]:
# Train and evaluate with main's defaults (device="cuda", epochs=10); the value
# of this cell is the test accuracy returned by main.
# NOTE(review): pass device="cpu" explicitly when running on a machine without CUDA.
main(train_loader, test_loader, embedding_matrix)

: 

: 