<a href="https://colab.research.google.com/github/ahhCrap/Deepfake_Review_Detection/blob/master/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [0]:
import nltk 
import random

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
from tqdm import tqdm

In [0]:
import torch
import torch.nn as nn

from torch import optim
from torchtext import data, datasets

## Configuration

In [0]:
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
device

device(type='cuda', index=0)

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Utils

In [0]:
def binary_accuracy(preds, y):
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds==y).float() #convert into float for division 
  acc = correct.sum() / len(correct)
  return acc

# Sentiment Analysis

## Datset Generation

In [0]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [0]:
TEXT = data.Field(tokenize='spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [0]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 41.1MB/s]


In [0]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [0]:
MAX_VOCAB_SIZE = 16384  # 2^14

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
100%|█████████▉| 398893/400000 [00:15<00:00, 27230.05it/s]

In [0]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device)

## Architecture

In [0]:
class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
               n_layers, bidirectional, dropout, pad_idx):
    super(RNN, self).__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
    self.rnn = nn.LSTM(embedding_dim, 
                       hidden_dim, 
                       num_layers=n_layers, 
                       bidirectional=bidirectional, 
                       dropout=dropout)
    self.linear = nn.Linear(2*hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
      
  def forward(self, x, text_lengths):
    embedded = self.dropout(self.embedding(x))
    
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)    
    packed_output, (hidden, cell) = self.rnn(packed_embedded)
    
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))

    return self.linear(hidden)
    
  def train(self, iterator):
    
    for batch in iterator:
      text, text_lengths = batch.text
      self.opt.zero_grad()
      
      prediction = self.forward(text, text_lengths).squeeze(1)
      self.loss(prediction, batch.label).backward()
      self.opt.step()
      
  def compile(self):
    self.opt = optim.Adam(self.parameters())
    self.loss = nn.BCEWithLogitsLoss()
  
  def evaluate(self, iterator):
    epoch_loss = 0
    epoch_acc = 0
        
    with torch.no_grad():
    
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = self.forward(text, text_lengths).squeeze(1)
            
            loss = self.loss(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

# MAIN

## Training

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)
model.compile()

In [0]:
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([16386, 100])

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [0]:
model = model.to(device)
model.loss = model.loss.to(device)

In [0]:
EPOCHS = 25

In [0]:
for epoch in tqdm(range(EPOCHS)):
  model.train(train_iterator)


100%|█████████▉| 398893/400000 [00:30<00:00, 27230.05it/s]
  4%|▍         | 1/25 [00:33<13:23, 33.49s/it][A
  8%|▊         | 2/25 [01:07<12:56, 33.74s/it][A
 12%|█▏        | 3/25 [01:43<12:37, 34.45s/it][A
 16%|█▌        | 4/25 [02:19<12:12, 34.90s/it][A
 20%|██        | 5/25 [02:55<11:43, 35.16s/it][A
 24%|██▍       | 6/25 [03:31<11:12, 35.40s/it][A
 28%|██▊       | 7/25 [04:07<10:39, 35.54s/it][A
 32%|███▏      | 8/25 [04:43<10:06, 35.70s/it][A
 36%|███▌      | 9/25 [05:19<09:32, 35.76s/it][A
 40%|████      | 10/25 [05:55<08:57, 35.83s/it][A
 44%|████▍     | 11/25 [06:31<08:21, 35.83s/it][A
 48%|████▊     | 12/25 [07:07<07:45, 35.82s/it][A
 52%|█████▏    | 13/25 [07:43<07:10, 35.86s/it][A
 56%|█████▌    | 14/25 [08:18<06:34, 35.86s/it][A
 60%|██████    | 15/25 [08:54<05:58, 35.88s/it][A
 64%|██████▍   | 16/25 [09:30<05:22, 35.87s/it][A
 68%|██████▊   | 17/25 [10:06<04:47, 35.88s/it][A
 72%|███████▏  | 18/25 [10:42<04:11, 35.88s/it][A
 76%|███████▌  | 19/25 [11:18<0

## Evaluating

In [0]:
model.evaluate(test_iterator)

(0.39924435729108504, 0.8653532609610302)

In [0]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [0]:
predict_sentiment(model, "This film is so fucking shiete")

0.22272369265556335

## Save and Load

In [0]:
model_dir = '/content/drive/My Drive/POLIMI/Thesis/Data/Models/'
model_name = 'SA_RNN_' + str(EPOCHS)
model_path = model_dir + model_name
model_path

'/content/drive/My Drive/POLIMI/Thesis/Data/Models/SA_RNN_25'

In [0]:
torch.save(model, model_path)

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
loaded_model = torch.load(model_path)

In [0]:
loaded_model.evaluate(test_iterator)

(0.3989354993414391, 0.8651774296980075)

In [0]:
predict_sentiment(loaded_model, "This film is so fucking shiete")

0.33202943205833435

In [0]:
model_dir = '/content/drive/My Drive/POLIMI/Thesis/Data/Models/'
stoi_name = 'S2I'
stoi_path = model_dir + stoi_name
stoi_path

'/content/drive/My Drive/POLIMI/Thesis/Data/Models/S2I'

In [0]:
torch.save(TEXT.vocab.stoi, stoi_path)