In [4]:
import torch
from torchtext import data
# Review text: tokenized with spaCy, batches laid out with the batch dimension first.
TEXT = data.Field(batch_first=True, tokenize='spacy')
# Sentiment label, stored as a float tensor.
LABEL = data.LabelField(dtype=torch.float)

In [5]:
from torchtext import datasets
# Root folder of the dataset. Set the SENTIMENT_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original location.
import os

DATA_ROOT = os.environ.get(
    'SENTIMENT_DATA_DIR', '/Users/ayushranjan/Desktop/Sentiment/data'
)
# The train/test folders are derived from the single root instead of repeating
# the absolute path three times.
IMDB_DIR = os.path.join(DATA_ROOT, 'imdb', 'aclImdb')

train_data, test_data = datasets.IMDB.splits(
    TEXT, LABEL,
    root=DATA_ROOT,
    train=os.path.join(IMDB_DIR, 'train'),
    test=os.path.join(IMDB_DIR, 'test'),
)

In [7]:
import random
seed = 777
# Seed Python's and PyTorch's RNGs explicitly.
random.seed(seed)
torch.manual_seed(seed)

# random.seed() returns None, so the previous `random_state=random.seed(seed)`
# actually passed None; hand over the freshly seeded generator state instead.
# NOTE(review): the validation split is carved out of the IMDB *test* split
# (default split ratio), not out of the training split - confirm this is intended.
# NOTE(review): this cell reassigns `test_data`; re-running it without re-running
# the loading cell splits the already reduced set again.
test_data, valid_data = test_data.split(random_state=random.getstate())

In [13]:
# Both vocabularies are built from the training split only;
# `max_size` caps the number of tokens kept for the text field.
max_words = 50000
TEXT.build_vocab(train_data, max_size=max_words)
LABEL.build_vocab(train_data)

In [8]:
# One bucketed iterator per split, each requested with batch_size=64.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
)

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional sentence classifier producing one raw logit per example.

    Token ids are embedded, fed to four parallel convolutions spanning 2, 3, 4
    and 5 consecutive tokens (100 filters each), max-pooled over the sequence
    dimension, concatenated, passed through dropout (p=0.5) and mapped to a
    single output by a linear layer. No sigmoid is applied here.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        pad_idx: index of the padding token; forwarded to ``nn.Embedding`` as
            ``padding_idx`` and reused to right-pad inputs that are too short.
    """

    def __init__(self, vocab_size, embedding_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # The attribute names conv_0..conv_3 (and their creation order) are kept
        # unchanged: they are the state_dict keys of checkpoints saved from this class.
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(2, embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(3, embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(4, embedding_dim))
        self.conv_3 = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(5, embedding_dim))
        self.fc = nn.Linear(4 * 100, 1)
        self.dropout = nn.Dropout(0.5)

    def _conv_branches(self):
        """Return the parallel convolution layers in concatenation order."""
        return (self.conv_0, self.conv_1, self.conv_2, self.conv_3)

    def forward(self, text):
        """Compute logits for a batch of token ids.

        Args:
            text: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1) holding one logit per example.
        """
        branches = self._conv_branches()

        # A convolution cannot slide over fewer tokens than its kernel height,
        # so right-pad short inputs with the padding index up to the tallest kernel.
        min_len = max(conv.kernel_size[0] for conv in branches)
        shortfall = min_len - text.shape[1]
        pad_idx = self.embedding.padding_idx
        if shortfall > 0 and pad_idx is not None:
            filler = text.new_full((text.shape[0], shortfall), pad_idx)
            text = torch.cat((text, filler), dim=1)

        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): one input channel for Conv2d.
        embedded = self.embedding(text).unsqueeze(1)

        pooled = []
        for conv in branches:
            # Each kernel spans the full embedding width, so dim 3 has size 1.
            conved = F.relu(conv(embedded).squeeze(3))
            # Max over every remaining sequence position -> (batch, out_channels).
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [15]:
# Hyper-parameters derived from the text field's vocabulary.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
EMBEDDING_DIM = 100
vocab_size = len(TEXT.vocab)

model = CNN(vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX)

In [16]:
import torch.optim as optim
# Adam with its default settings over every model parameter.
optimizer = optim.Adam(params=model.parameters())
# The model returns raw logits, hence the "with logits" variant of binary cross-entropy.
loss_fun = nn.BCEWithLogitsLoss()

In [23]:
def binary_accuracy(preds, y):
    """Return the accuracy of logits ``preds`` against labels ``y`` as a percentage.

    The logits go through a sigmoid and are rounded before being compared
    element-wise with ``y``. The result is a tensor equal to
    100 * (number of matches) / len(preds).
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return (hits.sum() / len(hits)) * 100

In [24]:
def train(model, iterator, optimizer, loss_fun):
    """Run one optimisation pass over ``iterator``.

    For every batch the gradients are reset, the loss is back-propagated and
    the optimizer takes a step.

    Returns:
        (loss, accuracy): the per-batch loss and per-batch accuracy, each
        averaged over ``len(iterator)``.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # (batch, 1) -> (batch,) so the logits line up with the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = loss_fun(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [25]:
def evaluate(model, iterator, loss_fun):
    """Score ``model`` on ``iterator`` without updating its weights.

    The model is switched to evaluation mode and gradient tracking is disabled.

    Returns:
        (loss, accuracy): the per-batch loss and per-batch accuracy, each
        averaged over ``len(iterator)``.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # (batch, 1) -> (batch,) so the logits line up with the labels.
            logits = model(batch.text).squeeze(1)
            running_loss += loss_fun(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [26]:
import time

def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps (in seconds) into
    whole minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds) as integers.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [27]:
# Number of passes over the training data.
n = 5
best_valid_loss = float('inf')

for epoch in range(n):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fun)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fun)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model_minor_FINAL.pt')

    print("epoch :{}".format(epoch + 1))
    print("Time : {} mins {} secs".format(epoch_mins, epoch_secs))
    print("Training loss : {:.2f}".format(train_loss))
    print("Validation loss : {:.2f}".format(valid_loss))
    print("TRAINING ACCURACY : {:.2f}".format(train_acc))
    print("VALIDATION  ACCURACY : {:.2f}".format(valid_acc))
    print("**********************************************************************")

epoch :1
Time : 23 mins 5 secs
Training loss : 0.62
Validation loss : 0.45
TRAINING ACCURACY : 65.76
VALIDATION  ACCURACY : 78.49
**********************************************************************
epoch :2
Time : 22 mins 54 secs
Training loss : 0.47
Validation loss : 0.38
TRAINING ACCURACY : 77.64
VALIDATION  ACCURACY : 83.57
**********************************************************************
epoch :3
Time : 23 mins 2 secs
Training loss : 0.41
Validation loss : 0.34
TRAINING ACCURACY : 81.63
VALIDATION  ACCURACY : 85.85
**********************************************************************
epoch :4
Time : 23 mins 0 secs
Training loss : 0.35
Validation loss : 0.33
TRAINING ACCURACY : 84.57
VALIDATION  ACCURACY : 85.71
**********************************************************************
epoch :5
Time : 23 mins 6 secs
Training loss : 0.29
Validation loss : 0.28
TRAINING ACCURACY : 87.54
VALIDATION  ACCURACY : 88.68
*****************************************************************

In [29]:
# Restore the checkpoint written by the training loop (saved whenever the
# validation loss improved) before scoring the test iterator.
model.load_state_dict(torch.load('model_minor_FINAL.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fun)

print("Test loss : {:.2f}".format(test_loss))
print("TEST ACCURACY :{:.2f}".format(test_acc))

Test loss : 0.29
TEST ACCURACY :87.39


In [28]:
import spacy
# Load the spaCy model registered as 'en'; predict_sentiment uses only its tokenizer.
nlp = spacy.load('en')

def predict_sentiment(model, sentence, min_len = 5):
    """Return the sigmoid score the model assigns to a single sentence.

    Args:
        model: trained classifier that maps a (1, seq_len) tensor of token ids
            to a single logit.
        sentence: raw text to classify.
        min_len: minimum number of tokens fed to the model; shorter sentences
            are right-padded with the text field's pad token. The default of 5
            matches the tallest convolution kernel of the CNN in this notebook.

    Returns:
        A Python float in [0, 1]: the sigmoid of the model's logit.
    """
    model.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]

    # Use the field's own pad token (the same one PAD_IDX is looked up from)
    # rather than a hard-coded '<pad>' literal.
    shortfall = min_len - len(tokens)
    if shortfall > 0:
        tokens += [TEXT.pad_token] * shortfall

    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    # Add the batch dimension: (seq_len,) -> (1, seq_len).
    text = torch.LongTensor(indexed).unsqueeze(0)

    # Run on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        text = text.to(first_param.device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(text))
    return prediction.item()

In [30]:
# A rounded score of 0 is reported as negative, anything else as positive.
print(
    "Negative"
    if round(predict_sentiment(model, "This film was a disaster")) == 0
    else "Positive"
)

Negative


In [31]:
# A rounded score of 0 is reported as negative, anything else as positive.
print(
    "Negative"
    if round(predict_sentiment(model, "This film was a good")) == 0
    else "Positive"
)

Positive
