In [1]:
import os

import numpy as np
import pandas as pd
import spacy
import torch
import torchtext

class CreateDataset(torch.utils.data.Dataset):
    """Build torchtext datasets, vocabularies and iterators for the IMDB CSV splits.

    All work happens in the constructor: the three CSV splits are parsed, the
    text/label vocabularies are built from the training split (attaching
    pretrained vectors to the text vocabulary), and one iterator per split is
    created on the selected device.

    NOTE(review): the class derives from ``torch.utils.data.Dataset`` but does
    not define ``__getitem__``/``__len__``; it is used as a plain container.
    """

    def __init__(self, root_dir, batch_size=32):
        """
        Args:
            root_dir: Directory that holds the
                ``imdb-dataset-sentiment-analysis-in-csv-format`` and
                ``glove6b300dtxt`` folders. A trailing path separator is
                optional.
            batch_size: Batch size handed to every iterator.
        """
        self.root_dir = root_dir
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Kept on the instance so notebook cells can tokenize raw text later.
        self.spacy = spacy.load("en_core_web_sm")

        # NOTE(review): tokens are not lower-cased here; if the vector file is
        # uncased, capitalised tokens will not match a pretrained vector.
        self.TEXT = torchtext.data.Field(sequential=True, tokenize="spacy")
        self.LABEL = torchtext.data.LabelField(dtype=torch.long, sequential=False)

        self.initData()
        self.initEmbed()

        self.makeData()

    def _resolve(self, *parts):
        """Join ``parts`` onto ``root_dir`` with the platform path separator."""
        # os.path.join inserts the separator only when it is missing, so both
        # '../input' and '../input/' resolve to the same location.
        return os.path.join(self.root_dir, *parts)

    def initData(self):
        """Parse Train/Valid/Test CSV files into ``TabularDataset`` objects."""
        df_path = self._resolve('imdb-dataset-sentiment-analysis-in-csv-format')

        self.train_data, self.valid_data, self.test_data = torchtext.data.TabularDataset.splits(
                        path=df_path,
                        train="Train.csv", validation="Valid.csv", test="Test.csv",
                        format="csv",
                        skip_header=True,
                        fields=[('Text', self.TEXT), ('Label', self.LABEL)])

    def initEmbed(self):
        """Build both vocabularies from the training split only.

        The text vocabulary is capped at 20000 entries, drops tokens seen
        fewer than 10 times, and gets vectors loaded from the GloVe text file.
        """
        embed_path = self._resolve('glove6b300dtxt', 'glove.6B.300d.txt')

        self.TEXT.build_vocab(self.train_data,
                         vectors=torchtext.vocab.Vectors(embed_path),
                         max_size=20000,
                         min_freq=10)
        self.LABEL.build_vocab(self.train_data)

    def makeData(self):
        """Create one ``BucketIterator`` per split, keyed on text length."""
        self.train_iterator, self.valid_iterator, self.test_iterator = torchtext.data.BucketIterator.splits(
                        (self.train_data, self.valid_data, self.test_data),
                        sort_key=lambda x: len(x.Text),
                        batch_size=self.batch_size,
                        device=self.device)

    def lengthData(self):
        """Return the number of examples as ``(train, valid, test)``."""
        return len(self.train_data), len(self.valid_data), len(self.test_data)

    def lengthVocab(self):
        """Return ``(text_vocab_size, label_vocab_size)``."""
        return len(self.TEXT.vocab), len(self.LABEL.vocab)

    def freqLABEL(self):
        """Return the label frequency counter collected from the training split."""
        return self.LABEL.vocab.freqs

    def getData(self):
        """Return the ``(train, valid, test)`` iterators."""
        return self.train_iterator, self.valid_iterator, self.test_iterator

    def getEmbeddings(self):
        """Return the vector matrix attached to the text vocabulary."""
        return self.TEXT.vocab.vectors

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = CreateDataset('../input/')
train_iterator, valid_iterator, test_iterator = dataset.getData()
# Tensor.to() returns a new tensor instead of moving the original in place,
# so the result has to be rebound for the variable to actually live on `device`.
pretrained_embeddings = dataset.getEmbeddings().to(device)
# Last expression of the cell: display the embedding matrix.
pretrained_embeddings

100%|█████████▉| 399999/400000 [00:59<00:00, 6721.28it/s]


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class KimCNN(nn.Module):
    """Text classifier: frozen embeddings -> parallel convolutions -> max-over-time -> linear.

    One ``Conv2d`` is created per entry in ``filter_sizes``; each spans the
    full embedding width, so it slides along the token axis only. The
    max-pooled outputs of all convolutions are concatenated and fed to a
    single linear layer. (The class name suggests the Kim 2014 sentence CNN.)
    """

    def __init__(self, input_dim, embed_dim, n_filters, filter_sizes, output_dim,
                 embeddings=None):
        """
        Args:
            input_dim: Vocabulary size (rows of the embedding matrix).
            embed_dim: Width of each embedding vector.
            n_filters: Output channels of every convolution.
            filter_sizes: Iterable of kernel heights (tokens per window).
            output_dim: Number of output logits.
            embeddings: Optional ``(input_dim, embed_dim)`` tensor copied into
                the embedding layer. When omitted, the notebook-level
                ``pretrained_embeddings`` global is used, as before.
        """
        super().__init__()

        if embeddings is None:
            # Backward-compatible default: rely on the notebook global.
            embeddings = pretrained_embeddings

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.embedding.load_state_dict({'weight': embeddings})
        # Embedding weights are kept fixed during training.
        self.embedding.weight.requires_grad = False

        # The kernel width must come from the constructor argument; reading a
        # global here made the class depend on a later notebook cell.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embed_dim))
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: 2-D tensor of token ids laid out ``(seq_len, batch)``; it is
                permuted to batch-first before the embedding lookup. The
                sequence must be at least as long as the largest filter size.
        """
        x = x.permute(1, 0)          # (batch, seq_len)
        x = self.embedding(x)        # (batch, seq_len, embed_dim)
        x = x.unsqueeze(1)           # add the single input channel for Conv2d
        # Each conv collapses the embedding axis to width 1, which is squeezed away.
        convs_x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over every window position, leaving (batch, n_filters) per conv.
        pooled_x = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in convs_x]
        cat_x = torch.cat(pooled_x, dim=1)
        x = self.fc(cat_x)
        return x
        

In [None]:
# Hyperparameters for the convolutional classifier.
embedding_dim = 300      # matches the 300-d vector file loaded by the dataset
n_filters = 100          # output channels per convolution
filters = [3, 4, 5]      # window heights, in tokens
output_dim = 2
input_dim = dataset.lengthVocab()[0]   # size of the text vocabulary

model = KimCNN(
    input_dim=input_dim,
    embed_dim=embedding_dim,
    n_filters=n_filters,
    filter_sizes=filters,
    output_dim=output_dim,
)
# Module.to() moves the parameters and returns the module, which the cell displays.
model.to(device)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    """Embedding -> single RNN -> mean over dim 0 of the RNN output -> linear head."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            hidden_dim: Hidden size of the recurrent layer.
            output_dim: Number of logits produced by the linear head.
        """
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a tensor of token ids to logits.

        The recurrent layer uses its default (non batch-first) layout, so
        ``text`` is treated as ``(seq_len, batch)`` and the average below runs
        over the first (time) axis.
        """
        states, _ = self.rnn(self.embedding(text))
        averaged = states.mean(dim=0)
        return self.linear(averaged)

class LSTM(torch.nn.Module):
    """Frozen embeddings -> (optionally bidirectional) LSTM -> mean over time -> 2-way linear head."""

    def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, dropout = 0.2,
                 bidirectional = False, embeddings = None):
        """
        Args:
            input_dim: Vocabulary size (rows of the embedding matrix).
            embedding_dim: Width of each embedding vector.
            num_layers: Number of stacked LSTM layers.
            hidden_dim: Hidden size per direction.
            dropout: Probability used both between LSTM layers and before the
                linear head.
            bidirectional: Whether the LSTM runs in both directions.
            embeddings: Optional ``(input_dim, embedding_dim)`` tensor copied
                into the embedding layer. When omitted, the notebook-level
                ``pretrained_embeddings`` global is used, as before.
        """
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.dropout = torch.nn.Dropout(p=dropout)

        if embeddings is None:
            # Backward-compatible default: rely on the notebook global.
            embeddings = pretrained_embeddings

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.embedding.load_state_dict({'weight': embeddings})
        # Embedding weights are kept fixed during training.
        self.embedding.weight.requires_grad = False

        # Inter-layer dropout has no effect with a single layer and only
        # triggers a warning there, so it is disabled in that case.
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim,
                                  num_layers=num_layers,
                                  bidirectional=bidirectional,
                                  dropout=dropout if num_layers > 1 else 0.0)

        # The LSTM output comes from the last layer only, so its feature size
        # depends on the number of directions and never on num_layers.
        num_directions = 2 if bidirectional else 1
        self.linear = torch.nn.Linear(hidden_dim * num_directions, 2)

    def forward(self, text):
        """Return logits of shape ``(batch, 2)``.

        The LSTM uses its default (non batch-first) layout, so ``text`` is
        treated as ``(seq_len, batch)`` and the mean runs over the time axis.
        """
        embedded = self.embedding(text)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        lstm_out = torch.mean(lstm_out, 0)
        out = self.linear(self.dropout(lstm_out))
        return out

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters for the recurrent classifiers.
embedding_dim = 300
hidden_dim = 256
num_layers = 2
output_dim = 2
batch_size = 32
input_dim = dataset.lengthVocab()[0]   # size of the text vocabulary

# NOTE(review): this rebinds the global `model`; if a convolutional model was
# created in an earlier cell it is replaced here, and the later training cells
# operate on whichever model was assigned last. Confirm this is intended.
model = RNN(
    input_dim=input_dim,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
# Alternative model: LSTM(input_dim, embedding_dim, num_layers, hidden_dim, bidirectional = True)
model.to(device)

In [5]:
import torch.optim as optim
import torch.nn as nn

# Place the model and the loss on the target device, then build plain SGD
# over every parameter of the model.
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [6]:
import torch.nn.functional as F

def accuracy(preds, y):
    """Return the fraction of rows whose most probable class equals ``y``.

    Args:
        preds: Tensor of per-class scores, one row per example.
        y: Tensor of integer class ids, one per example.

    Returns:
        A 0-d float tensor holding the share of correct predictions.
    """
    probabilities = F.softmax(preds, dim=-1)
    # Only the position of the row maximum matters, not its value.
    _, predicted = probabilities.max(dim=1)
    hits = predicted.eq(y).float()
    return hits.sum() / float(len(hits))

In [7]:
import pyprind

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called on ``batch.Text``; put into training mode here.
        iterator: Sized iterable of batches exposing ``Text`` and ``Label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, batch.Label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over
        ``len(iterator)`` (every batch counts equally, whatever its size).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:

        optimizer.zero_grad()

        predictions = model(batch.Text)
        # Drop a leading singleton axis only from outputs with more than two
        # dimensions. An unconditional squeeze(0) also collapsed a batch of
        # one from (1, C) to (C,), which breaks the loss and accuracy calls.
        if predictions.dim() > 2:
            predictions = predictions.squeeze(0)

        loss = criterion(predictions, batch.Label)

        acc = accuracy(predictions, batch.Label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: Module called on ``batch.Text``; put into eval mode here and
            left in that mode afterwards.
        iterator: Sized iterable of batches exposing ``Text`` and ``Label``.
        criterion: Loss called as ``criterion(predictions, batch.Label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over
        ``len(iterator)`` (every batch counts equally, whatever its size).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in iterator:

            predictions = model(batch.Text)
            # Drop a leading singleton axis only from outputs with more than
            # two dimensions. An unconditional squeeze(0) also collapsed a
            # batch of one from (1, C) to (C,), breaking loss and accuracy.
            if predictions.dim() > 2:
                predictions = predictions.squeeze(0)

            loss = criterion(predictions, batch.Label)

            acc = accuracy(predictions, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [8]:
epochs = 20
best_acc = 0
for epoch in range(epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    # Checkpoint only when validation accuracy improves. The running best has
    # to be recorded, otherwise every epoch overwrites the file and the saved
    # weights are simply those of the last epoch.
    if valid_acc > best_acc:
        best_acc = valid_acc
        torch.save(model.state_dict(), 'weights_kim_sentiment.pth')
    print(f'Epoch: {epoch+1} \t Train Loss: {train_loss:.3f}  \t Train Acc: {train_acc*100:.2f}% \nVal. Loss: {valid_loss:.3f} \t Val. Acc: {valid_acc*100:.2f}% ')


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:48


Epoch: 1 	 Train Loss: 0.577  	 Train Acc: 70.79% 
Val. Loss: 0.422 	 Val. Acc: 81.57% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 2 	 Train Loss: 0.357  	 Train Acc: 85.00% 
Val. Loss: 0.354 	 Val. Acc: 84.63% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 3 	 Train Loss: 0.299  	 Train Acc: 87.44% 
Val. Loss: 0.307 	 Val. Acc: 86.46% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 4 	 Train Loss: 0.265  	 Train Acc: 89.19% 
Val. Loss: 0.294 	 Val. Acc: 87.10% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 5 	 Train Loss: 0.239  	 Train Acc: 90.56% 
Val. Loss: 0.279 	 Val. Acc: 87.86% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 6 	 Train Loss: 0.217  	 Train Acc: 91.56% 
Val. Loss: 0.279 	 Val. Acc: 87.98% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 7 	 Train Loss: 0.196  	 Train Acc: 92.61% 
Val. Loss: 0.266 	 Val. Acc: 88.46% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 8 	 Train Loss: 0.177  	 Train Acc: 93.61% 
Val. Loss: 0.278 	 Val. Acc: 88.08% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 9 	 Train Loss: 0.161  	 Train Acc: 94.36% 
Val. Loss: 0.260 	 Val. Acc: 89.17% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 10 	 Train Loss: 0.145  	 Train Acc: 95.11% 
Val. Loss: 0.262 	 Val. Acc: 89.01% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 11 	 Train Loss: 0.129  	 Train Acc: 95.91% 
Val. Loss: 0.269 	 Val. Acc: 88.83% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 12 	 Train Loss: 0.116  	 Train Acc: 96.54% 
Val. Loss: 0.259 	 Val. Acc: 89.19% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 13 	 Train Loss: 0.102  	 Train Acc: 97.21% 
Val. Loss: 0.267 	 Val. Acc: 88.87% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 14 	 Train Loss: 0.089  	 Train Acc: 97.80% 
Val. Loss: 0.258 	 Val. Acc: 89.33% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 15 	 Train Loss: 0.078  	 Train Acc: 98.29% 
Val. Loss: 0.273 	 Val. Acc: 89.03% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 16 	 Train Loss: 0.069  	 Train Acc: 98.69% 
Val. Loss: 0.264 	 Val. Acc: 89.19% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 17 	 Train Loss: 0.060  	 Train Acc: 99.10% 
Val. Loss: 0.265 	 Val. Acc: 89.19% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:48


Epoch: 18 	 Train Loss: 0.052  	 Train Acc: 99.34% 
Val. Loss: 0.279 	 Val. Acc: 88.71% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 19 	 Train Loss: 0.045  	 Train Acc: 99.58% 
Val. Loss: 0.279 	 Val. Acc: 88.81% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:47


Epoch: 20 	 Train Loss: 0.040  	 Train Acc: 99.72% 
Val. Loss: 0.274 	 Val. Acc: 89.63% 


In [9]:
# Score the model currently held in memory on the test iterator.
test_loss, test_acc = evaluate(model=model, iterator=test_iterator, criterion=criterion)
print(f'test. Loss: {test_loss:.3f} \t test. Acc: {test_acc*100:.2f}% ')

test. Loss: 0.291 	 test. Acc: 89.41% 


In [10]:
# Restore the checkpoint written by the training loop. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on a GPU can
# still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('./weights_kim_sentiment.pth', map_location=device))

# Report loss and accuracy of the restored weights on all three splits.
train_loss, train_acc = evaluate(model, train_iterator, criterion)
print(f'train. Loss: {train_loss:.3f} \t train. Acc: {train_acc*100:.2f}% ')
val_loss, val_acc = evaluate(model, valid_iterator, criterion)
print(f'Val. Loss: {val_loss:.3f} \t Val. Acc: {val_acc*100:.2f}% ')
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'test. Loss: {test_loss:.3f} \t test. Acc: {test_acc*100:.2f}% ')

train. Loss: 0.034 	 train. Acc: 99.87% 
Val. Loss: 0.274 	 Val. Acc: 89.63% 
test. Loss: 0.291 	 test. Acc: 89.41% 


In [11]:
def tokenize_en(sentence):
    """Split ``sentence`` into a list of token strings.

    Uses the tokenizer of the spaCy pipeline stored on the notebook-level
    ``dataset`` object.
    """
    tokens = []
    for tok in dataset.spacy.tokenizer(sentence):
        tokens.append(tok.text)
    return tokens
review = 'It was a very interesting movie. One of the best movies I have ever seen. I would recodmend everyone to watch it'
a = tokenize_en(review)
# Map each token to its vocabulary index.
inputs = [dataset.TEXT.vocab.stoi[word] for word in a]
# Shape (seq_len, 1): a single review laid out the way the models expect.
inputs = torch.tensor(inputs).unsqueeze(1).to(device)

# Set inference mode explicitly instead of relying on an earlier evaluate()
# call having left the model in eval mode, and skip building an autograd graph.
model.eval()
with torch.no_grad():
    preds = model(inputs)
preds, ind = torch.max(F.softmax(preds, dim=-1), 1)

# NOTE(review): `ind` is an index into the LABEL vocabulary, not the raw CSV
# label. This assumes vocabulary index 0 is the negative class -- confirm
# against dataset.LABEL.vocab.itos.
if ind.item() == 0:
    print('negative')
else:
    print('positive')

positive
