In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['traindf.csv', 'testdf.csv', 'valdf.csv']


In [2]:
 import os, sys
import re
import string
import pathlib
import random
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torchtext
from torchtext import data
from torchtext import vocab

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    text = re.sub(r'https?:/\/\S+', ' ', text)
    text = [lemmatizer.lemmatize(token) for token in text.split(' ')]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text.strip()
#for text in df.review:
#    print(clean_text(text))
#df.review = df.review.progress_apply(lambda x: clean_text(x))

In [4]:
%%time
nlp = spacy.load('en')
def tokenizer(s): return [w.text.lower() for w in nlp(clean_text(s))]

CPU times: user 288 ms, sys: 60 ms, total: 348 ms
Wall time: 489 ms


In [5]:
txt_field = data.Field(sequential=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
label_field = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

In [6]:
train_val_fields = [
    ('review', txt_field),
    ('sentiment', label_field),
]

In [7]:
%%time
trainds, valds, testds = data.TabularDataset.splits(path='../input/', format='csv',
                                            train='traindf.csv', validation='valdf.csv', test='testdf.csv',
                                            fields=train_val_fields, skip_header=True)

CPU times: user 18min 4s, sys: 4.04 s, total: 18min 8s
Wall time: 18min 9s


In [8]:
%%time
txt_field.build_vocab(trainds, valds, testds, max_size=100000, vectors="glove.6B.100d")
label_field.build_vocab(trainds,testds)

.vector_cache/glove.6B.zip: 862MB [01:24, 10.2MB/s]                           
100%|█████████▉| 399503/400000 [00:15<00:00, 26462.69it/s]

CPU times: user 37.2 s, sys: 7.47 s, total: 44.7 s
Wall time: 2min 14s


In [9]:
txt_field.vocab.vectors[txt_field.vocab.stoi["the"]]

tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  0.8278,  0.27

In [10]:
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)

In [11]:
vocab_size = len(txt_field.vocab)
embedding_dim = 100
n_hidden = 64
n_out = 2

In [12]:
class Network(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.bidirectional = bidirectional
        
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        if bidirectional:
            self.out = nn.Linear(self.n_hidden*2*2, self.n_out)
        else:
            self.out = nn.Linear(self.n_hidden*2, self.n_out)
        
    def forward(self, seq, lengths):
        bs = seq.size(1)
        self.h = self.init_hidden(bs)
        seq = seq.transpose(0,1)
        embs = self.emb(seq)
        embs = embs.transpose(0,1)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)        
        
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)        
        outp = self.out(torch.cat([avg_pool,max_pool],dim=1))
        return F.log_softmax(outp)
    
    def init_hidden(self, batch_size): 
        if self.bidirectional:
            return torch.zeros((2,batch_size,self.n_hidden)).to(device)
        else:
            return torch.zeros((1,batch_size,self.n_hidden)).cuda().to(device)

In [13]:
def fit(model, train_dl, val_dl, loss_fn, opt, epochs=3, tollerance=5):
    num_batch = len(train_dl)
    from_valacc = 0
    path_to_best_model = "../network.py"
    for epoch in tnrange(epochs):      
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0          
        
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for (X,lengths),y in t:
            t.set_description(f'Epoch {epoch}')
            lengths = lengths.cpu().numpy()
            
            opt.zero_grad()
            pred = model(X, lengths)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            
            t.set_postfix(loss=loss.item())
            pred_idx = torch.max(pred, dim=1)[1]
            
            y_true_train += list(y.cpu().data.numpy())
            y_pred_train += list(pred_idx.cpu().data.numpy())
            total_loss_train += loss.item()
            
        train_acc = accuracy_score(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)
        
        y_true_val = list()
        y_pred_val = list()
        total_loss_val = 0
        for (X,lengths),y in tqdm_notebook(val_dl, leave=False):
            pred = model(X, lengths.cpu().numpy())
            loss = loss_fn(pred, y)
            pred_idx = torch.max(pred, 1)[1]
            y_true_val += list(y.cpu().data.numpy())
            y_pred_val += list(pred_idx.cpu().data.numpy())
            total_loss_val += loss.item()
        valacc = accuracy_score(y_true_val, y_pred_val)
        valloss = total_loss_val/len(valdl)
        if from_valacc>valacc:
            tollerance -=1
        if from_valacc<valacc:
            from_valacc=valacc
            tollerance = 5
            torch.save(model.state_dict(),path_to_best_model)
        print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {valloss:.4f} val_acc: {valacc:.4f} | tollerance: {tollerance:}')
        if tollerance<=0:
            break
    model.load_state_dict(torch.load(path_to_best_model))
    print("Training stopped")

In [14]:
traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds), batch_sizes=(512,512), sort_key=lambda x: len(x.review), device=device, sort_within_batch=True, repeat=False)
train_batch_it = BatchGenerator(traindl, 'review', 'sentiment')
val_batch_it = BatchGenerator(valdl, 'review', 'sentiment')
testdl = data.BucketIterator(dataset=testds, batch_size=512, sort_key=lambda x: len(x.review), device=device, sort_within_batch=True, repeat=False)
test_batch_it = BatchGenerator(testdl, 'review', 'sentiment')

In [15]:
m = Network(vocab_size, embedding_dim, n_hidden, n_out, trainds.fields['review'].vocab.vectors).to(device)
opt = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)

fit(model=m, train_dl=train_batch_it, val_dl=val_batch_it, loss_fn=F.nll_loss, opt=opt, epochs=50)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 0: train_loss: 0.6205 train_acc: 0.6515 | val_loss: 0.5293 val_acc: 0.7396 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

100%|█████████▉| 399503/400000 [00:30<00:00, 26462.69it/s]

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 1: train_loss: 0.4528 train_acc: 0.7899 | val_loss: 0.4273 val_acc: 0.8085 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 2: train_loss: 0.3884 train_acc: 0.8274 | val_loss: 0.3797 val_acc: 0.8337 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 3: train_loss: 0.3584 train_acc: 0.8432 | val_loss: 0.3507 val_acc: 0.8510 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 4: train_loss: 0.3370 train_acc: 0.8536 | val_loss: 0.3360 val_acc: 0.8595 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 5: train_loss: 0.3148 train_acc: 0.8641 | val_loss: 0.3256 val_acc: 0.8656 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 6: train_loss: 0.3022 train_acc: 0.8716 | val_loss: 0.3170 val_acc: 0.8669 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 7: train_loss: 0.2936 train_acc: 0.8758 | val_loss: 0.5023 val_acc: 0.7806 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 8: train_loss: 0.3104 train_acc: 0.8672 | val_loss: 0.3083 val_acc: 0.8720 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 9: train_loss: 0.2731 train_acc: 0.8867 | val_loss: 0.3016 val_acc: 0.8715 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 10: train_loss: 0.2754 train_acc: 0.8838 | val_loss: 0.2963 val_acc: 0.8762 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 11: train_loss: 0.2614 train_acc: 0.8907 | val_loss: 0.3006 val_acc: 0.8748 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 12: train_loss: 0.2559 train_acc: 0.8946 | val_loss: 0.2892 val_acc: 0.8782 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 13: train_loss: 0.2468 train_acc: 0.8999 | val_loss: 0.3016 val_acc: 0.8716 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 14: train_loss: 0.2408 train_acc: 0.9018 | val_loss: 0.2824 val_acc: 0.8844 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 15: train_loss: 0.2575 train_acc: 0.8921 | val_loss: 0.2828 val_acc: 0.8845 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 16: train_loss: 0.2316 train_acc: 0.9066 | val_loss: 0.2786 val_acc: 0.8859 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 17: train_loss: 0.2239 train_acc: 0.9101 | val_loss: 0.2986 val_acc: 0.8732 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 18: train_loss: 0.2179 train_acc: 0.9127 | val_loss: 0.2786 val_acc: 0.8875 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 19: train_loss: 0.2268 train_acc: 0.9083 | val_loss: 0.2925 val_acc: 0.8768 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 20: train_loss: 0.2059 train_acc: 0.9180 | val_loss: 0.2815 val_acc: 0.8852 | tollerance: 3


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 21: train_loss: 0.1960 train_acc: 0.9224 | val_loss: 0.2739 val_acc: 0.8866 | tollerance: 2


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 22: train_loss: 0.1888 train_acc: 0.9250 | val_loss: 0.2817 val_acc: 0.8886 | tollerance: 5


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 23: train_loss: 0.1907 train_acc: 0.9235 | val_loss: 0.2993 val_acc: 0.8796 | tollerance: 4


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 24: train_loss: 0.1808 train_acc: 0.9293 | val_loss: 0.2831 val_acc: 0.8836 | tollerance: 3


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 25: train_loss: 0.1847 train_acc: 0.9267 | val_loss: 0.2856 val_acc: 0.8786 | tollerance: 2


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 26: train_loss: 0.1887 train_acc: 0.9258 | val_loss: 0.3363 val_acc: 0.8646 | tollerance: 1


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch 27: train_loss: 0.1732 train_acc: 0.9338 | val_loss: 0.2831 val_acc: 0.8831 | tollerance: 0
Training stopped


In [16]:
from sklearn import metrics
y_true_test = list()
y_pred_test = list()
for (X,lengths),y in tqdm_notebook(test_batch_it, leave=False):
    pred = m(X, lengths.cpu().numpy())
    loss = F.nll_loss(pred, y)
    pred_idx = torch.max(pred, 1)[1]
    y_true_test += list(y.cpu().data.numpy())
    y_pred_test += list(pred_idx.cpu().data.numpy())
#testacc = accuracy_score(y_true_val, y_pred_val)
print(metrics.classification_report(y_pred_test,y_true_test))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      4915
           1       0.90      0.88      0.89      5085

   micro avg       0.89      0.89      0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

