In [3]:
%matplotlib inline
import sys
import re
import pathlib
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm.auto import tqdm, trange
tqdm.pandas(desc='Progress')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import ignite
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.contrib.handlers import ProgressBar

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [1]:
!pip install pytorch-ignite

Collecting pytorch-ignite
[?25l  Downloading https://files.pythonhosted.org/packages/35/55/41e8a995876fd2ade29bdba0c3efefa38e7d605cb353c70f3173c04928b5/pytorch_ignite-0.3.0-py2.py3-none-any.whl (103kB)
[K    100% |████████████████████████████████| 112kB 4.7MB/s 
[31mmenpo 0.8.1 has requirement matplotlib<2.0,>=1.4, but you'll have matplotlib 3.0.2 which is incompatible.[0m
[31mmenpo 0.8.1 has requirement pillow<5.0,>=3.0, but you'll have pillow 5.4.0 which is incompatible.[0m
[31mmenpo 0.8.1 has requirement scipy<1.0,>=0.16, but you'll have scipy 1.2.0 which is incompatible.[0m
Installing collected packages: pytorch-ignite
Successfully installed pytorch-ignite-0.3.0
[33mYou are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [14]:
df = pd.read_csv('/floyd/input/imdb/file1.csv', error_bad_lines=False)
df.head(5)

Unnamed: 0.1,Unnamed: 0,sentiment,review1
0,0,1,one of the other reviewers has mentioned that ...
1,1,1,a wonderful little production the filming tech...
2,2,1,i thought this was a wonderful way to spend ti...
3,3,0,basically there is a family where a little boy...
4,4,1,petter mattei is love in the time of money is ...


In [15]:
# split the data into train and validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[['sentiment']])
train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)

train_df.shape, val_df.shape

((40000, 3), (10000, 3))

In [16]:
PAD = 0
UNK = 1

class SentimentDataset(Dataset):
    
    def __init__(self, df, word2idx=None, idx2word=None, max_vocab_size=50000):
        print('Processing Data')
        self.df = df
        print('Removing white space...')
        self.df.review1 = self.df.review1.progress_apply(lambda x: x.strip())
        self.nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])
        if word2idx is None:
            print('Building Counter...')
            word_counter = self.build_counter()
            print('Building Vocab...')
            self.word2idx, self.idx2word = self.build_vocab(word_counter, max_vocab_size)
        else:
            self.word2idx, self.idx2word = word2idx, idx2word
        print('*'*100)
        print('Dataset info:')
        print(f'Number of Tweets: {self.df.shape[0]}')
        print(f'Vocab Size: {len(self.word2idx)}')
        print('*'*100)
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        sent = self.df.review1[idx]
        tokens = [w.text.lower() for w in self.nlp(self.tweet_clean(sent))]
        vec = self.vectorize(tokens, self.word2idx)
        return vec, self.df.sentiment[idx]
    
    def tweet_clean(self, text):
        """Very basic text cleaning. This function can be built upon for
           better preprocessing
        """
        text = re.sub(r'[\s]+', ' ', text) # replace multiple white spaces with single space
#         text = re.sub(r'@[A-Za-z0-9]+', ' ', text) # remove @ mentions
        text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
        return text.strip()
    
    def build_counter(self):
        words_counter = Counter()
        for sent in tqdm(self.df.review1.values):
            words_counter.update(w.text.lower() for w in self.nlp(self.tweet_clean(sent)))
        return words_counter
    
    def build_vocab(self, words_counter, max_vocab_size):
        word2idx = {'<PAD>': PAD, '<UNK>': UNK}
        word2idx.update({word:i+2 for i, (word, count) in tqdm(enumerate(words_counter.most_common(max_vocab_size)))})
        idx2word = {idx: word for word, idx in tqdm(word2idx.items())}
        return word2idx, idx2word
    
    def vectorize(self, tokens, word2idx):
        vec = [word2idx.get(token, UNK) for token in tokens]
        return vec

In [17]:
vocab_size = 100000

In [18]:
train_ds = SentimentDataset(train_df, max_vocab_size=vocab_size)

Processing Data
Removing white space...


HBox(children=(IntProgress(value=0, description='Progress', max=40000, style=ProgressStyle(description_width='…


Building Counter...


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))


Building Vocab...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=92543), HTML(value='')))


****************************************************************************************************
Dataset info:
Number of Tweets: 40000
Vocab Size: 92543
****************************************************************************************************


In [19]:
val_ds = SentimentDataset(val_df, word2idx=train_ds.word2idx, idx2word=train_ds.idx2word)

Processing Data
Removing white space...


HBox(children=(IntProgress(value=0, description='Progress', max=10000, style=ProgressStyle(description_width='…


****************************************************************************************************
Dataset info:
Number of Tweets: 10000
Vocab Size: 92543
****************************************************************************************************


In [32]:

batch_size = 100

In [33]:
def collate_fn(data):
    """This function will be used to pad the review to max length
       in the batch and transpose the batch from 
       batch_size x max_seq_len to max_seq_len x batch_size.
       It will return padded vectors, labels and lengths of each tweets (before padding)
       It will be used in the Dataloader
    """
    data.sort(key=lambda x: len(x[0]), reverse=True)
    lens = [len(sent) for sent, label in data]
    labels = []
    padded_sents = torch.zeros(len(data), max(lens)).long()
    for i, (sent, label) in enumerate(data):
        padded_sents[i,:lens[i]] = torch.LongTensor(sent)
        labels.append(label)
    
    padded_sents = padded_sents.transpose(0,1)
    return padded_sents, torch.tensor(labels).long(), lens

In [34]:
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [35]:
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_fn)

In [36]:
class ConcatPoolingGRUAdaptive(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb_drop = nn.Dropout(0.3)
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, dropout=0.3)
        self.out = nn.Linear(self.n_hidden*3, self.n_out)
        
    def forward(self, seq, lengths):
        self.h = self.init_hidden(seq.size(1))
        embs = self.emb_drop(self.emb(seq))
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)        
        
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(seq.size(1),-1)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(seq.size(1),-1)

        outp = self.out(torch.cat([self.h[-1],avg_pool,max_pool],dim=1))             
        return F.log_softmax(outp, dim=-1) # it will return log of softmax
    
    def init_hidden(self, batch_size):
        return torch.zeros((1, batch_size,self.n_hidden), requires_grad=True).to(device)

In [37]:
# (vocab_size + 2) is because of pad and unk added to the vocab
model_vocab_size = vocab_size + 2
embedding_dim = 100
rnn_hidden = 124
n_out = 2

model = ConcatPoolingGRUAdaptive(model_vocab_size, embedding_dim, rnn_hidden, n_out).to(device) 
optimizer = optim.Adam(model.parameters(), 1e-3)
loss_fn = F.nll_loss

In [38]:
def process_function(engine, batch):
    """Single training loop to be attached to trainer Engine
    """
    model.train()
    optimizer.zero_grad()
    x, y, lens = batch
    x, y = x.to(device), y.to(device)
    y_pred = model(x, lens)
    loss = loss_fn(y_pred, y)
    loss.backward()
    optimizer.step()
    return loss.item(), torch.max(y_pred, dim=1)[1], y


def eval_function(engine, batch):
    """Single evaluator loop to be attached to trainer and evaluator Engine
    """
    model.eval()
    with torch.no_grad():
        x, y, lens = batch
        x, y = x.to(device), y.to(device)
        y_pred = model(x, lens)
        return y_pred, y
    
trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validation_evaluator = Engine(eval_function)

In [39]:
def max_output_transform(output):
    """It convers the predicted ouput probabilties to indexes for accuracy calculation
    """
    y_pred, y = output
    return torch.max(y_pred, dim=1)[1], y

# attach running loss (will be displayed in progess bar)
RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss')

# attach running accuracy (will be displayed in progess bar)
RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]])).attach(trainer, 'acc')

# attach accuracy and loss to train_evaluator
Accuracy(output_transform=max_output_transform).attach(train_evaluator, 'accuracy')
Loss(loss_fn).attach(train_evaluator, 'bce')

# attach accuracy and loss to validation_evaluator
Accuracy(output_transform=max_output_transform).attach(validation_evaluator, 'accuracy')
Loss(loss_fn).attach(validation_evaluator, 'bce')


In [40]:
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss', 'acc'])

In [41]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    """This function will run after each epoch and 
       report the training loss and accuracy (defined above)
    """
    train_evaluator.run(train_dl)
    metrics = train_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_bce = metrics['bce']
    pbar.log_message(
        f'Training Results - Epoch: {engine.state.epoch}  Avg accuracy: {avg_accuracy:.4f} Avg loss: {avg_bce:.4f}')
    
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """This function will run after each epoch and 
       report the validation loss and accuracy (defined above)
    """
    validation_evaluator.run(val_dl)
    metrics = validation_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_bce = metrics['bce']
    pbar.log_message(
        f'Validation Results - Epoch: {engine.state.epoch}  Avg accuracy: {avg_accuracy:.4f} Avg loss: {avg_bce:.4f}')
    pbar.n = pbar.last_print_n = 0

In [42]:
def score_function(engine):
    """EarlyStopping will call this function to check if score improved
    """
    val_loss = engine.state.metrics['bce']
    return -val_loss


early_stopping = EarlyStopping(patience=3, score_function=score_function, trainer=trainer)
validation_evaluator.add_event_handler(Events.COMPLETED, early_stopping)

checkpointer = ModelCheckpoint(
    './models', 
    'text_gru_concat', 
    save_interval=1, 
    n_saved=1, 
    create_dir=True, 
    save_as_state_dict=True)

trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'sentiment': model})

<ignite.engine.engine.RemovableEventHandle at 0x7fb5e9118550>

<ignite.engine.engine.RemovableEventHandle at 0x7fb5e90e2668>

In [43]:
trainer.run(train_dl, max_epochs=10)

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 1  Avg accuracy: 0.8497 Avg loss: 0.3975
Validation Results - Epoch: 1  Avg accuracy: 0.8453 Avg loss: 0.4177


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 2  Avg accuracy: 0.9037 Avg loss: 0.2615
Validation Results - Epoch: 2  Avg accuracy: 0.8852 Avg loss: 0.3213


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 3  Avg accuracy: 0.8990 Avg loss: 0.3431
Validation Results - Epoch: 3  Avg accuracy: 0.8777 Avg loss: 0.4548


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 4  Avg accuracy: 0.9420 Avg loss: 0.1617
Validation Results - Epoch: 4  Avg accuracy: 0.9057 Avg loss: 0.2845


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 5  Avg accuracy: 0.9366 Avg loss: 0.1686
Validation Results - Epoch: 5  Avg accuracy: 0.8962 Avg loss: 0.3143


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 6  Avg accuracy: 0.9575 Avg loss: 0.1166
Validation Results - Epoch: 6  Avg accuracy: 0.9067 Avg loss: 0.2922


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

Training Results - Epoch: 7  Avg accuracy: 0.9701 Avg loss: 0.0847
Validation Results - Epoch: 7  Avg accuracy: 0.9092 Avg loss: 0.3098


State:
	iteration: 2800
	epoch: 7
	epoch_length: 400
	max_epochs: 10
	output: <class 'tuple'>
	batch: <class 'tuple'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: 12