In [None]:
import os

import numpy as np
import pandas as pd
import torch

from dataloader import *
from model import *
from train import *

# Data Preprocessing

Ensure an unbiased evaluation by giving the validation and test splits an equal number of examples from each class.

In [None]:
# Resolve the working directory with the standard library instead of the
# IPython-only `%pwd` magic, so the cell also runs as plain Python.
PATH = os.getcwd()
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Fixed seed so the shuffle -- and therefore the train/valid/test split that is
# sliced from it -- is identical on every "Restart & Run All".
SHUFFLE_SEED = 42

main_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
# frac=1 draws every row exactly once, i.e. a full shuffle of the frame.
main_df = main_df.sample(frac=1, random_state=SHUFFLE_SEED)
# Keep only the question text and its `target` label.
main_df = main_df[["question_text", "target"]]

In [None]:
# Partition the rows by label: `o_class` keeps target == 0, `l_class` keeps target == 1.
o_class = main_df[main_df["target"] == 0]
l_class = main_df[main_df["target"] == 1]

In [None]:
def _three_way_split(frame):
    """Slice ``frame`` by position into ``(test, valid, train)`` parts.

    The first 10000 rows go to test, the next 10000 to validation and every
    remaining row to training.
    """
    return frame.iloc[:10000], frame.iloc[10000:20000], frame.iloc[20000:]


# Apply the same positional split to each class separately.
test_o, valid_o, train_o = _three_way_split(o_class)
test_l, valid_l, train_l = _three_way_split(l_class)

In [None]:
# Stack the two per-class slices row-wise to form each split.
train, valid, test = (
    pd.concat(pair)
    for pair in ((train_o, train_l), (valid_o, valid_l), (test_o, test_l))
)

In [None]:
# Create the output directory from Python rather than shelling out: this is
# portable and, thanks to exist_ok, does not fail when the cell is re-run.
os.makedirs("inputs", exist_ok=True)

### Saving the preprocessed dataset 
The splits are written to disk so the preprocessing does not have to be recomputed on every run.

In [None]:
# Write each split under PATH/inputs; index=False keeps the row index out of the files.
train.to_csv(path_or_buf=os.path.join(PATH, "inputs/train.csv"), index=False)
valid.to_csv(path_or_buf=os.path.join(PATH, "inputs/valid.csv"), index=False)
test.to_csv(path_or_buf=os.path.join(PATH, "inputs/test.csv"), index=False)

In [None]:
# Drop every intermediate frame now that the splits have been written out,
# so the memory they hold can be reclaimed.
del (
    main_df, o_class, l_class,
    train_o, train_l, train,
    valid_o, valid_l, valid,
    test_o, test_l, test,
)

## Loading the Dataset

**Field**<br>
- Defines a datatype together with instructions for converting to Tensor.
- It holds a Vocab object that defines the set of possible values for elements of the field and their corresponding numerical representations.
- The Field object also holds other parameters relating to how a datatype should be numericalized, such as a tokenization method and the kind of Tensor that should be produced.

**Label Field**<br>
- A shallow wrapper around a standard field designed to hold labels for a classification task.

**Tokenizer**<br>
- During processing, _spaCy_ first tokenizes the text, i.e. segments it into words, punctuation and so on. This is done by applying rules specific to each language.

- ![Tokenizer](https://spacy.io/tokenization-9b27c0f6fe98dcb26239eba4d3ba1f3d.svg)

**Build Vocab**<br>
- Constructs the Vocab object for this field from the tokens in the dataset that occur with at least a minimum frequency.
- Converts the vocabulary to numerical representations and stores them, using the pretrained weights.

**Pretrained Weights: [GloVe Embeddings](https://nlp.stanford.edu/projects/glove/)**<br>
- Training is performed on aggregated global word-word co-occurrence statistics from a corpus.
- The resulting representations showcase interesting linear substructures of the word vector space.
    * The Euclidean distance between two word vectors provides an effective method for measuring the linguistic or semantic similarity of the corresponding words.



**Bucket Iterator**<br>
- Defines an iterator that batches examples of similar lengths together.
- Minimizes the amount of padding needed while producing freshly shuffled batches for each new epoch.


In [None]:
import torch
import os
import spacy
import nltk
import torchtext

class CreateDataset(torch.utils.data.Dataset):
    """Prepare torchtext fields, datasets, vocabularies and bucket iterators.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from ``<PATH>/inputs/``
    and word vectors from
    ``<PATH>/embeddings/glove.840B.300d/glove.840B.300d.txt``. All of the work
    happens eagerly inside ``__init__``.
    """

    def __init__(self, PATH, batch_size=32):
        """Build everything needed to iterate over the three splits.

        Args:
            PATH: Root directory that contains ``inputs/`` and ``embeddings/``.
            batch_size: Batch size handed to the bucket iterators.
        """
        self.PATH = PATH
        self.batch_size = batch_size
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        # Kept as an attribute; it is not referenced again inside this class.
        self.spacy = spacy.load("en_core_web_sm")

        legacy_data = torchtext.legacy.data
        self.TEXT = legacy_data.Field(sequential=True, tokenize="spacy")
        self.LABEL = legacy_data.LabelField(dtype=torch.long, sequential=False)

        # Order matters: the vocabularies need the datasets, and the
        # iterators are built from the datasets afterwards.
        for build_step in (self.initData, self.initEmbed, self.makeData):
            build_step()

    def initData(self):
        """Load the three CSV splits as ``TabularDataset`` objects."""
        column_fields = [('Text', self.TEXT), ('Label', self.LABEL)]
        splits = torchtext.legacy.data.TabularDataset.splits(
            path=os.path.join(self.PATH, 'inputs/'),
            train="train.csv",
            validation="valid.csv",
            test="test.csv",
            format="csv",
            skip_header=True,
            fields=column_fields,
        )
        self.train_data, self.valid_data, self.test_data = splits

    def initEmbed(self):
        """Build the TEXT and LABEL vocabularies from the training split.

        The TEXT vocabulary is built with ``max_size=20000`` and
        ``min_freq=10`` and is given the vectors loaded from the GloVe file.
        """
        glove_file = os.path.join(self.PATH, "embeddings/glove.840B.300d/glove.840B.300d.txt")
        glove_vectors = torchtext.vocab.Vectors(glove_file)

        self.TEXT.build_vocab(
            self.train_data,
            vectors=glove_vectors,
            max_size=20000,
            min_freq=10,
        )
        self.LABEL.build_vocab(self.train_data)

    def makeData(self):
        """Create one ``BucketIterator`` per split, keyed on text length."""

        def text_length(example):
            return len(example.Text)

        iterators = torchtext.legacy.data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            sort_key=text_length,
            batch_size=self.batch_size,
            device=self.device,
        )
        self.train_iterator, self.valid_iterator, self.test_iterator = iterators

    def lengthData(self):
        """Return the sizes of the train, validation and test datasets."""
        return tuple(
            len(split)
            for split in (self.train_data, self.valid_data, self.test_data)
        )

    def lengthVocab(self):
        """Return ``(len(TEXT vocab), len(LABEL vocab))``."""
        text_vocab, label_vocab = self.TEXT.vocab, self.LABEL.vocab
        return len(text_vocab), len(label_vocab)

    def freqLABEL(self):
        """Return the ``freqs`` attribute of the LABEL vocabulary."""
        label_vocab = self.LABEL.vocab
        return label_vocab.freqs

    def getData(self):
        """Return the train, validation and test iterators as a tuple."""
        return (self.train_iterator, self.valid_iterator, self.test_iterator)

    def getEmbeddings(self):
        """Return the vectors attached to the TEXT vocabulary."""
        text_vocab = self.TEXT.vocab
        return text_vocab.vectors

In [None]:
# Build the fields, datasets, vocabularies and iterators rooted at PATH,
# using CreateDataset's default batch size of 32.
dataset = CreateDataset(PATH)

In [None]:
# Fetch the train / validation / test bucket iterators prepared by the dataset object.
train_iterator, valid_iterator, test_iterator = dataset.getData()

In [None]:
# Vectors attached to the TEXT vocabulary (getEmbeddings returns TEXT.vocab.vectors).
pretrained_embeddings = dataset.getEmbeddings()

In [None]:
# Fixed architecture settings.
embedding_dim = 300  # presumably matches the glove.840B.300d vectors -- confirm
hidden_dim = 374
num_layers = 2

# Size of the TEXT vocabulary, i.e. the number of rows the embedding table needs.
input_dim = dataset.lengthVocab()[0]

# NOTE(review): neither of these two is referenced by the cells shown in this notebook.
output_dim = 2
batch_size = 32

# Initializing The Model
### Long-Short Term Memory Networks
Long Short-Term Memory networks, or __LSTMs__, are a special kind of RNN capable of learning long-term dependencies. LSTMs are explicitly designed to avoid the long-term dependency problem.

**Recurrent Neural Networks**<br>
All RNNs have the form of a chain of repeating neural-network modules. In standard RNNs, this repeating module has a very simple structure, such as a single tanh layer.
![RNN](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-SimpleRNN.png)

**[Long Short-Term Memory Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)**<br>
LSTMs also have this chain-like structure, but the repeating module has a different structure. Instead of having a single neural network layer, there are four, interacting in a very special way.
![LSTM](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)
<p align="center">
  <img width="300" height="200" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7dee414820d5c0162ae1fff1899e58b08923944f">
</p>

In [None]:
class LSTM(torch.nn.Module):
    """Bidirectional, multi-layer LSTM classifier emitting one logit per example.

    The final cell state of every layer and direction is concatenated, passed
    through dropout and projected to a single output by a linear layer.
    """

    def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, static=False, dropout=0.2):
        """Create the embedding table, the LSTM stack and the output layer.

        Note the positional order: ``num_layers`` comes *before* ``hidden_dim``.

        Args:
            input_dim: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            num_layers: Number of stacked LSTM layers.
            hidden_dim: Hidden size passed to the LSTM.
            static: When true, the embedding weights do not receive gradients.
            dropout: Probability used both inside the LSTM and before the
                linear layer.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        self.dropout = torch.nn.Dropout(p=dropout)

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        if static:
            self.embedding.weight.requires_grad_(False)

        n_directions = 2  # the LSTM below is bidirectional
        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout,
            batch_first=True,
        )
        self.linear = torch.nn.Linear(hidden_dim * num_layers * n_directions, 1)

    def forward(self, text):
        """Map a tensor of token indices to one logit per example.

        Args:
            text: Index tensor. Dims 0 and 1 are swapped after the embedding
                lookup before the batch-first LSTM runs, so the input is
                presumably ``(seq_len, batch)`` -- confirm against the iterator.

        Returns:
            The linear layer's output, with a last dimension of size 1.
        """
        batch_major = self.embedding(text).transpose(0, 1)
        _, (_, cell) = self.lstm(batch_major)
        # Lay the slices along dim 0 of the cell state side by side per example.
        features = torch.cat(cell.unbind(dim=0), dim=1)
        return self.linear(self.dropout(features))

In [None]:
# LSTM's positional order is (input_dim, embedding_dim, num_layers, hidden_dim).
# Passing the sizes by keyword keeps hidden_dim and num_layers from being
# swapped, which would build a network with the two values exchanged.
model = LSTM(
    input_dim=input_dim,
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    hidden_dim=hidden_dim,
)

In [None]:
# Per-class weights, created directly on the target device.
class_weights = torch.tensor([1.0, 15.0], device=device)
# Replace the embedding table's values with the pretrained vectors.
model.embedding.weight.data = pretrained_embeddings.to(device)

In [None]:
import torch.optim as optim
import torch.nn as nn

# Plain SGD over every model parameter.
optimizer = optim.SGD(params=model.parameters(), lr=0.0001)
# Sigmoid + binary cross-entropy on raw logits.
# NOTE(review): the `class_weights` tensor defined earlier is not passed to this loss.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the network and the loss module onto the selected device.
model, criterion = model.to(device), criterion.to(device)

In [None]:
# Per-epoch history buffers: one independent, initially empty list for each
# tracked loss / accuracy series.
(
    epoch_train_losses,
    epoch_test_losses,
    epoch_val_losses,
    accu_train_epoch,
    accu_test_epoch,
    accu_val_epoch,
) = ([] for _ in range(6))

In [None]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """Return the fraction of predictions that match ``y``.

    ``preds`` are passed through a sigmoid and rounded before being compared
    element-wise with ``y``.

    Args:
        preds: Tensor of raw scores.
        y: Tensor of targets to compare against.

    Returns:
        A scalar tensor: the number of matches divided by the length of the
        comparison result.
    """
    hard_labels = preds.sigmoid().round()
    matches = hard_labels.eq(y).float()
    return matches.sum() / float(len(matches))

# Training

In [None]:
import pyprind

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and record its mean metrics.

    Args:
        model: Module called on ``batch.Text``; its output is flattened to 1-D.
        iterator: Sized iterable of batches exposing ``Text`` and ``Label``.
        optimizer: Optimizer whose gradients are reset and stepped every batch.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` for this pass; ``mean_loss`` is a
        detached tensor. The same two values are also appended to the
        module-level ``epoch_train_losses`` and ``accu_train_epoch`` lists.
    """
    train_loss_batch = []
    accu_train_batch = []
    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:

        optimizer.zero_grad()

        # Call the module itself rather than .forward so registered hooks run.
        predictions = model(batch.Text).view(-1)
        # Cast the labels to the dtype/device of the predictions for the loss.
        batch.Label = (batch.Label).type_as(predictions)
        train_loss = criterion(predictions, batch.Label)
        acc = binary_accuracy(predictions, batch.Label)

        train_loss.backward()
        optimizer.step()

        # Keep only a detached value: storing the live loss tensor would keep
        # each batch's autograd graph reachable from the history lists.
        train_loss_batch.append(train_loss.detach())
        accu_train_batch.append(acc)
        bar.update()

    epoch_train_losses.append(sum(train_loss_batch)/len(iterator))
    accu_train_epoch.append(sum(accu_train_batch)/len(iterator))

    return epoch_train_losses[-1], accu_train_epoch[-1]

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without gradients and record the means.

    Args:
        model: Module whose ``forward`` is called on ``batch.Text``.
        iterator: Sized iterable of batches exposing ``Text`` and ``Label``.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` over the batches. Both values are also
        appended to the module-level ``epoch_val_losses`` and
        ``accu_val_epoch`` lists.
    """
    model.eval()
    n_batches = len(iterator)
    losses, accuracies = [], []

    with torch.no_grad():
        progress = pyprind.ProgBar(n_batches, bar_char='█')
        for batch in iterator:
            logits = model.forward(batch.Text).view(-1)
            # Labels are cast in place on the batch to match the logits.
            batch.Label = batch.Label.type_as(logits)

            losses.append(criterion(logits, batch.Label))
            accuracies.append(binary_accuracy(logits, batch.Label))
            progress.update()

        epoch_val_losses.append(sum(losses) / n_batches)
        accu_val_epoch.append(sum(accuracies) / n_batches)

    return epoch_val_losses[-1], accu_val_epoch[-1]

In [None]:
# Number of full passes over the training iterator.
epochs = 2

for epoch in range(epochs):
    # One optimisation pass, then a gradient-free pass over the validation iterator.
    train_loss, train_acc = train(
        model=model, iterator=train_iterator, optimizer=optimizer, criterion=criterion
    )
    valid_loss, valid_acc = evaluate(
        model=model, iterator=valid_iterator, criterion=criterion
    )

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')