# Table of contents

[Supervised - basic models](./supervised_basic.ipynb)

[Supervised - RNN models](./supervised_rnn.ipynb)

[Unsupervised - Word2Vec](./unsupervised_w2v.ipynb)

[Unsupervised - Dimensionality Reduction](./unsupervised_dim.ipynb)

[Unsupervised - LDA](./unsupervised_LDA.ipynb)

[Performance Evaluation](./evaluation.ipynb)

In [28]:
import torch
import torch.nn as nn
import pickle

from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator, Iterator 
from torchtext.vocab import Vectors
 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tqdm
import time

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from evaluation_helper import evaluation_helper
from preprocess_helper import preprocess_helper

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allenhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allenhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\allenhu\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\allenhu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
RANDOM_SEED = 42

In [3]:
torch.manual_seed(RANDOM_SEED)
torch.set_num_threads(4)
torch.set_num_interop_threads(4)

In [4]:
lem=WordNetLemmatizer()
retokenizer = RegexpTokenizer(r"\w+")
def tokenizer(text):
    words=[word for word in retokenizer.tokenize(text)]
    words=[lem.lemmatize(w) for w in words]
    return words

In [5]:
#text_field = Field(tokenize=tokenizer, lower=False, include_lengths=True, batch_first=True)

In [6]:
text_field = Field(tokenize='basic_english', lower=False, include_lengths=True, batch_first=True)

In [7]:
label_field = LabelField(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)

In [8]:
fields = [('original_text',text_field),('label', label_field)]

In [9]:
training_data=TabularDataset(path = 'data/WikiLarge_Train.csv',format = 'csv',fields = fields,skip_header = True)

In [10]:
print(len(training_data))

416768


In [11]:
import random
train_data, dev_data = training_data.split(split_ratio=0.8, random_state=random.seed(RANDOM_SEED))

In [12]:
text_field.build_vocab(train_data, min_freq=50, vectors='charngram.100d')
 

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(text_field.vocab))


#Commonly used words
print(text_field.vocab.freqs.most_common(10))  



Size of TEXT vocabulary: 9551
[('the', 456951), (',', 389835), ('.', 357180), ('of', 232093), ('in', 202680), ("'", 167878), ('and', 167598), ('a', 156818), ('is', 131249), ('to', 99407)]


In [13]:
type(text_field.vocab.vectors)

torch.Tensor

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

trainIterator = BucketIterator(train_data, batch_size= 64, sort_key=lambda x: len(x.original_text),
                            device=device, sort=False, sort_within_batch=True)
dev_iter = BucketIterator(dev_data,  batch_size= 64, sort_key=lambda x: len(x.original_text),
                            device=device, sort=False, sort_within_batch=True)


In [15]:
embeddingSize = 100
hiddenSize = 10
dropoutRate = 0.5
numEpochs = 5
vocabSize = len(text_field.vocab)
pad = 1
unk = 0

class MyRNN(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.name = model
        self.LSTM = (model == 'LSTM' or model == 'BiLSTM')
        self.bidir = (model == 'BiLSTM')
        
        self.embed = nn.Embedding(vocabSize, embeddingSize, padding_idx = pad)
        
        if model == 'RNN': 
            self.rnn = nn.RNN(embeddingSize, hiddenSize)
        elif model == 'GRU': 
            self.rnn = nn.GRU(embeddingSize, hiddenSize)
        else: 
            self.rnn = nn.LSTM(embeddingSize, hiddenSize, bidirectional=self.bidir)

        self.dense = nn.Linear(hiddenSize * (2 if self.bidir else 1), 1)
        self.dropout = nn.Dropout(dropoutRate)
        
    def forward(self, text, textLengths):
        embedded = self.dropout(self.embed(text))
        
        packedEmbedded = nn.utils.rnn.pack_padded_sequence(embedded, textLengths, batch_first=True)
        if self.LSTM: 
            packedOutput, (hidden, cell) = self.rnn(packedEmbedded)
        else: 
            packedOutput, hidden = self.rnn(packedEmbedded)

        output, outputLengths = nn.utils.rnn.pad_packed_sequence(packedOutput, batch_first=True)
        if self.bidir: 
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else: 
            hidden = hidden[0]

        return self.dense(self.dropout(hidden))

In [16]:
basicRNN =  MyRNN(model='RNN')
GRU =  MyRNN(model='GRU') 
LSTM = MyRNN(model='LSTM') 
biLSTM = MyRNN(model='BiLSTM') 
models = [basicRNN, GRU, LSTM, biLSTM]

In [17]:
for model in models:
    if model is None:
        continue
    model.embed.weight.data.copy_(text_field.vocab.vectors)
    model.embed.weight.data[unk] = torch.zeros(embeddingSize)
    model.embed.weight.data[pad] = torch.zeros(embeddingSize)

In [18]:
criterion = nn.BCEWithLogitsLoss()

def batchAccuracy(preds, targets):
    roundedPreds = (preds >= 0)
    return (roundedPreds == targets).sum().item() / len(preds)

In [19]:
# Training
numEpochs = 10

train_time = {}
for model in models: 
    if model is not None:
        model.train()

for model in models:
    if model is None:
        continue

    

    start_time = time.time()
    torch.manual_seed(0)
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(numEpochs):
        epochLoss = 0
        i=0
        j=0
        for batch in trainIterator:
            optimizer.zero_grad()
            examples = [example for example in batch] #added 
            text, textLen = examples[0]
            for n in range(len(textLen)):
                if textLen[n] ==0:
                    textLen[n]=1
                    j+=1
            predictions = model(text, textLen).squeeze(1)
            loss = criterion(predictions, examples[1])
            loss.backward()
            optimizer.step()
            epochLoss += loss.item()
            i+=1
        print(i,j)  
        print(f'Model: {model.name}, Epoch: {epoch + 1}, Train Loss: {epochLoss / i}')
    train_time[model.name] = time.time() - start_time
    print()

5210 4
Model: RNN, Epoch: 1, Train Loss: 0.6809835895066527
5210 4
Model: RNN, Epoch: 2, Train Loss: 0.6614409737818072
5210 4
Model: RNN, Epoch: 3, Train Loss: 0.6412096203260138
5210 4
Model: RNN, Epoch: 4, Train Loss: 0.6305526934406808
5210 4
Model: RNN, Epoch: 5, Train Loss: 0.6247683097442144
5210 4
Model: RNN, Epoch: 6, Train Loss: 0.6216648888736678
5210 4
Model: RNN, Epoch: 7, Train Loss: 0.6192985341153081
5210 4
Model: RNN, Epoch: 8, Train Loss: 0.6214612549646344
5210 4
Model: RNN, Epoch: 9, Train Loss: 0.6073265984597225
5210 4
Model: RNN, Epoch: 10, Train Loss: 0.6031984176779892

5210 4
Model: GRU, Epoch: 1, Train Loss: 0.6205459449147079
5210 4
Model: GRU, Epoch: 2, Train Loss: 0.5692922777440864
5210 4
Model: GRU, Epoch: 3, Train Loss: 0.554219114036798
5210 4
Model: GRU, Epoch: 4, Train Loss: 0.5447768971295366
5210 4
Model: GRU, Epoch: 5, Train Loss: 0.5372544190476357
5210 4
Model: GRU, Epoch: 6, Train Loss: 0.5304315085288659
5210 4
Model: GRU, Epoch: 7, Train Loss

In [20]:
train_time

{'RNN': 971.8303790092468,
 'GRU': 1230.415951013565,
 'LSTM': 1226.0426499843597,
 'BiLSTM': 1697.0781235694885}

In [23]:
evaluator = evaluation_helper()

In [25]:
# Evaluation

for model in models: 
    if model is not None:
        model.eval()

with torch.no_grad():
    
    for model in models:
        
        if model is None:
            continue

        accuracy = 0.0
        i=0
        j=0
        for batch in dev_iter:
            examples = [example for example in batch] #added 
            text, textLen = examples[0]
            for n in range(len(textLen)):
                if textLen[n] ==0:
                    textLen[n]=1
            predictions = model(text, textLen).squeeze(1)
            loss = criterion(predictions, examples[1])
            acc = batchAccuracy(predictions, examples[1])
            accuracy += acc
#             evaluator.evaluate(model.name, examples[1], predictions, train_time[model.name])
            i+=1
        print(i, j)
        print('Model: {}, Validation Accuracy: {}%'.format(model.name, accuracy / i * 100))

1303 0
Model: RNN, Validation Accuracy: 67.68948506405337%
1303 0
Model: GRU, Validation Accuracy: 71.85738163409883%
1303 0
Model: LSTM, Validation Accuracy: 72.38085925969656%
1303 0
Model: BiLSTM, Validation Accuracy: 72.66856440758015%


In [26]:
model.eval()

MyRNN(
  (embed): Embedding(9551, 100, padding_idx=1)
  (rnn): LSTM(100, 10, bidirectional=True)
  (dense): Linear(in_features=20, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)