In [1]:
# bi-directional LSTM
# based on ideas from: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

from torchtext import data
from torchtext.data import TabularDataset
from torchtext import data
from torchtext.data import Iterator, BucketIterator

import pandas as pd
import re 
import contractions
from spacy.lemmatizer import Lemmatizer
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

import numpy as np
import matplotlib.pyplot as plt

import spacy
import random

# English spaCy pipeline; used by the tokenizer/lemmatizer helpers defined below.
spacy_en = spacy.load('en_core_web_sm')

SEED = 1

# Seed every random number generator the notebook may touch, not only torch.
# Python's and NumPy's global RNGs are seeded too so that any shuffling that
# relies on them (presumably torchtext's iterators - TODO confirm) is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Directory (relative to this notebook) that holds the csv files read below.
path = '../../data/Kaggle'

In [4]:
def clean(text):
    """Normalise whitespace and expand contractions in `text`.

    Every run of whitespace is collapsed to a single space, leading/trailing
    whitespace is stripped, and the result is passed through
    `contractions.fix`.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()  # remove duplicate whitespaces
    text = contractions.fix(text)  # replace contractions
    return text

def lemmatize(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The text is run through the `spacy_en` pipeline and the lemmas are joined
    with single spaces.
    """
    lemmas = [token.lemma_ for token in spacy_en(text)]
    return " ".join(lemmas)

def tokenizer(text):
    """Clean `text` and split it into a list of token strings.

    The text is first passed through `clean` (lemmatisation is currently
    disabled) and then through the spaCy tokenizer. If no token is produced,
    a single '.' token is returned so the result is never empty.
    """
    cleaned = clean(text)
    # cleaned = lemmatize(cleaned)
    tokens = [tok.text for tok in spacy_en.tokenizer(cleaned)]
    return tokens if tokens else ['.']

In [5]:
# Field definitions: text is tokenized with `tokenizer` and lower-cased;
# labels are stored as floats (they are cast with .long() before the loss is computed).
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = data.LabelField(dtype=torch.float)

# Each dataset maps the csv columns, in order, to (name, field) pairs.
# PhraseId and SentenceId are mapped to None (presumably ignored by torchtext - TODO confirm).
# NOTE(review): the name `train` is rebound later in this notebook by `def train(...)`;
# re-running the iterator cell after that definition would pass the function, not this dataset.
# train_data = TabularDataset(f'{path}/train.tsv', 'tsv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # original train dataset
train = TabularDataset(f'{path}/train_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local train dataset
#train = TabularDataset(f'{path}/train_local_balanced.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local balanced train dataset

validate = TabularDataset(f'{path}/valid_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local valid dataset

# The test datasets are loaded without a label field.
# test_data = TabularDataset(f'{path}/test.tsv', 'tsv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # original test data
test_data = TabularDataset(f'{path}/test_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # local test dataset
test_sentences = TabularDataset(f'{path}/test_local_sentences.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # local test sentences

# split train_data into train and validate
#---train, validate = train_data.split(split_ratio = 0.8) # only if not local!
 
# Both vocabularies are built from the training set only; the "glove.6B.100d"
# vectors attached here are later read back through TEXT.vocab.vectors.
TEXT.build_vocab(train, vectors = "glove.6B.100d")  
LABEL.build_vocab(train)

# The printed stoi mapping shows that a label string is not mapped to its own
# integer value (e.g. '2' -> 0), so predicted indices must be translated back
# through LABEL.vocab.itos.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 11691
Unique tokens in LABEL vocabulary: 5
defaultdict(<function _default_unk_index at 0x000001C4CF3C98C8>, {'2': 0, '3': 1, '1': 2, '4': 3, '0': 4})


In [6]:
BATCH_SIZE = 32

# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Validation batches: the sort key (text length) tells the iterator how to group the data.
valid_iter = BucketIterator(
    validate,
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    shuffle=True,
    repeat=False,
)

# Training batches, configured exactly like the validation iterator.
train_iter = BucketIterator(
    train,
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    shuffle=True,
    repeat=False,
)

# Test batches use a plain Iterator with sorting switched off.
test_iter = Iterator(
    test_data,
    batch_size=BATCH_SIZE,
    sort=False,
    device=device,
    sort_within_batch=False,
    repeat=False,
)

In [7]:
class biLSTM(nn.Module):
    """Multi-layer (optionally bidirectional) LSTM sequence classifier.

    Token indices are embedded and passed through dropout and an LSTM. The
    final hidden state of the last LSTM layer (forward and backward states
    concatenated when bidirectional) is projected to `output_dim` logits.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per sequence.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super().__init__()

        self.bidirectional = bool(bidirectional)
        num_directions = 2 if self.bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; passing a non-zero
        # value with a single layer has no effect and merely raises a warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.biLstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                              bidirectional=self.bidirectional, dropout=lstm_dropout)
        # The classifier input is one hidden state per direction.
        self.linear = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute logits for a batch of token-index sequences.

        Args:
            x: LongTensor of token indices laid out as [seq_len, batch]
               (the LSTM is created with the default batch_first=False).

        Returns:
            Tensor of shape [batch, output_dim] with unnormalised scores.
        """
        embedded = self.dropout(self.embedding(x))
        _, (hidden, _) = self.biLstm(embedded)
        if self.bidirectional:
            # Last layer: hidden[-2] is the forward state, hidden[-1] the backward state.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the final state of the top layer.
            final_state = hidden[-1, :, :]
        return self.linear(final_state)

In [8]:
# Hyper-parameters of the network.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 512
OUTPUT_DIM = 5
NUM_LAYERS = 4
BIDIRECTIONAL = True
DROPOUT = 0.6  # previously tried: 0.5

model = biLSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=NUM_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [9]:
# Initialise the embedding layer with the pretrained vectors attached to the
# TEXT vocabulary; copy_ overwrites the existing weights in place.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3552,  0.4732,  0.8660,  ...,  0.2483, -0.0049,  0.8731],
        [-0.1431,  0.0487,  0.0565,  ..., -0.0402, -0.3744,  0.5650],
        [ 0.1580, -0.2077,  0.0084,  ..., -1.2656, -0.2771, -0.3230]])

In [None]:
# Move the model to the target device first: the torch.optim documentation
# recommends constructing optimizers only after the parameters are in place.
model = model.to(device)

optimizer = optim.Adam(model.parameters())

# reduction='sum' makes every loss value the sum over the examples of a batch.
criterion = nn.CrossEntropyLoss(reduction='sum')  #weight = torch.tensor([15, 2, 2,2,2]).float()
criterion = criterion.to(device)

In [None]:
def get_labels(predictions):
    """Return, for every row of `predictions`, the index of its largest entry."""
    _, max_idx = torch.max(predictions, dim=1)
    return max_idx

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Args:
        model: module mapping `batch.text` to one row of class scores per example.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer that updates the parameters of `model`.
        criterion: loss called as `criterion(scores, integer_targets)`.

    Returns:
        Tuple `(loss, accuracy)`: the per-batch criterion values and the
        per-batch accuracies, each averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)
        targets = batch.label.long()

        loss = criterion(predictions, targets)

        # Accuracy is computed with torch ops so it also works when the batch
        # lives on the GPU (sklearn cannot consume CUDA tensors).
        acc = (get_labels(predictions) == targets).float().mean()

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating its parameters.

    The model is switched to evaluation mode (and left in that mode) and
    gradients are disabled while the batches are processed.

    Args:
        model: module mapping `batch.text` to one row of class scores per example.
        iterator: sized iterable of batches exposing `.text` and `.label`.
        criterion: loss called as `criterion(scores, integer_targets)`.

    Returns:
        Tuple `(loss, accuracy)`: the per-batch criterion values and the
        per-batch accuracies, each averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            targets = batch.label.long()

            loss = criterion(predictions, targets)

            # Accuracy is computed with torch ops so it also works when the
            # batch lives on the GPU (sklearn cannot consume CUDA tensors).
            acc = (get_labels(predictions) == targets).float().mean()

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
EPOCHS = 5

# Metric history: row 0 holds the training value, row 1 the validation value,
# with one column per epoch.
loss = np.empty([2, EPOCHS])
acc = np.empty([2, EPOCHS])

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    loss[:, epoch] = (train_loss, valid_loss)
    acc[:, epoch] = (train_acc, valid_acc)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 32.744 | Train Acc: 57.96% | Valid Loss: 31.466 | Valid Acc: 59.76% |


In [None]:
def save_plot(train, validate, ylabel, plotname, EPOCHS):
    """Plot a training and a validation curve over the epochs, save and show the figure.

    Args:
        train: per-epoch values of the training metric (drawn in blue).
        validate: per-epoch values of the validation metric (drawn in green).
        ylabel: label of the y axis.
        plotname: file name the figure is saved under.
        EPOCHS: number of epochs; the x axis runs from 1 to EPOCHS.
    """
    epochs = 1 + np.arange(EPOCHS)
    curves = ((train, 'b', 'train'), (validate, 'g', 'validate'))
    for values, colour, name in curves:
        plt.plot(epochs, values, colour, label=name)

    plt.xlabel('epochs')
    plt.ylabel(ylabel)
    plt.legend(shadow=False)

    plt.savefig(plotname)
    plt.show()
    plt.close()

# Row 0 of `loss`/`acc` holds the training values and row 1 the validation
# values, as filled in by the training loop.
save_plot(loss[0,:], loss[1,:], 'loss', 'BLSTM_loss.png', EPOCHS)
save_plot(acc[0,:], acc[1,:], 'accuracy','BLSTM_acc.png', EPOCHS)

In [None]:
def predict(model, test_data):
    """Predict a label for every example of `test_data`, one example at a time.

    The model is put into evaluation mode for the duration of the call, so
    dropout is disabled, and its previous mode is restored afterwards.

    Args:
        model: module mapping a [seq_len, 1] LongTensor to a [1, n_classes] score row.
        test_data: indexable, sized collection of examples exposing a `.text` token list.

    Returns:
        1-D float tensor with one predicted label per example. Predicted class
        indices are translated back to the original label values through
        `LABEL.vocab.itos`.
    """
    prediction = torch.zeros(len(test_data))

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(len(test_data)):
                token_ids = [TEXT.vocab.stoi[tok] for tok in test_data[i].text]
                tensor = torch.LongTensor(token_ids).to(device)
                tensor = tensor.unsqueeze(1)  # tensor has shape [len(test_sen) x 1]
                output = model(tensor)
                # softmax is monotonic, so the argmax of the raw scores is the predicted class.
                pred_idx = torch.argmax(output[0]).item()
                prediction[i] = int(LABEL.vocab.itos[pred_idx])
    finally:
        model.train(was_training)

    return prediction

In [None]:
#sub = pd.read_csv(f'{path}/sampleSubmission.csv')

#sub.Sentiment = predict(model, test_data)
#sub.Sentiment = sub.Sentiment.astype(int)
#sub.to_csv('sub_blstm.csv', header = True, index=False)  

In [None]:
# local test data
def predict_local(test_filename, test_data, model):
    """Print and return the accuracy of `model` on a local, labelled test file.

    Args:
        test_filename: csv file inside `path`; its last column is used as the true labels.
        test_data: the examples handed to `predict`.
        model: the model handed to `predict`.

    Returns:
        The accuracy computed by `accuracy_score`.
    """
    reference = pd.read_csv(f'{path}/{test_filename}')
    true_labels = reference.iloc[:, -1]
    acc = accuracy_score(true_labels, predict(model, test_data))
    print(acc)
    return acc

# Accuracy on the local test sentences and on the full local test set; each
# call reads the true labels from the last column of the named csv file.
acc_test_local_sentences = predict_local('test_local_sentences.csv', test_sentences, model)
acc_test_local = predict_local('test_local.csv',test_data, model)

