In [1]:
# CNN
# based on ideas of the paper 'Convolutional Neural Networks for Sentence Classification' by Yoon Kim (https://arxiv.org/abs/1408.5882)
# and the following github code: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

from torchtext import data
from torchtext.data import TabularDataset
from torchtext import data
from torchtext.data import Iterator, BucketIterator

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re 
import contractions
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import spacy
from spacy.lemmatizer import Lemmatizer
spacy_en = spacy.load('en_core_web_sm')

import random  # imported here so the seeding block below is self-contained

# Seed every random number generator this notebook has in scope so that
# repeated runs start from the same state.
# NOTE(review): torchtext's iterator shuffling presumably draws from Python's
# `random` module, which is why it is seeded too - confirm.
SEED = 1
random.seed(SEED)                          # Python standard-library RNG
np.random.seed(SEED)                       # NumPy global RNG
torch.manual_seed(SEED)                    # PyTorch RNG
torch.cuda.manual_seed(SEED)               # PyTorch RNG of the current CUDA device
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic algorithms

In [3]:
# Directory (relative path) that holds the local train/valid/test CSV files read below.
path = '../../data/Kaggle'

In [4]:
# Minimum number of tokens per phrase: tokenizer() pads shorter phrases with '<pad>'
# up to this length. It has to be at least as large as the biggest filter size used
# by the CNN so that every convolution fits on every phrase.
MINLEN = 100 # = maximum filter size

def clean(text):
    """Normalise whitespace and expand contractions in a phrase.

    Args:
        text: the raw phrase as a string.

    Returns:
        The phrase with every run of whitespace collapsed to a single space,
        leading/trailing whitespace stripped, and then passed through
        ``contractions.fix``.
    """
    # Raw string literal: '\s' inside a normal string is an invalid escape
    # sequence and produces a warning on recent Python versions.
    text = re.sub(r'\s+', ' ', text).strip()  # remove duplicate whitespace
    return contractions.fix(text)  # replace contractions

def lemmatize(text):
    """Return `text` with every spaCy token replaced by its lemma.

    The lemmas are joined with single spaces into one string.
    """
    lemmas = (token.lemma_ for token in spacy_en(text))
    return " ".join(lemmas)

def tokenizer(text):
    """Clean a phrase, split it into spaCy tokens and pad it to MINLEN tokens.

    Args:
        text: the raw phrase as a string.

    Returns:
        List of token strings. Phrases shorter than MINLEN are extended with
        '<pad>' entries so that all filters can be applied; longer phrases are
        returned unchanged.
    """
    cleaned = clean(text)
    # Lemmatisation via lemmatize() is deliberately not applied here.
    words = [piece.text for piece in spacy_en.tokenizer(cleaned)]
    missing = MINLEN - len(words)
    if missing > 0:
        words.extend(['<pad>'] * missing)
    return words

In [5]:
# Field definitions: TEXT runs the custom tokenizer() and lower-cases the tokens,
# LABEL holds the class label. Labels are created with a float dtype and are cast
# with .long() before the loss is computed in train() / evaluate().
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = data.LabelField(dtype=torch.float)

# Each dataset pairs the file's columns, in order, with a (name, field) entry.
# NOTE(review): columns paired with None are presumably skipped by torchtext - confirm.
# NOTE: the name `train` is rebound to the training function in a later cell, so this
# cell and the vocabulary/iterator cells must be executed before that definition.
# train_data = TabularDataset(f'{path}/train.tsv', 'tsv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # original train dataset
train = TabularDataset(f'{path}/train_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local train dataset
#train = TabularDataset(f'{path}/train_local_balanced.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local balanced train dataset

validate = TabularDataset(f'{path}/valid_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT),('label', LABEL)], skip_header=True) # local valid dataset

# The two test datasets are loaded without a label field; predict_local() reads the
# true labels separately from the last column of the same CSV files.
# test_data = TabularDataset(f'{path}/test.tsv', 'tsv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # original test data
test_data = TabularDataset(f'{path}/test_local.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # local test dataset
test_sentences = TabularDataset(f'{path}/test_local_sentences.csv', 'csv', fields=[('PhraseId', None),('SentenceId', None),('text', TEXT)], skip_header=True) # local test sentences

# split train_data into train and validate
# train, validate = train_data.split(split_ratio=0.8)  ## only for non-local!

In [6]:
# Build the token vocabulary from the training split and attach the "glove.6B.100d" vectors;
# these vectors are later copied into the model's embedding layer.
TEXT.build_vocab(train, vectors="glove.6B.100d")  
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train)   # changed from train_data to train

In [7]:
# Report the vocabulary sizes and the label-string -> class-index mapping.
# The class indices (not the original label values) are what the model is trained on.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 11691
Unique tokens in LABEL vocabulary: 5
defaultdict(<function _default_unk_index at 0x0000025AEBB688C8>, {'2': 0, '3': 1, '1': 2, '4': 3, '0': 4})


In [8]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Options shared by the validation and training bucket iterators.
_bucket_options = dict(
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.text),  # how to group the data
    sort_within_batch=False,
    shuffle=True,
    repeat=False,
)

valid_iter = BucketIterator(validate, **_bucket_options)

train_iter = BucketIterator(train, **_bucket_options)

# The test split is served by a plain iterator with sorting switched off.
test_iter = Iterator(
    test_data,
    batch_size=BATCH_SIZE,
    sort=False,
    device=device,
    sort_within_batch=False,
    repeat=False,
)

In [9]:
class CNN(nn.Module):
    """Convolutional sentence classifier with several filter heights.

    Token indices are embedded, passed through one 2-D convolution per filter
    size (each kernel spans the full embedding width), max-pooled over the
    phrase, concatenated and mapped to class scores by a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of kernel heights (in tokens), one convolution each.
        output_dim: number of scores produced per example.
        dropout: dropout probability applied to the embedded input.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One convolution branch per requested filter height.
        branches = []
        for height in filter_sizes:
            branches.append(
                nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(height, embedding_dim))
            )
        self.convs = nn.ModuleList(branches)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class scores.

        Args:
            x: LongTensor of token indices, shape [phrase length, batch size].
               The phrase length must be at least the largest filter size.

        Returns:
            Tensor of shape [batch size, output_dim] with unnormalised scores.
        """
        # [phrase length, batch size] -> [batch size, phrase length]
        batch_first = x.permute(1, 0)

        # Dropout is applied to the embeddings (not to the concatenated features).
        # embedded: [batch size, phrase length, embedding dim]
        embedded = self.dropout(self.embedding(batch_first))
        # Add the single input channel Conv2d expects:
        # [batch size, 1, phrase length, embedding dim]
        embedded = embedded.unsqueeze(1)

        features = []
        for conv in self.convs:
            # The kernel covers the whole embedding width, so the last output
            # dimension has size 1 and is squeezed away:
            # [batch size, n_filters, phrase length - filter size + 1]
            activated = F.relu(conv(embedded)).squeeze(3)
            # Max over all positions of the phrase: [batch size, n_filters]
            features.append(F.max_pool1d(activated, activated.shape[2]).squeeze(2))

        # [batch size, n_filters * len(filter_sizes)]
        cat = torch.cat(features, dim=1)
        return self.fc(cat)
    
    
#----- Explanations concerning dimensions:
# Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
#   - input: (N,C_in,H,W) with  N=batch size, C_in=num of channels, H=height of input planes, W=width
#   - output: (N, C_out, H_out, W_out), calculation of H_out and W_out see https://pytorch.org/docs/stable/nn.html
#   - here: N=batch_size, C_out=n_filters, H_out = sent len - filter height + 1, W_out = 1
# F.relu(..) applies rectified linear unit function element-wise: F.relu(x)= max(0, x)
# torch.nn.functional.max_pool1d(*args, **kwargs): Applies 1D max pooling over an input signal composed of several input planes.
#   - here: max_pool1d(input, kernel_size); kernel_size = size of window to take a max over
#   - output: [batch size, n_filters, 1] --> apply squeeze(2) to get the shape [batch size, n_filters]
# torch.cat(tensors, dim=..): 
#  - Concatenates the given sequence of seq tensors in the given dimension. 
#  - All tensors must either have the same shape (except in the concatenating dimension) or be empty.
# torch.nn.functional.dropout(input, p=0.5, training=True, inplace=False) 
#  - During training, randomly zeroes some of the elements of the input tensor with probability p using samples from a Bernoulli distribution.

In [None]:
# Hyper-parameters of the CNN.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # must match the width of the vectors copied into the embedding below
FILTER_SIZES = [50,100] # [3,4,6,8] 
N_FILTERS = 120
OUTPUT_DIM = 5                # one score per label class
DROPOUT = 0.6 

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

# Initialise the embedding layer with the vectors attached to the TEXT vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

optimizer = optim.Adam(model.parameters())
# Unweighted cross-entropy; the trailing comment keeps a class-weighted variant and the
# label -> index mapping it referred to.
criterion = nn.CrossEntropyLoss()  #weight = torch.tensor([15, 1, 1, 1, 1]).float()  {'2': 0, '3': 1, '1': 2, '4': 3, '0': 4})

# Move the model and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def get_labels(predictions):
    """Return, for every row of `predictions`, the index of its largest entry.

    Args:
        predictions: 2-D tensor with one row of class scores per example.

    Returns:
        1-D tensor of indices along dimension 1.
    """
    _, best_index = torch.max(predictions, dim=1)
    return best_index

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: network called as ``model(batch.text)``; it must return one row
            of class scores per example.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer that updates the model's parameters.
        criterion: loss called as ``criterion(predictions, targets)`` with
            integer class targets.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats, each averaged over the
        batches of the iterator.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text)
        targets = batch.label.long()  # labels arrive as floats; the loss needs class indices

        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        # Accuracy is computed with torch on the tensors' own device. Handing
        # the tensors to scikit-learn fails when they live on the GPU and made
        # the result type depend on the installed scikit-learn version.
        correct = predictions.argmax(dim=1) == targets

        epoch_loss += loss.item()
        epoch_acc += correct.float().mean().item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def plot_confusion_matrix(cm, labels=None):
    """Display a confusion matrix as a colour-coded image.

    Args:
        cm: square 2-D array; rows are drawn on the 'True' axis and columns on
            the 'Predicted' axis.
        labels: optional sequence of class names, one per row/column of `cm`.
            Defaults to ``LABEL.vocab.itos``.
    """
    if labels is None:
        labels = LABEL.vocab.itos
    labels = list(labels)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    ax.set_title('Confusion matrix')
    fig.colorbar(cax)

    # Pin the tick positions before naming them. Relying on the default tick
    # locator (and a leading '' label to absorb its first tick) can attach the
    # names to the wrong cells.
    positions = list(range(len(labels)))
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on a dataset without updating its parameters.

    Args:
        model: network called as ``model(batch.text)``; it must return one row
            of class scores per example.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(predictions, targets)`` with
            integer class targets.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats, each averaged over the
        batches of the iterator.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.eval()  # the model is left in evaluation mode afterwards

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)
            targets = batch.label.long()  # labels arrive as floats; the loss needs class indices
            pred_labels = predictions.argmax(dim=1)

            # Accuracy is computed with torch on the tensors' own device;
            # scikit-learn cannot consume tensors that live on the GPU.
            # To accumulate a confusion matrix here, move both tensors to the
            # CPU first, e.g.
            # confusion_matrix(targets.cpu(), pred_labels.cpu(), labels=[0, 1, 2, 3, 4]),
            # sum the per-batch matrices and pass the total to plot_confusion_matrix().

            epoch_loss += criterion(predictions, targets).item()
            epoch_acc += (pred_labels == targets).float().mean().item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
EPOCHS = 6

# Per-epoch history: row 0 = training split, row 1 = validation split.
# The arrays start out as NaN instead of np.empty's uninitialised memory, so an
# interrupted run leaves clearly-missing values rather than arbitrary numbers
# in the history that is plotted afterwards.
loss = np.full([2, EPOCHS], np.nan)
acc = np.full([2, EPOCHS], np.nan)

for epoch in range(EPOCHS):

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    loss[0, epoch] = train_loss
    loss[1, epoch] = valid_loss
    acc[0, epoch] = train_acc
    acc[1, epoch] = valid_acc

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 1.010 | Train Acc: 58.89% | Valid Loss: 1.010 | Valid Acc: 58.81% |
| Epoch: 02 | Train Loss: 0.886 | Train Acc: 63.72% | Valid Loss: 0.983 | Valid Acc: 59.43% |
| Epoch: 03 | Train Loss: 0.830 | Train Acc: 65.81% | Valid Loss: 0.961 | Valid Acc: 61.11% |


In [None]:
def save_plot(train, validate, ylabel, plotname, EPOCHS):
    """Plot a training and a validation curve over the epochs and save the figure.

    Args:
        train: sequence of EPOCHS values for the training split (drawn in blue).
        validate: sequence of EPOCHS values for the validation split (drawn in green).
        ylabel: label of the y axis.
        plotname: file name the figure is saved to.
        EPOCHS: number of epochs; the x axis runs from 1 to EPOCHS.
    """
    epochs = np.arange(1, EPOCHS + 1)
    curves = (
        (train, 'b', 'train'),
        (validate, 'g', 'validate'),
    )
    for values, colour, name in curves:
        plt.plot(epochs, values, colour, label=name)
    plt.xlabel('epochs')
    plt.ylabel(ylabel)
    plt.legend(shadow=False)
    # Save before showing, then close the figure.
    plt.savefig(plotname)
    plt.show()
    plt.close()
     
# Write the loss and accuracy curves to PNG files
# (row 0 of each history array = training split, row 1 = validation split).
save_plot(loss[0,:], loss[1,:], 'loss', 'cnn_loss.png', EPOCHS)
save_plot(acc[0,:], acc[1,:], 'accuracy','cnn_acc.png', EPOCHS)

In [None]:
def predict(model, test_data):
    """Predict the label of every example in `test_data`, one example at a time.

    Args:
        model: trained network; it is called with a LongTensor of token
            indices of shape [phrase length, 1].
        test_data: indexable dataset whose items expose a tokenised ``.text``.

    Returns:
        1-D float tensor with one entry per example. Each entry is the
        original label value (looked up through ``LABEL.vocab.itos``), not the
        class index used inside the model.
    """
    prediction = torch.zeros(len(test_data))

    # Switch to evaluation mode so dropout is disabled no matter which mode
    # the caller left the model in; the previous mode is restored at the end.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(len(test_data)):
                token_ids = [TEXT.vocab.stoi[tok] for tok in test_data[i].text]
                tensor = torch.LongTensor(token_ids).to(device)
                tensor = tensor.unsqueeze(1)  # tensor has shape [len(token_ids) x 1]
                output = model(tensor)
                # argmax over the raw scores; a softmax would not change the winner
                pred_idx = torch.argmax(output[0]).item()
                prediction[i] = int(LABEL.vocab.itos[pred_idx])
    finally:
        model.train(was_training)
    return prediction

In [None]:
#sub = pd.read_csv(f'{path}/sampleSubmission.csv')

#sub.Sentiment = predict(model, test_data)
#sub.Sentiment = sub.Sentiment.astype(int)
#sub.to_csv('sub_cnn.csv', header = True, index=False)  

In [None]:
# local test data
def predict_local(test_data_filename, test_data, model):
    """Print and return the model's accuracy on a local, labelled test file.

    Args:
        test_data_filename: name of the CSV file inside `path`; its last
            column supplies the true labels.
        test_data: dataset with the same examples, passed on to predict().
        model: trained network passed on to predict().

    Returns:
        The accuracy computed by ``accuracy_score``.
    """
    frame = pd.read_csv(f'{path}/{test_data_filename}')
    expected = frame.iloc[:, -1]
    acc = accuracy_score(expected, predict(model, test_data))
    print(acc)
    return acc

# Accuracy on the two local test sets: the sentence-only file and the full test file.
acc_test_local_sentences = predict_local('test_local_sentences.csv', test_sentences, model)
acc_test_local = predict_local('test_local.csv', test_data, model)

