In [1]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount = True)
dataset_path = 'gdrive/My Drive/Deep Learning/sentiment analysis/'

Mounted at /content/gdrive


In [None]:
!pip install pytreebank

Collecting pytreebank
  Downloading https://files.pythonhosted.org/packages/e0/12/626ead6f6c0a0a9617396796b965961e9dfa5e78b36c17a81ea4c43554b1/pytreebank-0.2.7.tar.gz
Building wheels for collected packages: pytreebank
  Building wheel for pytreebank (setup.py) ... [?25l[?25hdone
  Created wheel for pytreebank: filename=pytreebank-0.2.7-cp36-none-any.whl size=37072 sha256=a7fdfa82cd065cf1c2fe65cbdb3279cbfddebf50f5193f826dcd9a5fbf4be4dd
  Stored in directory: /root/.cache/pip/wheels/e0/b6/91/e9edcdbf464f623628d5c3aa9de28888c726e270b9a29f2368
Successfully built pytreebank
Installing collected packages: pytreebank
Successfully installed pytreebank-0.2.7


In [None]:
import pytreebank

In [None]:
import sys
import os

out_path = os.path.join(dataset_path,sys.path[0], 'sst_{}.txt')
print(out_path)
dataset = pytreebank.load_sst('./raw_data')
# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("{}\t{}\n".format(
                item.to_labeled_lines()[0][1],
                item.to_labeled_lines()[0][0] + 1,
            ))
# Print the length of the training set
print(len(dataset['train']))

gdrive/My Drive/Deep Learning/sentiment analysis/sst_{}.txt
8544


In [None]:
import pandas as pd
# Read train data
for file in ['sst_train','sst_test','sst_dev']:
  print(file)
  df = pd.read_csv(dataset_path+file+'.txt', sep='\t', header=None, names=['text', 'truth'])
  #df['truth'] = df['truth'].str.replace('__label__', '')
  df['truth'] = df['truth'].astype(int).astype('category')
  print(df.shape[0])
  print(df.head())
  df.to_csv(dataset_path + file+'.csv',index = False)


sst_train
8544
                                                text truth
0  The Rock is destined to be the 21st Century 's...     4
1  The gorgeously elaborate continuation of `` Th...     5
2  Singer/composer Bryan Adams contributes a slew...     4
3  You 'd think by now America would have had eno...     3
4               Yet the act is still charming here .     4
sst_test
2210
                                                text truth
0                     Effective but too-tepid biopic     3
1  If you sometimes like to go to the movies to h...     4
2  Emerges as something rare , an issue movie tha...     5
3  The film provides some great insight into the ...     3
4  Offers that rare combination of entertainment ...     5
sst_dev
1101
                                                text truth
0  It 's a lovely film with lovely performances b...     4
1  No one goes unindicted here , which is probabl...     3
2  And if you 're not nearly moved to tears by a ...     4
3             

In [2]:
import torch
from torchtext import data

SEED = 1234
import pandas as pd
import numpy as np
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

import nltk

import random
from sklearn.metrics import classification_report

#import pyprind
%matplotlib inline  

In [3]:
import spacy
spacy_en = spacy.load('en')

In [4]:
is_cuda = torch.cuda.is_available()
print("Cuda Status on system is {}".format(is_cuda))

Cuda Status on system is True


In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Filters were of sizes 3, 4 and 5

In [24]:
FILTER_SIZES = [3,4,5]

In [27]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import string

def tokenizer(text):
  token = [t.text for t in spacy_en.tokenizer(text)]
  if len(token) < FILTER_SIZES[-1]:
      for i in range(0, FILTER_SIZES[-1] - len(token)):
          token.append('<PAD>')
  return token

In [28]:
TEXT = data.Field(tokenize = tokenizer, batch_first = True)
LABEL = data.LabelField(dtype = torch.long)

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=dataset_path, train="sst_train.csv", 
    validation="sst_dev.csv", test="sst_test.csv",format="csv", skip_header=True, 
    fields=[('text', TEXT), ('truth', LABEL)]
)

In [31]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of valid examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 8544
Number of valid examples: 1101
Number of testing examples: 2210


In [32]:
TEXT.build_vocab(train_data, vectors=torchtext.vocab.Vectors(dataset_path+"glove.840B.300d.txt"), 
                 max_size=25000,unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [33]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 17166
Unique tokens in LABEL vocabulary: 5


In [34]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), sort_key=lambda x: len(x.text),
    batch_size=BATCH_SIZE,sort_within_batch = True,
    device=device)

Following is the CNN architecture. Filters of sizes 3, 4 and 5 are applied on the text along with max pooling. At the end, the information is passed on to a fully connected layer, which then outputs scores through softmax

In [35]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)


The best set of hyperparameters

In [36]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 5
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [37]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 5,511,605 trainable parameters


In [38]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350],
        ...,
        [ 0.0495, -0.2737, -0.2819,  ..., -0.2686,  0.5445,  0.1999],
        [ 0.8430, -0.0559, -0.0837,  ...,  0.9208, -0.2708, -0.4322],
        [ 0.4218,  0.2891,  0.6224,  ..., -0.0994, -0.3216, -0.2066]])

In [39]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [40]:
#class_weights = torch.tensor([1.0, 15.0]).cuda()

optimizer = optim.Adam(model.parameters(), lr=1e-4)#,weight_decay=0.001)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [41]:
def binary_accuracy(preds1, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    preds, ind= torch.max(F.softmax(preds1),1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc

In [42]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.truth)
        
        acc = binary_accuracy(predictions, batch.truth)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [43]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.truth)
            
            acc = binary_accuracy(predictions, batch.truth)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [44]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')


  """


| Epoch: 01 | Train Loss: 1.562 | Train Acc: 28.68% | Val. Loss: 1.521 | Val. Acc: 36.08% |
| Epoch: 02 | Train Loss: 1.485 | Train Acc: 36.10% | Val. Loss: 1.466 | Val. Acc: 37.91% |
| Epoch: 03 | Train Loss: 1.415 | Train Acc: 40.75% | Val. Loss: 1.413 | Val. Acc: 39.46% |
| Epoch: 04 | Train Loss: 1.349 | Train Acc: 43.90% | Val. Loss: 1.369 | Val. Acc: 40.59% |
| Epoch: 05 | Train Loss: 1.285 | Train Acc: 47.76% | Val. Loss: 1.339 | Val. Acc: 40.85% |
| Epoch: 06 | Train Loss: 1.238 | Train Acc: 49.21% | Val. Loss: 1.318 | Val. Acc: 41.89% |
| Epoch: 07 | Train Loss: 1.193 | Train Acc: 51.98% | Val. Loss: 1.302 | Val. Acc: 43.11% |
| Epoch: 08 | Train Loss: 1.151 | Train Acc: 54.52% | Val. Loss: 1.290 | Val. Acc: 42.68% |
| Epoch: 09 | Train Loss: 1.118 | Train Acc: 56.61% | Val. Loss: 1.280 | Val. Acc: 43.64% |
| Epoch: 10 | Train Loss: 1.073 | Train Acc: 58.80% | Val. Loss: 1.277 | Val. Acc: 44.33% |


In [None]:
test_add_notan = []

In [45]:

#model.load_state_dict(torch.load(dataset_path + 'tut1-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
#test_add_notan.append(test_acc*100)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.246 | Test Acc: 46.05%


  """


In [None]:
test_add_notan

[46.919642857142854, 48.16964285714286, 46.517857142857146, 47.58928571428571]

In [None]:
sum(test_norm_tanh)/len(test_norm_tanh)

47.27678571428571

In [None]:
sum(test_1)/len(test_1)

47.69642857142857

In [None]:
sum(test)/len(test)

47.84821428571429

In [None]:
test_tanh

[46.964285714285715,
 47.767857142857146,
 48.16964285714286,
 47.5,
 47.00892857142857,
 46.60714285714286]

In [None]:
sum(test_tanh[:-1])/len(test_tanh[:-1])

47.482142857142854

In [None]:
sum(test_add)/len(test_add)

47.481142857142856

In [None]:
sum(test_add_notan)/len(test_add_notan)

47.29910714285714