In [1]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=92bae75e37b0477403e2454705fa8ba8186c94cdfb78a1923ffface9abbc2664
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
url = "http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip"
import wget

wget.download(url, 'stanfordSentimentTreebank.zip')
import zipfile
with zipfile.ZipFile("stanfordSentimentTreebank.zip", 'r') as zip_ref:
    zip_ref.extractall("stanfordSentimentTreebank")

In [3]:
data_sentences_txt_path="./stanfordSentimentTreebank/stanfordSentimentTreebank/datasetSentences.txt"
sentiment_labels_txt_path="./stanfordSentimentTreebank/stanfordSentimentTreebank/sentiment_labels.txt"

In [4]:
    def get_merged_dataset(sst_dir):
        sentiment_labels = pd.read_csv(os.path.join(sst_dir, "sentiment_labels.txt"), sep="|")
        sentence_ids = pd.read_csv(os.path.join(sst_dir, "datasetSentences.txt"), sep="\t")
        dictionary = pd.read_csv(os.path.join(sst_dir, "dictionary.txt"), sep="|", names=['phrase', 'phrase ids'])
        train_test_split = pd.read_csv(os.path.join(sst_dir, "datasetSplit.txt"))
        sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on='sentence', right_on='phrase')
        sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')
        return pd.merge(sentence_phrase_split, sentiment_labels, on='phrase ids').sample(frac=1)

In [5]:
import pandas as pd
import os
import torch
from collections import defaultdict
# df=get_merged_dataset("./stanfordSentimentTreebank/stanfordSentimentTreebank")

In [6]:
sst_dir = "./stanfordSentimentTreebank/stanfordSentimentTreebank"
df_phrases = pd.read_csv(os.path.join(sst_dir, "dictionary.txt"), sep='|', header=None)
df_labels = pd.read_csv(os.path.join(sst_dir, "sentiment_labels.txt"), sep='|')

df = pd.merge(df_phrases, df_labels, how='inner', left_on=1, right_on='phrase ids')
df.rename(columns={0: 'sentence'}, inplace=True)
df.sentence[0]

'!'

In [7]:
df.head()

Unnamed: 0,sentence,1,phrase ids,sentiment values
0,!,0,0,0.5
1,! ',22935,22935,0.52778
2,! '',18235,18235,0.5
3,! Alas,179257,179257,0.44444
4,! Brilliant,22936,22936,0.86111


In [8]:
df.columns

Index(['sentence', 1, 'phrase ids', 'sentiment values'], dtype='object')

In [9]:
df[["sentence","sentiment values"]].shape

(239232, 2)

## Defining Fields

Now we shall be defining Sentiment as a LabelField, which is a subclass of Field that sets sequen tial to False (as it’s our numerical category class). Sentence is a standard Field object, where we have decided to use the spaCy tokenizer and convert all the text to lower‐ case.

In [10]:
# Import Library
import random
import torch, torchtext
from torchtext import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f3bd8349c48>

In [11]:
Sentence = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Sentiment = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [12]:
fields = [('sentence', Sentence),('sentiment',Sentiment)]

In [13]:
def discretize_label(label):
  if label <= 0.2: return 0
  if label <= 0.4: return 1
  if label <= 0.6: return 2
  if label <= 0.8: return 3
  return 4

In [14]:
example = [data.Example.fromlist([df.sentence[i],discretize_label(df["sentiment values"][i])], fields) for i in range(df.shape[0])] 

In [15]:
# train = [data.Example.fromlist([df.sentence[i],discretize_label(df["sentiment values"][i])], fields) for i in range(df.shape[0]) if df.splitset_label[i] == 1] 

In [16]:
# test = [data.Example.fromlist([df.sentence[i],discretize_label(df["sentiment values"][i])], fields) for i in range(df.shape[0]) if df.splitset_label[i] == 1] 

In [17]:
sst_dataset = data.Dataset(example, fields)
# train = data.Dataset(train, fields)
# test = data.Dataset(test, fields)

In [18]:
(train, valid) = sst_dataset.split(split_ratio=[0.85, 0.15])

In [19]:
Sentence.build_vocab(train)
Sentiment.build_vocab(train)

In [20]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Sentiment.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Sentiment.vocab.stoi)

Size of input vocab :  20824
Size of label vocab :  5
Top 10 words appreared repeatedly : [('the', 65188), (',', 60177), ('a', 46544), ('of', 44506), ('and', 44192), ('.', 32372), ('to', 31654), ('-', 30786), ("'s", 23948), ('is', 19519)]
Labels :  defaultdict(<function _default_unk_index at 0x7f3b87fccd90>, {2: 0, 3: 1, 1: 2, 4: 3, 0: 4})


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
train_iterator, test_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [23]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Sentence.vocab.stoi, tokens)

In [24]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [25]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 300
num_hidden_nodes = 200
num_output_nodes = 5
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [26]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(20824, 300)
  (encoder): LSTM(300, 200, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=200, out_features=5, bias=True)
)
The model has 6,971,405 trainable parameters


In [27]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [28]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.sentence   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.sentiment)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.sentiment)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [29]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.sentiment)
            acc = binary_accuracy(predictions, batch.sentiment)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.372 | Train Acc: 53.37%
	 Val. Loss: 1.328 |  Val. Acc: 57.03% 

	Train Loss: 1.305 | Train Acc: 59.54%
	 Val. Loss: 1.299 |  Val. Acc: 60.19% 

	Train Loss: 1.273 | Train Acc: 62.92%
	 Val. Loss: 1.288 |  Val. Acc: 61.38% 

	Train Loss: 1.252 | Train Acc: 65.08%
	 Val. Loss: 1.279 |  Val. Acc: 62.31% 

	Train Loss: 1.236 | Train Acc: 66.82%
	 Val. Loss: 1.274 |  Val. Acc: 62.73% 

	Train Loss: 1.223 | Train Acc: 68.12%
	 Val. Loss: 1.272 |  Val. Acc: 62.94% 

	Train Loss: 1.213 | Train Acc: 69.22%
	 Val. Loss: 1.268 |  Val. Acc: 63.43% 

	Train Loss: 1.204 | Train Acc: 70.09%
	 Val. Loss: 1.269 |  Val. Acc: 63.32% 

	Train Loss: 1.197 | Train Acc: 70.87%
	 Val. Loss: 1.268 |  Val. Acc: 63.43% 

	Train Loss: 1.190 | Train Acc: 71.48%
	 Val. Loss: 1.268 |  Val. Acc: 63.38% 

