In [1]:
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


In [2]:

df_train = pd.read_csv('fine-grained-sentiment/data/sst/sst_train.txt', header=None,names=['sentiment', 'review'],sep='\t')
df_train.head()

Unnamed: 0,sentiment,review
0,__label__4,The Rock is destined to be the 21st Century 's...
1,__label__5,The gorgeously elaborate continuation of `` Th...
2,__label__4,Singer/composer Bryan Adams contributes a slew...
3,__label__3,You 'd think by now America would have had eno...
4,__label__4,Yet the act is still charming here .


In [3]:
df_train['sentiment'] = df_train['sentiment'].str.replace('__label__', '')
df_train['sentiment'] = df_train['sentiment'].astype(int)

df_test = pd.read_csv('fine-grained-sentiment/data/sst/sst_test.txt', sep='\t', header=None,
                   names=['sentiment', 'review'],
                  )
df_test['sentiment'] = df_test['sentiment'].str.replace('__label__', '')
df_test['sentiment'] = df_test['sentiment'].astype(int)


df_dev = pd.read_csv('fine-grained-sentiment/data/sst/sst_dev.txt', sep='\t', header=None,
                   names=['sentiment', 'review'],
                  )
df_dev['sentiment'] = df_dev['sentiment'].str.replace('__label__', '')
df_dev['sentiment'] = df_dev['sentiment'].astype(int)

In [4]:
df_train.shape,df_test.shape,df_dev.shape

((8544, 2), (2210, 2), (1101, 2))

In [5]:
df_train.sentiment.value_counts()

4    2322
2    2218
3    1624
5    1288
1    1092
Name: sentiment, dtype: int64

### Apply augmentation on training data

In [6]:
import random
# random.randint(2,100)
instances=[random.randint(1,8000) for i in range(4000) ]
len(set(instances))

3139

In [7]:
def random_deletion(words, p=0.3):
    words_list=words.copy()
    if len(words_list) ==1: # return if single word
        return words_list
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words_list))
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words_list)] 
    else:
        return remaining

In [8]:
def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [9]:
df_train_aug = pd.DataFrame(columns=['sentiment','review'])
tknzr = TweetTokenizer()

In [10]:
nltk_stopwords = stopwords.words('english')
nltk_stopwords = nltk_stopwords + ['-','.',',',"'s",'--',"...","","'","`","(",")"]

In [11]:
instances=list(set(instances))

In [12]:
for i in instances:
    temp=df_train.loc[i]
    if random.randint(1,2)==1:
        x_list =[x for x in random_deletion(tknzr.tokenize(temp['review'])) if x not in nltk_stopwords]
        temp['review'] = temp['review']=(' ').join(x_list)
    else:
        x_list =[x for x in random_swap(tknzr.tokenize(temp['review'])) if x not in nltk_stopwords]
        temp['review'] = temp['review']=(' ').join(x_list)
    df_train_aug = df_train_aug.append(temp)
# x_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [13]:
df_train_aug.shape

(3139, 2)

In [14]:
df_train = df_train.append(df_train_aug)
df_train.shape

(11683, 2)

In [15]:
import random
import torch, torchtext
from torchtext import data

In [16]:
SEED=43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f5d30c99438>

### TODO:
- https://github.com/dsfsi/textaugment

### Data Field helps to do common data/text processing and converts to tensor:

- Define the tokenizer, mention if the data is sequential in nature
- batch_first : If set to true, will have the batch dimension first such as [1,1,28,28] First 1 is the batch size. Rest is size of the image
- is_target: if this field is target variable
- Stop_words can also be provided here
- unknown and padding token can be explicity defined

In [17]:
import re  
def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)
        cleaned_text.append(text)
    return cleaned_text

In [18]:
Review = data.Field(sequential = True, tokenize='spacy', batch_first=True,lower=True,
                   include_lengths=True)
Label = data.LabelField(sequential = True,tokenize='spacy', is_target=True,
                        batch_first=True)



#### Create a tuple of text and label

In [19]:
fields = [('review',Review),('label',Label)]

In [20]:
df_train.reset_index(drop=True,inplace=True)
df_dev.reset_index(drop=True,inplace=True)
df_test.reset_index(drop=True,inplace=True)

In [21]:
df_train['len'] = df_train.review.apply(lambda x: len(x))
df_train.loc[df_train.len==0].shape

(4, 3)

In [22]:
df_train = df_train.loc[df_train.len!=0]
df_train.reset_index(drop=True,inplace=True)

#### Create a example object from list of reviews and sentiments from field tuple
To access individual elements:

example[0].review,example[0].label

In [23]:
train_example = [data.Example.fromlist([df_train.review[i],df_train.sentiment[i]], fields) for i in range(df_train.shape[0])]
test_example = [data.Example.fromlist([df_test.review[i],df_test.sentiment[i]], fields) for i in range(df_test.shape[0])]
valid_example = [data.Example.fromlist([df_dev.review[i],df_dev.sentiment[i]], fields) for i in range(df_dev.shape[0])]



In [24]:
train_example[0].review,train_example[0].label

(['the',
  'rock',
  'is',
  'destined',
  'to',
  'be',
  'the',
  '21st',
  'century',
  "'s",
  'new',
  '``',
  'conan',
  "''",
  'and',
  'that',
  'he',
  "'s",
  'going',
  'to',
  'make',
  'a',
  'splash',
  'even',
  'greater',
  'than',
  'arnold',
  'schwarzenegger',
  ',',
  'jean',
  '-',
  'claud',
  'van',
  'damme',
  'or',
  'steven',
  'segal',
  '.'],
 4)

In [25]:
train_Dataset = data.Dataset(train_example, fields)
test_Dataset = data.Dataset(test_example, fields)
valid_Dataset = data.Dataset(valid_example,fields)

In [26]:
(len(train_Dataset), len(test_Dataset), len(valid_Dataset))

(11679, 2210, 1101)

In [27]:
vars(test_Dataset.examples[10])

{'review': ['what',
  'really',
  'surprises',
  'about',
  'wisegirls',
  'is',
  'its',
  'low',
  '-',
  'key',
  'quality',
  'and',
  'genuine',
  'tenderness',
  '.'],
 'label': 4}

### Creating a vocabulary

In [28]:
Review.build_vocab(train_Dataset, max_size=10000)
Label.build_vocab(train_Dataset)

In [29]:
print('Size of input vocab : ', len(Review.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Review.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  10002
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 8055), ('the', 7793), (',', 7131), ('a', 5579), ('and', 4544), ('of', 4482), ('-', 3577), ('to', 3063), ('it', 2634), ('is', 2574)]
Labels :  defaultdict(None, {4: 0, 2: 1, 3: 2, 5: 3, 1: 4})


In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
train_iterator, valid_iterator = data.BucketIterator.splits((train_Dataset, test_Dataset), batch_size = 32, 
                                                            sort_key = lambda x: len(x.review),
                                                            sort_within_batch=True, device = device)



In [32]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Review.vocab.stoi, tokens)

In [33]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [34]:
# Define hyperparameters
size_of_vocab = len(Review.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [35]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(10002, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 3,242,705 trainable parameters


In [47]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [48]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        review, review_lengths = batch.review   
        
        # convert to 1D tensor
        predictions = model(review, review_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [49]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            review, review_lengths = batch.review
            
            # convert to 1d tensor
            predictions = model(review, review_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [50]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.301 | Train Acc: 59.88%
	 Val. Loss: 1.587 |  Val. Acc: 29.96% 

	Train Loss: 1.279 | Train Acc: 62.30%
	 Val. Loss: 1.588 |  Val. Acc: 30.00% 

	Train Loss: 1.264 | Train Acc: 63.75%
	 Val. Loss: 1.589 |  Val. Acc: 29.73% 

	Train Loss: 1.254 | Train Acc: 64.89%
	 Val. Loss: 1.589 |  Val. Acc: 29.73% 

	Train Loss: 1.247 | Train Acc: 65.65%
	 Val. Loss: 1.590 |  Val. Acc: 29.69% 

	Train Loss: 1.241 | Train Acc: 66.17%
	 Val. Loss: 1.591 |  Val. Acc: 29.64% 

	Train Loss: 1.236 | Train Acc: 66.73%
	 Val. Loss: 1.591 |  Val. Acc: 29.73% 

	Train Loss: 1.232 | Train Acc: 67.09%
	 Val. Loss: 1.591 |  Val. Acc: 29.91% 

	Train Loss: 1.228 | Train Acc: 67.50%
	 Val. Loss: 1.591 |  Val. Acc: 29.91% 

	Train Loss: 1.224 | Train Acc: 67.90%
	 Val. Loss: 1.592 |  Val. Acc: 29.60% 



In [73]:
nlp = spacy.load("en_core_web_sm")
def predict(model, sentence):
    tokenized = tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [Review.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return torch.argmax(prediction)

In [74]:
predict(model,df_test.loc[3]['review'])

tensor(2, device='cuda:0')

In [82]:
for i in range(25):
    print('Review: '+df_test.loc[i]['review'])
    print('Sentiment: '+str(df_test.loc[i]['sentiment']))
    print('Model Predictions:'+str(predict(model,df_test.loc[i]['review'])))
    print('-'*20)

Review: Effective but too-tepid biopic
Sentiment: 3
Model Predictions:tensor(1, device='cuda:0')
--------------------
Review: If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .
Sentiment: 4
Model Predictions:tensor(0, device='cuda:0')
--------------------
Review: Emerges as something rare , an issue movie that 's so honest and keenly observed that it does n't feel like one .
Sentiment: 5
Model Predictions:tensor(1, device='cuda:0')
--------------------
Review: The film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .
Sentiment: 3
Model Predictions:tensor(2, device='cuda:0')
--------------------
Review: Offers that rare combination of entertainment and education .
Sentiment: 5
Model Predictions:tensor(0, device='cuda:0')
--------------------
Review: Perhaps no picture ever made has more literally showed that the road to hell is paved with good intentions .
Sentiment: 4