# Sentiment Analysis

## Architecture of the model

    Dataset:
     train - 3235,6
     test  - 
     
    Input: (count of words in a tweet, batch_size, seq_length of word)
    hidden: (256)
    output: (batch_size, 1)

In [1]:
#Importing key libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
import os

In [3]:
path = '/data'
file_name = 'train.csv'
file_path = os.path.join(path, file_name)

In [4]:
dataset = pd.read_csv(file_path)
print(dataset.shape)

(3235, 6)


In [5]:
dataset.sample(4)

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
926,1.245019e+18,Happy mother's Day everyone :) What are you gu...,en,0,Fred_Bot_,1
1488,1.246527e+18,Good morning and Happy day 1 lockdown! My apol...,en,0,huntsvillepage,0
1934,1.245011e+18,Our beautiful swing gals doin our thang for mo...,en,1,8thlinecreation,-1
1142,1.245415e+18,We would like to say Happy Mothers Day to all!...,en,0,LiveGoodProduct,0


In [6]:
tweets = dataset['original_text']
sentiments = dataset['sentiment_class']

### preprocessing tweets

In [7]:
all_tweets = ''.join(tweets)
print(all_tweets[:200])

Happy #MothersDay to all you amazing mothers out there! I know it's hard not being able to see your mothers today but it's on all of us to do what we can to protect the most vulnerable members of our 


In [8]:
tweets[:3]

0    Happy #MothersDay to all you amazing mothers o...
1    Happy Mothers Day Mum - I'm sorry I can't be t...
2    Happy mothers day To all This doing a mothers ...
Name: original_text, dtype: object

In [9]:
#removing the punctuations
punc = '!@#-)(_$;:/'
tweets_punc = []
for tweet in tweets:
    t = [c.lower() for c in tweet.split(' ') if c not in punc]
    tw = ' '.join(t)
    tweets_punc.append(tw)
    
print(len(tweets_punc))

3235


In [10]:
print(len(all_tweets.split(' ')))

108215


In [11]:
all_tweets_punc = ' '.join(tweets_punc)
print(len(all_tweets_punc.split(' ')))

110968


In [12]:
#Counting total number of words
from collections import Counter
words = all_tweets_punc.split(' ')
total_words = len(words)

count_words = Counter(words)
sorted_words = count_words.most_common(total_words)

In [39]:
sorted_words

[('to', 3926),
 ('happy', 3413),
 ('day', 3293),
 ('the', 3015),
 ('and', 2579),
 ('mothers', 2424),
 ('you', 2259),
 ('all', 2044),
 ('a', 1854),
 ('#mothersday', 1593),
 ('my', 1322),
 ('…', 1230),
 ('of', 1192),
 ('i', 1154),
 ('for', 1149),
 ('in', 1046),
 ('we', 956),
 ('https://www.', 868),
 ('.', 861),
 ('love', 740),
 ('your', 677),
 ('are', 676),
 ('out', 670),
 ('is', 654),
 ('our', 634),
 ('this', 605),
 ("mother's", 601),
 ('mother’s', 593),
 ('mum', 590),
 ('with', 583),
 ('have', 574),
 ('be', 548),
 ('so', 486),
 ('mums', 480),
 ('her', 479),
 ('but', 454),
 ('from', 429),
 ('who', 426),
 ('that', 399),
 ('at', 388),
 ('me', 387),
 ('on', 386),
 ('&', 379),
 ('it', 343),
 ('today', 329),
 ('not', 324),
 ('there', 310),
 ('amazing', 307),
 ('us', 294),
 ('as', 292),
 ('hope', 287),
 ('mother', 284),
 ('thank', 276),
 ('she', 272),
 ('their', 264),
 ('https://', 262),
 ('#mothersday2020', 258),
 ('wonderful', 249),
 ('do', 241),
 ('those', 225),
 ('very', 216),
 ('one', 21

In [13]:
vocab_to_int = {w:i+1 for i,(w,c) in enumerate(sorted_words)}
vocab_to_int['others'] = len(sorted_words)+1
print(vocab_to_int, vocab_to_int['others'])



In [14]:
def tweets_int_func(tweets_punc):
    tweets_int = []
    for tweet in tweets_punc:
        
        r = [vocab_to_int[w] for w in tweet.split(' ') if w ]
        tweets_int.append(r)
        
    return np.array(tweets_int)

In [15]:
tweets_int = tweets_int_func(tweets_punc)
print(tweets_int.shape)

(3235,)


In [16]:
def pad_sequence(seq_length, tweets_int):
    features = np.zeros((len(tweets_int), seq_length), dtype = int)
    
    for i, tweet in enumerate(tweets_int):
        tweet_len = len(tweet)
        
        if tweet_len <= seq_length:
            zeros = list(np.zeros(seq_length - tweet_len))
            new = zeros + tweet
            
        elif tweet_len > seq_length:
            new = tweet[:seq_length]
        features[i,:] = np.array(new)
        
    return features

In [17]:
seq_length = 64
inp = pad_sequence(seq_length, tweets_int)
print(inp.shape)

(3235, 64)


In [18]:
print(type(sentiments.values), sentiments.values.shape)

<class 'numpy.ndarray'> (3235,)


In [19]:
def tweets2rep(total_words, embed_dim, feature):
    embed = nn.Embedding(total_words, embed_dim)
    tweet_rep = torch.zeros(feature.shape[1],feature.shape[0], embed_dim)
    tweet_ = embed(torch.tensor(feature))
    tweet_rep = tweet_.permute(1,0,2)
    return tweet_rep

In [20]:
tweet_rep = tweets2rep(total_words, 512, inp)
print(tweet_rep.shape)

torch.Size([64, 3235, 512])


In [43]:
def class2rep(sentiments):
    sentiment_labels = torch.tensor(sentiments, dtype = torch.float32)
    return torch.reshape(sentiment_labels, (sentiment_labels.shape[0],1))

In [44]:
sentiment_labels = class2rep(sentiments.values)
print(sentiment_labels.shape)

torch.Size([3235, 1])


In [45]:
print(sentiment_labels.dtype)

torch.float32


### Sentiment Analysis

In [23]:
class sentimentAnalysisRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(sentimentAnalysisRNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.tanh = nn.Tanh()
        
    def forward(self, inp, hidden):
        combined = torch.cat((inp, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.tanh(output)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [24]:
n_hidden = 256
embed_dim = 512
output_size = 1
rnn = sentimentAnalysisRNN(embed_dim, n_hidden, output_size)

In [25]:
input_tweet = tweets2rep(total_words, embed_dim, inp[0:1,:])
print(input_tweet.shape)

torch.Size([64, 1, 512])


In [26]:
print(tweets[0], '\n', sentiment_labels[0])

Happy #MothersDay to all you amazing mothers out there! I know it's hard not being able to see your mothers today but it's on all of us to do what we can to protect the most vulnerable members of our society. #BeatCoronaVirus pic.twitter.com/va4nFjFQ5B 
 tensor([0])


In [27]:
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input_tweet[0], hidden)

In [46]:
hidden = torch.zeros(1, n_hidden)
inp_tweet = tweet_rep[:, 1:2, :]
print(hidden.shape, inp_tweet.shape)

for word in inp_tweet:
    #print(word.shape)
    out, hidden = rnn(word, hidden)
    
print(output)

torch.Size([1, 256]) torch.Size([64, 1, 512])
tensor([[0.6717]], grad_fn=<TanhBackward>)


In [29]:
print(output)

tensor([[0.6717]], grad_fn=<TanhBackward>)


In [30]:
from torch.utils.data import DataLoader, TensorDataset
train_data = TensorDataset(tweet_rep, sentiment_labels)

AssertionError: 

In [33]:
import random

def randomChoice(n):
    return random.randint(0, n - 1)

def randomTrainingExample():
    training_size = len(tweet_rep)
    choice = randomChoice(training_size)
    sentiment_label = sentiment_labels[choice:choice+1,:]
    tweet = tweet_rep[:, choice:choice+1,:]
    return tweet, sentiment_label

In [47]:
x, y = randomTrainingExample()
print(x.shape, y.shape)

torch.Size([64, 1, 512]) torch.Size([1, 1])


### Training RNN

In [35]:
import torch.optim as optim
learning_rate = 0.01
criterion = nn.MSELoss()
optim = optim.Adam(rnn.parameters(), lr = learning_rate)

In [60]:
def train(net, tweet, sentiment):
    hidden = net.init_hidden()
    net.zero_grad()
    hidden = hidden.detach_()
    hidden = Variable(hidden.data, requires_grad = True)
    for word in tweet:
        output, hidden = net(word, hidden)
        
    loss = criterion(output, sentiment[0])
    loss.backward(retain_graph = True)
    
    for p in net.parameters():
        p.data.add_(p.grad.data, alpha = -learning_rate)
        
    return output, loss.item()

In [61]:
import time
import math

n_iters = 10
print_every = 2
plot_every = 1

current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s = s - m*60
    return '%dm %ds' %(m,s)


In [62]:
from torch.autograd import Variable
start = time.time()
for iter in range(1, n_iters + 1):
    tweet, sentiment = randomTrainingExample()
    output, loss = train(rnn, tweet, sentiment)
    current_loss = current_loss + loss
    
    if iter%print_every == 0:
        correct = 'x'
        if sentiment == output[0]:
            correct = ',/'
        print(sentiment, output, '-----',correct)
        print('time: ',timeSince(start))
    

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [127]:
class sentiment_analysis_lstm(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm_cell = nn.LSTM(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()
        
    def forward(self, batch_inp, hidden):
        out, hidden = self.lstm_cell(batch_inp, hidden)
        
        output = self.h2o(hidden[0].view(-1, self.hidden_size))
        output = self.tanh(output)
        return output, hidden
    
    def init_hidden(self, batch_size = 1):
        return (torch.zeros(batch_size, 1, self.hidden_size), torch.zeros(batch_size, 1,self.hidden_size))

In [128]:
n_hidden = 256
input_size = 512
net = sentiment_analysis_lstm(input_size, n_hidden, 1)

In [125]:
batch_size = 10
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size)

In [None]:
def batched_dataloader(npoints, tweet_rep, sentiment_label, verbose=False, device = 'cpu'):
    tweets = []
    sentiments = []
    tweet_lengths = []
    
    for i in range(npoints):
        index_ = np.random.randint(len(X_))
        tweet, sentiment = tweet_rep[index_,:,:], sentiment_label[index_,:]
        X_lengths.append(len(tweet))
    max_length = max(X_lengths)
    
    tweets_rep = tweet_rep[n_points,:,:].to(device)
    langs_rep = sentiment_label[n_points,:].to(device)
    
    padded_tweets_rep = torch.nn.utils.rnn.pack_padded_sequence(names_rep, X_lengths, enforce_sorted = False)
    
    if verbose:
        print(names_rep.shape, padded_names_rep.data.shape)
        print('--')
    
    if verbose:
        print(names)
        print_char(names_rep)
        print('--')
    
    if verbose:
        print_char(padded_names_rep.data)
        print('Lang Rep', langs_rep.data)
        print('Batch sizes', padded_names_rep.batch_sizes)
    
    
    return padded_names_rep.to(device), langs_rep

In [131]:
def train_batch(net, opt, criterion,batch_size, dataiter, device = 'cpu'):
    
    net.train().to(device)
    opt.zero_grad()
    
    batch_input, batch_groundtruth = dataiter.next()
    h0 = net.init_hidden(batch_size)
    print(h0[0].shape)
    output, hidden = net(batch_input.view(batch_size, 1,-1), h0)
    print(output.shape, hidden[0].shape)
    loss = criterion(output, batch_groundtruth)
    
    loss.backward()
    opt.step()
    return loss

In [132]:
def train_setup(net,lr = 0.01, n_batches = 100, batch_size = 10, momentum = 0.9, display_freq = 5, device = 'cpu'):
    net = net.to(device)
    criterion = nn.MSELoss()
    opt = optim.SGD(net.parameters(), lr = lr, momentum = momentum)
    
    loss_arr = np.zeros(n_batches + 1)
    
    for i in range(n_batches):
        dataiter = iter(train_loader)
        loss_arr[i+1] = (loss_arr[i]*i + train_batch(net, opt, criterion, batch_size, dataiter, device))/(i+1)
        
        if i%display_freq == display_freq - 1:
            clear_output(wait=True)
            
            print('Iteration: ', i, '\tLoss: ', loss_arr[i])
            plt.figure()
            plt.plot(loss_arr[1:i], '-*')
            plt.xlabel('Iteration')
            plt.ylabel('Loss')
            
            plt.show()
            print('\n\n')

In [133]:
train_setup(net, batch_size = 1)

torch.Size([1, 1, 256])


RuntimeError: input.size(-1) must be equal to input_size. Expected 512, got 327680

In [102]:
tweet_rep.shape

torch.Size([3235, 64, 512])

In [130]:
inputs = [torch.randn(1,3) for _ in range(5)]
for i in inputs

AttributeError: 'list' object has no attribute 'shape'