<a href="https://colab.research.google.com/github/asanoop24/dl-nlp/blob/master/kaggle_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
import pandas

In [0]:
!unzip /content/kaggle_sentiment_analysis/train.tsv.zip

Archive:  /content/kaggle_sentiment_analysis/train.tsv.zip
  inflating: train.tsv               


In [0]:
!unzip /content/kaggle_sentiment_analysis/test.tsv.zip

Archive:  /content/kaggle_sentiment_analysis/test.tsv.zip
  inflating: test.tsv                


In [0]:
train_df = pandas.read_table('/content/kaggle_sentiment_analysis/train.tsv')
test_df = pandas.read_table('/content/kaggle_sentiment_analysis/test.tsv')

In [0]:
train_df[:30]

In [0]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
all_text = ' '.join([r['Phrase'] for i,r in train_df.iterrows()])
all_text = ''.join([c for c in all_text if c not in string.punctuation])

In [0]:
from collections import Counter
words = [w.lower() for w in all_text.split()]
count_words = Counter(words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_words = {w:i for i,(w,c) in enumerate(sorted_words)}

In [0]:
vocab_targets = {s:i for i,s in enumerate(sorted(train_df['Sentiment'].value_counts().index))}
vocab_targets

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4}

In [0]:
vocab_words['and']

3

In [0]:
def sequence_to_tensor(sequence, vocab, dtype=torch.long):
    idxs = [vocab[word] if word in vocab else len(vocab)+1 for word in sequence]
    return torch.tensor(idxs, dtype=dtype)

In [0]:

#.keys()

True

In [0]:
train_seq = [sequence_to_tensor(''.join([c.lower() for c in sentence if c not in string.punctuation]).split(), vocab_words) for sentence in train_df['Phrase'].tolist()]
test_seq = [sequence_to_tensor(''.join([c.lower() for c in sentence if c not in string.punctuation]).split(), vocab_words) for sentence in test_df['Phrase'].tolist()]

In [0]:
import numpy as np
def pad_features(inputs, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(inputs), 48), dtype = int)
    
    for i, review in enumerate(inputs):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

In [0]:
train_seq_padded = pad_features([i.tolist() for i in train_seq], 48)

In [0]:
train_seq_padded[:4]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     1,   303,     2, 15109,  5905,
            0,  6498,     8,    50,     7,    48,    12,     0,  3513,
            7,   166,    48,    12,     0, 11380,    61,     2,    74,
          614, 10452,    18,   575,     2,    74,  2002,     4,    53,
            2,     1,    39],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     1,   303,
            2, 15109,  5905,     0,  6498,     8,    50,     7,    48,
           12,     0,  3513],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   

In [0]:
class SentimentLSTM(nn.Module):
  def __init__(embedding_dim, hidden, vocab_dim, target_dim, num_layers, dropout):
    super(SentimentLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_dim, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_dim, target_dim)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, sequence):
    embedding = self.embedding(sequence)
    out, hidden = self.lstm(embedding, hidden)
    out = self.dropout(out)
    out = self.fc(out)
    out = self.softmax(out)
    return out, hidden

In [0]:
torch.device('cuda')

device(type='cuda')