<a href="https://colab.research.google.com/github/asanoop24/dl-nlp/blob/master/kaggle_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
import pandas

In [0]:
!unzip /content/kaggle_sentiment_analysis/train.tsv.zip

Archive:  /content/kaggle_sentiment_analysis/train.tsv.zip
  inflating: train.tsv               


In [0]:
!unzip /content/kaggle_sentiment_analysis/test.tsv.zip

Archive:  /content/kaggle_sentiment_analysis/test.tsv.zip
  inflating: test.tsv                


In [0]:
train_df = pandas.read_table('/content/kaggle_sentiment_analysis/train.tsv')
test_df = pandas.read_table('/content/kaggle_sentiment_analysis/test.tsv')

In [0]:
train_df[:30]

In [0]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
all_text = ' '.join([r['Phrase'] for i,r in train_df.iterrows()])
all_text = ''.join([c for c in all_text if c not in string.punctuation])

In [0]:
from collections import Counter
words = [w.lower() for w in all_text.split()]
count_words = Counter(words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_words = {w:i for i,(w,c) in enumerate(sorted_words)}

In [0]:
vocab_targets = {s:i for i,s in enumerate(sorted(train_df['Sentiment'].value_counts().index))}
vocab_targets

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4}

In [0]:
vocab_words['and']

3

In [0]:
def sequence_to_tensor(sequence, vocab, dtype=torch.long):
    idxs = [vocab[word] if word in vocab else len(vocab)+1 for word in sequence]
    return torch.tensor(idxs, dtype=dtype)

In [0]:

#.keys()

True

In [0]:
train_seq = [sequence_to_tensor(''.join([c.lower() for c in sentence if c not in string.punctuation]).split(), vocab_words) for sentence in train_df['Phrase'].tolist()]
test_seq = [sequence_to_tensor(''.join([c.lower() for c in sentence if c not in string.punctuation]).split(), vocab_words) for sentence in test_df['Phrase'].tolist()]

In [0]:
import numpy as np
def pad_features(inputs, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(inputs), 48), dtype = int)
    
    for i, review in enumerate(inputs):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

In [0]:
train_seq_padded = pad_features([i.tolist() for i in train_seq], 48)

In [0]:
train_labels = [l for l in train_df['Sentiment'].tolist()]

In [0]:
class SentimentLSTM(nn.Module):
  def __init__(self, embedding_dim, hidden, vocab_dim, target_dim, num_layers, dropout):
    super(SentimentLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_dim, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_dim, target_dim)
    self.softmax = nn.LogSoftmax(dim=1)

    self.n_layers = num_layers
    self.hidden_dim = hidden_dim

  def forward(self, sequence, hidden):
    embedding = self.embedding(sequence)
    out, hidden = self.lstm(embedding, hidden)
    out = self.dropout(out)
    out = self.fc(out)
    out = self.softmax(out)
    return out, hidden

  def init_hidden(self, batch_size):
      ''' Initializes hidden state '''
      # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
      # initialized to zero, for hidden state and cell state of LSTM
      weight = next(self.parameters()).data
      
      if (train_on_gpu):
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
      else:
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                    weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
      
      return hidden

In [0]:
split_frac = 0.8
len_feat = len(train_seq_padded)
train_x = np.array(train_seq_padded[0:int(split_frac*len_feat)])
train_y = np.array(train_labels[0:int(split_frac*len_feat)])
remaining_x = np.array(train_seq_padded[int(split_frac*len_feat):])
remaining_y = np.array(train_labels[int(split_frac*len_feat):])
valid_x = np.array(remaining_x[0:int(len(remaining_x)*0.5)])
valid_y = np.array(remaining_y[0:int(len(remaining_y)*0.5)])
test_x = np.array(remaining_x[int(len(remaining_x)*0.5):])
test_y = np.array(remaining_y[int(len(remaining_y)*0.5):])

In [0]:
import torch
from torch.utils.data import DataLoader, TensorDataset
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# dataloaders
batch_size = 50
# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [67]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 48])
Sample input: 
 tensor([[    0,     0,     0,  ...,     3,  8668,  3443],
        [    0,     0,     0,  ...,     0,     0,  9618],
        [    0,     0,     0,  ...,     2,  2419,  6251],
        ...,
        [    0,     0,     0,  ...,     4,   237,   340],
        [    0,     0,     0,  ...,     0,     0, 15414],
        [    0,     0,     0,  ...,     0,     0,   841]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 1, 4, 1, 1, 2, 3, 1,
        3, 2, 1, 1, 2, 1, 2, 3, 0, 2, 2, 1, 2, 4, 4, 3, 0, 2, 2, 2, 3, 2, 4, 2,
        2, 2])


In [0]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [97]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_words)+1 # +1 for the 0 padding
output_size = 5
embedding_dim = 400
hidden_dim = 256
n_layers = 2
dropout = 0.3
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=dropout)
print(net)

SentimentLSTM(
  (embedding): Embedding(16404, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=5, bias=True)
  (sig): Sigmoid()
)


In [93]:
train_on_gpu = 0

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    print('GPU')
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        print(counter, end='\r')
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                inputs = inputs.type(torch.LongTensor)
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

RuntimeError: ignored