<a href="https://colab.research.google.com/github/radonys/Deep-Learning-Assignments/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install torch torchvision
!pip install pandas nltk

In [0]:
import pandas as pd
import nltk
# NOTE(review): 'all' fetches every NLTK data package. The cells visible here
# only read the 'stopwords' corpus -- confirm nothing later in the notebook
# needs more before narrowing this to nltk.download('stopwords').
nltk.download('all')
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
from torch.utils.data import DataLoader, TensorDataset

In [0]:
# Punctuation that separates tokens; clean_text replaces each match with a space.
# Written as a raw string: '\[', '\]' and '\|' are regex escapes, and in a
# normal string literal they are invalid Python escape sequences that trigger
# a DeprecationWarning/SyntaxWarning. The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, a space, '#',
# '+' or '_'; clean_text deletes every match.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# NLTK's English stop-word list, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def string_form(value):
    """Return ``value`` converted with ``str()`` and lower-cased."""
    as_text = str(value)
    return as_text.lower()

def clean_text(text):
    """Strip markup, punctuation and stop words from ``text``.

    Steps, in order:
      1. parse ``text`` with BeautifulSoup (lxml parser) and keep only its text;
      2. replace every REPLACE_BY_SPACE_RE match with a single space;
      3. delete every BAD_SYMBOLS_RE match;
      4. split on whitespace, drop tokens found in STOPWORDS, and re-join the
         remaining tokens with single spaces.

    Returns the cleaned string (empty if no token survives).
    """
    plain = BeautifulSoup(text, "lxml").text
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    filtered = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

def pad_features(reviews_int, seq_length=200):
    """Left-pad or truncate every encoded review to exactly ``seq_length`` ids.

    Parameters
    ----------
    reviews_int : sequence of sequences of int
        One sequence of token ids per review. Any sliceable sequence works
        (list, tuple, range, 1-D numpy array), not only ``list``.
    seq_length : int, default 200
        Number of columns in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(reviews_int), seq_length)``. A review
        shorter than ``seq_length`` is right-aligned behind leading zeros; a
        longer review keeps only its first ``seq_length`` ids.
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)

    for row, review in enumerate(reviews_int):
        # Slicing handles both cases: short reviews come back whole, long
        # ones are cut to the first seq_length ids.
        kept = np.asarray(review[:seq_length], dtype=int)
        if kept.size:
            # Write straight into the tail of the zero-initialised row; the
            # untouched leading cells are the padding. No float intermediate.
            features[row, seq_length - kept.size:] = kept

    return features

In [0]:
# Read the tab-separated phrase file, normalise the 'Phrase' column
# (stringify + lower-case, then clean_text), and drop the two id columns.
data = pd.read_table('mrdata.tsv')
data['Phrase'] = data['Phrase'].apply(string_form).apply(clean_text)
data = data.drop(columns=['PhraseId', 'SentenceId'])
# Preview the first ten cleaned rows.
data.head(10)

In [0]:
# Flatten all cleaned phrases into one whitespace-separated token stream.
all_text = ' '.join(data['Phrase'])
words = all_text.split()
# Frequency of every distinct token, and the total number of tokens.
count_words = Counter(words)
total_words = len(words)
# most_common() with no argument returns every (word, count) pair sorted by
# descending count; passing the token total as a limit was unnecessary.
sorted_words = count_words.most_common()
# Show only the head of the ranking -- printing the whole Counter floods the
# cell output with the entire vocabulary.
print(sorted_words[:20])

In [0]:
# Map each word to its frequency rank. Ranks start at 1, which leaves id 0
# unused by the vocabulary (pad_features fills padding cells with 0).
vocab_to_int = {word: rank for rank, (word, _count) in enumerate(sorted_words, start=1)}
# Encode every phrase as the list of ids of its whitespace-separated tokens.
reviews_int = [
    [vocab_to_int[token] for token in phrase.split()]
    for phrase in data['Phrase']
]
print(reviews_int[0:3])

In [0]:
# Convert the 'Sentiment' column to Python ints and pack them into a numpy array.
encoded_labels = np.array(list(map(int, data['Sentiment'])))

In [0]:
# Token count of every encoded review (computed before filtering).
reviews_len = [len(x) for x in reviews_int]
length_stats = pd.Series(reviews_len)
length_stats.hist()
plt.show()
# describe() is not the last expression of the cell, so it must be printed
# explicitly or its result is silently discarded.
print(length_stats.describe())

# Drop reviews that became empty after cleaning, together with their labels.
non_empty = np.array(reviews_len) > 0
reviews_int = [review for review, keep in zip(reviews_int, non_empty) if keep]
# Boolean-mask the labels so they stay a numpy array: torch.from_numpy()
# in the dataset cell rejects a plain Python list.
encoded_labels = np.asarray(encoded_labels)[non_empty]
features = pad_features(reviews_int)
print(features[:10,:])

In [0]:
# Fraction of the samples that goes to the training set; the rest is test.
split_frac = 0.8
# len_feat was referenced but never defined, so this cell raised NameError on
# a fresh kernel. It is the number of padded samples.
len_feat = len(features)
split_idx = int(split_frac * len_feat)

# Ensure the labels are an ndarray so the slices below are ndarrays as well
# (torch.from_numpy() does not accept lists). No-op if already an array.
labels_arr = np.asarray(encoded_labels)

# Sequential split: first split_idx rows for training, remainder for testing.
train_x = features[:split_idx]
train_y = labels_arr[:split_idx]
test_x = features[split_idx:]
test_y = labels_arr[split_idx:]

In [0]:
# Number of samples per mini-batch for both loaders.
batch_size = 50

# Training split: wrap the numpy arrays as tensors (from_numpy shares memory
# with the source array) and serve them in shuffled mini-batches.
train_inputs, train_targets = torch.from_numpy(train_x), torch.from_numpy(train_y)
train_data = TensorDataset(train_inputs, train_targets)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

# Test split: built the same way, and also shuffled.
test_inputs, test_targets = torch.from_numpy(test_x), torch.from_numpy(test_y)
test_data = TensorDataset(test_inputs, test_targets)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)