<a href="https://colab.research.google.com/github/ariG23498/GrokkingDeepLearning/blob/master/%239_SentimentIMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [67]:
import pandas as pd
import nltk
import string
from sklearn.utils import shuffle
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [69]:
df = pd.read_csv('/content/drive/My Drive/Datasets/imdb_master.csv', encoding='latin-1')
df = df.drop(columns=['Unnamed: 0'])
df = shuffle(df)
df.head()

Unnamed: 0,type,review,label,file
69853,train,I haven't seen something like this since the H...,unsup,2786_0.txt
68471,train,"There are lots of these films. Generally, thes...",unsup,26625_0.txt
95719,train,I stumbled upon this movie on TCM already in p...,unsup,6148_0.txt
27453,train,"As an Army veteran, I was deeply offended by t...",neg,12208_2.txt
88826,train,"But you may easily be disappointed, if you see...",unsup,44945_0.txt


In [0]:
df.drop(df[df['label'] == 'unsup'].index, inplace = True)

In [71]:
df['label'].unique()

array(['neg', 'pos'], dtype=object)

In [72]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

df['label'].unique()

array([0, 1])

In [0]:
train = df[df['type'] == 'train'].drop(columns=['type','file']).values
test = df[df['type'] == 'test'].drop(columns=['type','file']).values

In [74]:
print('Length of Train: {}'.format(len(train)))
print('Length of Test: {}'.format(len(test)))

Length of Train: 25000
Length of Test: 25000


# Train

In [0]:
def preprocess(sentence):
  # Tokenize Contents
  contentsTokenized = nltk.tokenize.word_tokenize(sentence)

  # Remove the stop_words
  stop_word_set = set(nltk.corpus.stopwords.words("english"))
  filteredContents_afterstop = []
  for word in contentsTokenized:
    if word not in stop_word_set:
      filteredContents_afterstop.append(word)
  
  # Performing porterStemming
  porterStemmer = nltk.stem.PorterStemmer()
  filteredContents = [porterStemmer.stem(word) for word in filteredContents_afterstop]

  # Remove Punctuations
  excludePunctuation = set(string.punctuation)
    
  # manually add additional punctuation to remove
  doubleSingleQuote = '\'\''
  doubleDash = '--'
  doubleTick = '``'

  excludePunctuation.add(doubleSingleQuote)
  excludePunctuation.add(doubleDash)
  excludePunctuation.add(doubleTick)

  filteredContents_afterpunc = []
  for word in filteredContents_afterstop:
    if word not in excludePunctuation:
      filteredContents_afterpunc.append(word)

  # Convert all to lower case
  filteredContents_lower = [term.lower() for term in filteredContents_afterpunc]
  return filteredContents_lower

In [0]:
# token is a list of set
# each set consists of words in the a particular review

token = list(map(lambda x: set(preprocess(x)),train[:,0]))

In [77]:
print(len(token))

25000


In [78]:
# Total vocabulary
vocab = set()
for sentence in token:
    for word in sentence:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

print('Total vocabulary count: {}'.format(len(vocab)))

Total vocabulary count: 114763


In [79]:
# For each word in vocab there is an index
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
print('Total word2index count: {}'.format(len(word2index)))

Total word2index count: 114763


In [0]:
import numpy as np

In [0]:
# input_dataset is used to keep the values of each words in each sentence
input_dataset = list()
for sentence in token:
    sent_indices = list()
    for word in sentence:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(sent_indices))

In [0]:
X_train = input_dataset
y_train = train[:,1]

In [83]:
print('Type of X_train: {}'.format(type(X_train)))
print('Len of X_train: {}'.format(len(X_train)))
print()
print('Type of y_train: {}'.format(type(y_train)))
print('Type of y_train: {}'.format(len(y_train)))

Type of X_train: <class 'list'>
Len of X_train: 25000

Type of y_train: <class 'numpy.ndarray'>
Type of y_train: 25000


In [0]:
np.random.seed(1)

In [0]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

In [0]:
alpha,iteration = (0.01, 2)
hidden_size = 1000

weight_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weight_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

In [0]:
correct = 0
total = 0

In [88]:
for iter in range(iteration):
  for i in range(len(X_train)):
    x,y = X_train[i], y_train[i]
    layer_1 = sigmoid(np.sum(weight_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weight_1_2))

    layer_2_delta = layer_2 - y
    layer_1_delta = layer_2_delta.dot(weight_1_2.T)

    weight_0_1[x] -= layer_1_delta * alpha
    weight_1_2 -= np.outer(layer_1,layer_2_delta) * alpha

    if(np.abs(layer_2_delta) < 0.5):
      correct += 1
    total += 1

    if(total % 1000 == 0):
      print('Progress: {} Training Accuracy: {}'.format(total, correct/total))


Progress: 1000 Training Accuracy: 0.61
Progress: 2000 Training Accuracy: 0.66
Progress: 3000 Training Accuracy: 0.6913333333333334
Progress: 4000 Training Accuracy: 0.7165
Progress: 5000 Training Accuracy: 0.7318
Progress: 6000 Training Accuracy: 0.7436666666666667
Progress: 7000 Training Accuracy: 0.7558571428571429
Progress: 8000 Training Accuracy: 0.762375
Progress: 9000 Training Accuracy: 0.7706666666666667
Progress: 10000 Training Accuracy: 0.7772
Progress: 11000 Training Accuracy: 0.7839090909090909
Progress: 12000 Training Accuracy: 0.7886666666666666
Progress: 13000 Training Accuracy: 0.7943846153846154
Progress: 14000 Training Accuracy: 0.7989285714285714
Progress: 15000 Training Accuracy: 0.8034
Progress: 16000 Training Accuracy: 0.805875
Progress: 17000 Training Accuracy: 0.8075882352941176
Progress: 18000 Training Accuracy: 0.8099444444444445
Progress: 19000 Training Accuracy: 0.8128421052631579
Progress: 20000 Training Accuracy: 0.8146
Progress: 21000 Training Accuracy: 0.

# Testing

In [0]:
token = list(map(lambda x: set(preprocess(x)),test[:,0]))

# input_dataset is used to keep the values of each words in each sentence
input_dataset_test = list()
for sentence in token:
    sent_indices = list()
    for word in sentence:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset_test.append(list(sent_indices))

X_test = input_dataset_test
y_test = test[:,1]

In [90]:
len(X_test)

25000

In [92]:
correct,total = 0,0
for i in range(len(X_test)):
    x,y = X_test[i], y_test[i]
    layer_1 = sigmoid(np.sum(weight_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weight_1_2))

    layer_2_delta = layer_2 - y
    layer_1_delta = layer_2_delta.dot(weight_1_2.T)

    if(np.abs(layer_2_delta) < 0.5):
      correct += 1
    total += 1

    if(total % 1000 == 0):
      print('Progress: {} Testing Accuracy: {}'.format(total, correct/total))

Progress: 1000 Testing Accuracy: 0.878
Progress: 2000 Testing Accuracy: 0.87
Progress: 3000 Testing Accuracy: 0.8723333333333333
Progress: 4000 Testing Accuracy: 0.86975
Progress: 5000 Testing Accuracy: 0.8698
Progress: 6000 Testing Accuracy: 0.8676666666666667
Progress: 7000 Testing Accuracy: 0.8671428571428571
Progress: 8000 Testing Accuracy: 0.8705
Progress: 9000 Testing Accuracy: 0.8698888888888889
Progress: 10000 Testing Accuracy: 0.8704
Progress: 11000 Testing Accuracy: 0.8704545454545455
Progress: 12000 Testing Accuracy: 0.8704166666666666
Progress: 13000 Testing Accuracy: 0.8704615384615385
Progress: 14000 Testing Accuracy: 0.87
Progress: 15000 Testing Accuracy: 0.869
Progress: 16000 Testing Accuracy: 0.8681875
Progress: 17000 Testing Accuracy: 0.868764705882353
Progress: 18000 Testing Accuracy: 0.8695555555555555
Progress: 19000 Testing Accuracy: 0.8687368421052631
Progress: 20000 Testing Accuracy: 0.86785
Progress: 21000 Testing Accuracy: 0.8674285714285714
Progress: 22000 Te