<a href="https://colab.research.google.com/github/ariG23498/GrokkingDeepLearning/blob/master/9_SentimentIMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [46]:
import pandas as pd
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
df = pd.read_csv('/content/drive/My Drive/Datasets/imdb_master.csv', encoding='latin-1')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [0]:
df.drop(df[df['label'] == 'unsup'].index, inplace = True)

In [30]:
df['label'].unique()

array(['neg', 'pos'], dtype=object)

In [31]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

df['label'].unique()

array([0, 1])

In [0]:
train = df[df['type'] == 'train'].drop(columns=['type','file']).values
test = df[df['type'] == 'test'].drop(columns=['type','file']).values

In [33]:
print('Length of Train: {}'.format(len(train)))
print('Length of Test: {}'.format(len(test)))

Length of Train: 25000
Length of Test: 25000


# Train

In [0]:
def preprocess(sentence):
  # Tokenize Contents
  contentsTokenized = nltk.tokenize.word_tokenize(sentence)

  # Remove the stop_words
  stop_word_set = set(nltk.corpus.stopwords.words("english"))
  filteredContents_afterstop = []
  for word in contentsTokenized:
    if word not in stop_word_set:
      filteredContents_afterstop.append(word)
  
  # Performing porterStemming
  porterStemmer = nltk.stem.PorterStemmer()
  filteredContents = [porterStemmer.stem(word) for word in filteredContents_afterstop]

  # Remove Punctuations
  excludePunctuation = set(string.punctuation)
    
  # manually add additional punctuation to remove
  doubleSingleQuote = '\'\''
  doubleDash = '--'
  doubleTick = '``'

  excludePunctuation.add(doubleSingleQuote)
  excludePunctuation.add(doubleDash)
  excludePunctuation.add(doubleTick)

  filteredContents_afterpunc = []
  for word in filteredContents_afterstop:
    if word not in excludePunctuation:
      filteredContents_afterpunc.append(word)

  # Convert all to lower case
  filteredContents_lower = [term.lower() for term in filteredContents_afterpunc]
  return filteredContents_lower

In [49]:
a = "Hey, I am Aritra Roy Gosthipaty."
b = preprocess(a)
print(b)

['hey', 'i', 'aritra', 'roy', 'gosthipaty']


In [0]:
# token is a list of set
# each set consists of words in the a particular review

token = list(map(lambda x: set(preprocess(x)),train[:,0]))

In [51]:
print(len(token))

25000


In [52]:
# Total vocabulary
vocab = set()
for sentence in token:
    for word in sentence:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

print('Total vocabulary count: {}'.format(len(vocab)))

Total vocabulary count: 114763


In [53]:
# For each word in vocab there is an index
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
print('Total word2index count: {}'.format(len(word2index)))

Total word2index count: 114763


In [0]:
import numpy as np

In [0]:
# input_dataset is used to keep the values of each words in each sentence
input_dataset = list()
for sentence in token:
    sent_indices = list()
    for word in sentence:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(sent_indices))

In [0]:
X_train = input_dataset
y_train = train[:,1]

In [57]:
print('Type of X_train: {}'.format(type(X_train)))
print('Len of X_train: {}'.format(len(X_train)))
print()
print('Type of y_train: {}'.format(type(y_train)))
print('Type of y_train: {}'.format(len(y_train)))

Type of X_train: <class 'list'>
Len of X_train: 25000

Type of y_train: <class 'numpy.ndarray'>
Type of y_train: 25000


In [0]:
np.random.seed(1)

In [0]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

In [0]:
alpha,iteration = (0.01, 2)
hidden_size = 1000

weight_0_1 = np.random.random((len(vocab),hidden_size))
weight_1_2 = np.random.random((hidden_size,1))

In [0]:
correct = 0
total = 0

In [62]:
for iter in range(5):
  for i in range(len(X_train)):
    x,y = X_train[i], y_train[i]
    layer_1 = sigmoid(np.sum(weight_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weight_1_2))

    layer_2_delta = layer_2 - y
    layer_1_delta = layer_2_delta.dot(weight_1_2.T)

    weight_0_1[x] -= layer_1_delta * alpha
    weight_1_2 -= np.outer(layer_1,layer_2_delta) * alpha

    if(np.abs(layer_2_delta) < 0.5):
      correct += 1
    total += 1

    if(total % 1000 == 0):
      print('Progress: {} Training Accuracy: {}'.format(total, correct/total))


Progress: 1000 Training Accuracy: 0.948
Progress: 2000 Training Accuracy: 0.974
Progress: 3000 Training Accuracy: 0.9826666666666667
Progress: 4000 Training Accuracy: 0.987
Progress: 5000 Training Accuracy: 0.9896
Progress: 6000 Training Accuracy: 0.9913333333333333
Progress: 7000 Training Accuracy: 0.9925714285714285
Progress: 8000 Training Accuracy: 0.9935
Progress: 9000 Training Accuracy: 0.9942222222222222
Progress: 10000 Training Accuracy: 0.9948
Progress: 11000 Training Accuracy: 0.9952727272727273
Progress: 12000 Training Accuracy: 0.9956666666666667
Progress: 13000 Training Accuracy: 0.9958461538461538
Progress: 14000 Training Accuracy: 0.9961428571428571
Progress: 15000 Training Accuracy: 0.9964
Progress: 16000 Training Accuracy: 0.996625
Progress: 17000 Training Accuracy: 0.9968235294117647
Progress: 18000 Training Accuracy: 0.997
Progress: 19000 Training Accuracy: 0.9971578947368421
Progress: 20000 Training Accuracy: 0.9973
Progress: 21000 Training Accuracy: 0.99742857142857

# Testing

In [0]:
token = list(map(lambda x: set(preprocess(x)),test[:,0]))

# input_dataset is used to keep the values of each words in each sentence
input_dataset_test = list()
for sentence in token:
    sent_indices = list()
    for word in sentence:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset_test.append(list(sent_indices))

X_test = input_dataset_test
y_test = test[:,1]

In [65]:
len(X_test)

25000

In [66]:
correct,total = 0,0
for i in range(len(X_test)):
    x,y = X_test[i], y_test[i]
    layer_1 = sigmoid(np.sum(weight_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weight_1_2))

    layer_2_delta = layer_2 - y
    layer_1_delta = layer_2_delta.dot(weight_1_2.T)

    if(np.abs(layer_2_delta) < 0.5):
      correct += 1
    total += 1

    if(total % 1000 == 0):
      print('Progress: {} Training Accuracy: {}'.format(total, correct/total))

Progress: 1000 Training Accuracy: 0.0
Progress: 2000 Training Accuracy: 0.0
Progress: 3000 Training Accuracy: 0.0
Progress: 4000 Training Accuracy: 0.0
Progress: 5000 Training Accuracy: 0.0
Progress: 6000 Training Accuracy: 0.0
Progress: 7000 Training Accuracy: 0.0
Progress: 8000 Training Accuracy: 0.0
Progress: 9000 Training Accuracy: 0.0
Progress: 10000 Training Accuracy: 0.0
Progress: 11000 Training Accuracy: 0.0
Progress: 12000 Training Accuracy: 0.0
Progress: 13000 Training Accuracy: 0.038461538461538464
Progress: 14000 Training Accuracy: 0.10714285714285714
Progress: 15000 Training Accuracy: 0.16666666666666666
Progress: 16000 Training Accuracy: 0.21875
Progress: 17000 Training Accuracy: 0.2647058823529412
Progress: 18000 Training Accuracy: 0.3055555555555556
Progress: 19000 Training Accuracy: 0.34210526315789475
Progress: 20000 Training Accuracy: 0.375
Progress: 21000 Training Accuracy: 0.40476190476190477
Progress: 22000 Training Accuracy: 0.4318181818181818
Progress: 23000 Tra