# Assignment 3 on Natural Language Processing

## Date : 30th Sept, 2020

### Instructor : Prof. Sudeshna Sarkar

### Teaching Assistants : Alapan Kuila, Aniruddha Roy, Anusha Potnuru, Uppada Vishnu

The central idea of this assignment is to build a Naive Bayes classifier and an LSTM-based classifier, and to compare the two models by accuracy on the IMDB dataset.



Please submit with outputs. 

In [None]:
import re
import pandas as pd
import numpy as np
import keras
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Load the IMDB reviews CSV into a DataFrame; later cells read its "review"
# and "sentiment" columns.
IMDBdataset = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
# To preview the first rows, run: IMDBdataset.head(10)

# Preprocessing
Preprocessing that needs to be done on the lower-cased corpus:

1. Remove html tags
2. Remove URLS
3. Remove non-alphanumeric characters
4. Remove Stopwords
5. Perform stemming and lemmatization

You can use regex from re. 

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))


# Built once at definition time instead of on every call: preprocess() is
# applied to every row of the dataset.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r"http\S+")
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9]+")
_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess(_str):
    """Normalize one raw review into a space-separated string of tokens.

    Steps, in order: lower-case, strip HTML tags, strip URLs, replace every
    run of non-alphanumeric characters with a single space, tokenize, drop
    English stopwords, and lemmatize each remaining token as a noun.

    NOTE: only lemmatization (pos="n") is applied; no stemming is performed.

    Args:
        _str: the raw review text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = _str.lower()
    # Tags go first and become a space: removing URLs first would eat the
    # closing '>' of tags such as <a href="http://...">, and replacing a tag
    # with "" would glue together words separated only by <br />.
    text = _HTML_TAG_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = _NON_ALNUM_RE.sub(" ", text)
    tokens = word_tokenize(text)
    return ' '.join(
        _LEMMATIZER.lemmatize(token, pos="n")
        for token in tokens
        if token not in _STOPWORDS
    )

# Overwrite the raw review text with its cleaned form, then preview the
# first five rows.
raw_reviews = IMDBdataset["review"]
IMDBdataset["review"] = raw_reviews.apply(preprocess)

IMDBdataset.head(5)



# Print statistics of the data, like the average length of a sentence and the
# proportion of data w.r.t. class labels
import matplotlib.pyplot as plt
import seaborn as sns

# The cleaned reviews are space-separated tokens, so splitting on whitespace
# gives the token count; len(review) on its own would count characters and
# contradict the 'Token_count' axis label.
token_lens = [len(review.split()) for review in IMDBdataset.review]
sns.distplot(token_lens)
plt.xlim([0, 1024]);
plt.xlabel('Token_count');
In [None]:
# Tokenize every cleaned review so lengths can be measured in tokens.
sent = IMDBdataset["review"].tolist()
sent_stat = [word_tokenize(s) for s in sent]

# Average and maximum review length (in tokens)
avg_max_len = [len(tokens) for tokens in sent_stat]
print("\nAverage length of Review: {}".format(np.mean(avg_max_len)))
print("\n max length of Review: {}".format(np.max(avg_max_len)))

# Class balance: every label that is not 'positive' is counted as negative.
p = int((IMDBdataset.sentiment == 'positive').sum())
n = len(IMDBdataset) - p
plt.bar(['Positive', 'Negative'], [p, n])

# Naive Bayes classifier

In [None]:
# Feature column: the review text.
reviews = IMDBdataset["review"]

# Target column: the sentiment string attached to each review.
labels = IMDBdataset["sentiment"]

# Encode the sentiment strings as integer class ids.
# NOTE(review): presumably 'negative' -> 0 and 'positive' -> 1 (the later
# cells treat label 1 as positive) — confirm against the printed classes_.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
print(encoder.classes_)
print(encoded_labels)

# 80% / 20% train/test split. Stratifying on the labels keeps the ratio of
# positive to negative samples similar in both parts; random_state pins the
# shuffle so the split is reproducible.
train_sent, test_sent, train_labels, test_labels = train_test_split(
    reviews,
    encoded_labels,
    test_size=0.2,
    random_state=0,
    stratify=encoded_labels,
)
# train_sent, test_sent, train_labels, test_labels 

Here there are two approaches possible for building vocabulary for the naive Bayes.
1. Take the whole data (train + test) to build the vocab. In this way while testing there is no word which will be out of vocabulary.
2. Take only the train data to build the vocab. In this case, some words from the test set may not be in the vocab, and hence one needs to perform smoothing so that no probability term is zero.
 
You are supposed to go by the 2nd approach.
 
Also, building the vocab from all words in the train set is memory intensive; hence you are required to build the vocab from the top 2000 - 3000 most frequent words in the training corpus.

> $ P(x_i | w_j) = \frac{ N_{x_i,w_j}\, +\, \alpha }{ N_{w_j}\, +\, \alpha*d} $


$N_{x_i,w_j}$ : Number of times feature $x_i$ appears in samples of class $w_j$

$N_{w_j}$ : Total count of features in class $w_j$

$\alpha$ : Parameter for additive smoothing. Here consider $\alpha$ = 1

$d$ : Dimensionality of the feature vector  $x = [x_1,x_2,...,x_d]$. In our case it is the vocab size.






In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Use CountVectorizer for the frequency of words.
#
# max_features parameter: if not None, build a vocabulary that only considers
# the top max_features ordered by term frequency across the corpus, e.g.
#     vec = CountVectorizer(max_features = 3000)
#     X = vec.fit_transform(Sentence_list)
vector = CountVectorizer(max_features=3000)
X = vector.fit_transform(train_sent)

# Mapping from each kept term to its column index in X.
vocab_ = vector.vocabulary_
vocab_

# NOTE(review): toarray() materializes a dense (n_documents, n_terms) copy of
# the count matrix; only its shape is displayed here.
V = X.toarray()
V.shape


def freq_review(reviews,ys,vocab):
    """Count how often each (word, label) pair occurs in the labelled reviews.

    Args:
        reviews: iterable of review strings; each is split with word_tokenize.
        ys: iterable of labels, paired with ``reviews`` position by position.
        vocab: accepted for interface compatibility; not used, so every
            token is counted whether or not it is in the vocabulary.

    Returns:
        A dict mapping ``(word, label)`` tuples to occurrence counts.
    """
    counts = {}
    for label, text in zip(ys, reviews):
        for token in word_tokenize(text):
            key = (token, label)
            # Missing keys start from 0, so new and seen pairs share one path.
            counts[key] = counts.get(key, 0) + 1
    return counts

# (word, label) -> count over the training reviews.
freqs = freq_review(train_sent, train_labels, vocab_)

# Total number of in-vocabulary tokens seen under each label
# (label 1 feeds N_p, label 0 feeds N_n).
N_p = sum(freqs.get((w, 1), 0) for w in vocab_)
N_n = sum(freqs.get((w, 0), 0) for w in vocab_)
# The totals are ints, so they must be converted before string concatenation.
print("N_positive is - " + str(N_p))
print("N_negative is - " + str(N_n))

In [None]:
# laplace smoothing included 
def naive_bayes(freq,_list,vocab_size,N_p,N_n,Vocab):
    """Classify a tokenized review with Naive Bayes and add-one smoothing.

    Each token adds ``log((count + 1) / (N_class + vocab_size))`` to the
    score of each class, where ``count`` is the token's frequency under that
    class. Tokens outside ``Vocab`` are treated as having count 0.

    NOTE: the class prior is derived from the per-class token totals
    (``N_p`` and ``N_n``), as these are the only class statistics passed in.

    Args:
        freq: dict mapping ``(word, label)`` to a count, with label 1 for the
            positive class and 0 for the negative class.
        _list: the tokens of the review to classify.
        vocab_size: number of vocabulary entries (``d`` in the smoothing term).
        N_p: total count of vocabulary tokens in the positive class.
        N_n: total count of vocabulary tokens in the negative class.
        Vocab: container of vocabulary words, used for membership tests.

    Returns:
        1 if the positive log-score is strictly greater than the negative
        one, otherwise 0.
    """
    denom_p = N_p + vocab_size
    denom_n = N_n + vocab_size

    prob_p = np.log(N_p / (N_p + N_n))
    prob_n = np.log(N_n / (N_p + N_n))

    for w in _list:
        known = w in Vocab
        count_p = freq.get((w, 1), 0) if known else 0
        count_n = freq.get((w, 0), 0) if known else 0
        # The +1 belongs to the numerator: (count + 1) / (N + d).
        prob_p += np.log((count_p + 1) / denom_p)
        prob_n += np.log((count_n + 1) / denom_n)

    return 1 if prob_p - prob_n > 0 else 0

# Test the model on test set and report Accuracy
# Predict a label for every test review with the Naive Bayes model.
# The vocabulary size is read from the fitted vocabulary rather than
# hard-coded, so it stays correct if max_features changes.
y_lo = [
    naive_bayes(freqs, word_tokenize(sent), len(vocab_), N_p, N_n, vocab_)
    for sent in test_sent
]

# Accuracy over however many test samples there actually are.
tot = len(y_lo)
accuracy = int(np.sum(np.asarray(y_lo) == np.asarray(test_labels)))
print('Accuracy % is -->' + str((accuracy*100)/tot) + "%")

# *LSTM* based Classifier

Use the above train and test splits.

In [None]:
# Hyperparameters of the model

# Vocabulary / tokenizer
vocab_size = 3000        # choose based on statistics
oov_tok = '<OOK>'        # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 200         # choose based on statistics, for example 150 to 200
padding_type = 'post'
trunc_type = 'post'

# Embedding layer
embedding_dim = 100

In [None]:
# Fit the tokenizer on the training reviews only; words outside the
# num_words limit are mapped to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sent)
word_index = tokenizer.word_index

# Convert the train dataset to integer sequences and pad/truncate them to
# max_length. The declared padding_type / trunc_type hyperparameters are
# applied here; without `truncating`, Keras would fall back to its default.
train_sequences = tokenizer.texts_to_sequences(train_sent)
train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

# Convert the test dataset the same way so both splits share one layout.
test_sequences = tokenizer.texts_to_sequences(test_sent)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)

In [None]:
# Model initialization: embedding -> bidirectional LSTM -> small dense head
# ending in a single sigmoid unit.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(64)))
model.add(keras.layers.Dense(24, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Compile with binary cross-entropy and track accuracy during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Train for a fixed number of epochs, holding out 10% of the training data
# for validation.
num_epochs = 5
history = model.fit(
    x=train_padded,
    y=train_labels,
    validation_split=0.1,
    epochs=num_epochs,
    verbose=1,
)

In [None]:
# Calculate accuracy on Test data
from sklearn.metrics import classification_report

# Get probabilities (a single forward pass over the test set)
prediction = model.predict(test_padded)

# Get labels based on probability: 1 if p >= 0.5 else 0.
# Flattened to 1-D ints so it lines up element-wise with test_labels.
prediction = (prediction >= 0.5).astype(int).ravel()

# Accuracy computed by hand
accurate_prediction = int(np.sum(prediction == np.asarray(test_labels)))

accuracy_1 = accurate_prediction/len(test_labels)
print('Accuracy % is -->' + str(accuracy_1*100) + "%")

                      #OR

# Accuracy, precision, recall and F1 from sklearn
print(classification_report(test_labels, prediction))

## Get predictions for random examples

In [None]:
# Reviews on which we need to predict
sentence = [
    "The movie was very touching and heart whelming",
    "I have never seen a terrible movie like this",
    "the movie plot is terrible but it had good acting",
]

# Encode with the fitted tokenizer, then pad to the model's input length.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Get probabilities
predicted = model.predict(padded)

# Threshold at 0.5 and turn the boolean mask into 0/1 integers.
predicted = (predicted >= 0.5).astype(int)
print(predicted)
