<a href="https://colab.research.google.com/github/ameervalki/Machine-Learning/blob/ML-Basics/MovieReviewClassification_using%20TF_HUB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Review Classifier


```
TEAM ASA
```

<BR>

In [0]:
#Importing necessary libraries

import pandas as pd   # For the DataFrame 
import numpy as np
import random
import matplotlib.pyplot as plt
#%matplotlib notebook                      # In jupyter run this line to get an interactive plot

In [0]:
import tensorflow as tf
import tensorflow_hub as hub

In [0]:
# Load the reviews CSV into a DataFrame.
# Anyone else running this notebook should replace the path below with the location of their own copy.
dataset_path = '/content/drive/My Drive/colabutils/mrcproject/IMDB Dataset.csv'
df = pd.read_csv(dataset_path)

<b><i>Exploratory Data Analysis on the Dataset</i></b>

In [0]:
df   # Display the complete DataFrame loaded above

In [0]:
# Column names, dtypes and non-null counts of the DataFrame
df.info()

In [0]:
# Summary statistics for each column of the DataFrame
df.describe()

In [0]:
# Show one randomly chosen row (index label drawn from 0-999) as a plain Python list

list(df.loc[np.random.randint(low=0, high=1000)].to_numpy())

<br><b><i>WordCloud of the Dataset</i></b><br><br>

In [0]:
# To See the wordcloud of both the sentiments - Positive and negative

import re                                      #Regex library used for removing spcl characters and for clean sentences
from wordcloud import WordCloud,STOPWORDS
# Extra words to leave out of the word clouds, on top of wordcloud's built-in stopword set
STOPWORDS.update(('movie', 'one'))

def dispwc():
  """Plot side-by-side word clouds of the positive and negative reviews.

  Reads the module-level DataFrame ``df`` (columns ``review`` and
  ``sentiment``) and draws two clouds on one matplotlib figure: positive
  reviews on a white background, negative reviews on a black background.
  Returns nothing.
  """

  def _cloud_text(sentiment):
    # Join the raw review strings of one sentiment class. The text is taken
    # from the column itself rather than from str(ndarray): NumPy abbreviates
    # large arrays with '...' when converting them to a string, which silently
    # reduced each cloud to the first and last few reviews.
    text = ' '.join(df.loc[df['sentiment'] == sentiment, 'review'].astype(str))
    # Keep runs of word characters only, then drop stopwords and the 'br'
    # token (presumably residue of HTML <br> tags in the reviews). The
    # comparison is lower-cased so capitalised stopwords are removed as well.
    words = (w for w in re.split(r'\W+', text) if w)
    return ' '.join(w for w in words
                    if w.lower() not in STOPWORDS and w.lower() != 'br')

  # Settings shared by both clouds; only the background colour differs.
  cloud_opts = dict(width=800, height=500, max_words=400, random_state=None,
                    min_font_size=1, prefer_horizontal=0.7)
  wc1 = WordCloud(background_color="white", **cloud_opts)
  wc2 = WordCloud(background_color="black", **cloud_opts)

  wc1.generate(_cloud_text('positive'))
  wc2.generate(_cloud_text('negative'))

  plt.figure(figsize=(40,80))
  plt.subplot(1,2,1)
  plt.imshow(wc1)
  plt.xlabel('Positive Reviews')
  plt.subplot(1,2,2)
  plt.imshow(wc2)
  plt.xlabel('Negative Reviews')


#dispwc()

# Model Building

<br><b><i>Feature Extraction</i></b><br>
We use the Google News text-embedding module from TensorFlow Hub, which converts each sentence into a 20-dimensional vector.
<br><br>



In [0]:
# TF-Hub address of the text-embedding module used in this notebook
url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

# Alternative module (512 dimensional embedding):
#url="https://tfhub.dev/google/universal-sentence-encoder/4"

embed = hub.load(url)

# embed(['sentence']) returns the embeddings of the sentences in the list.
# The last two sample sentences differ only in punctuation/markup, so their
# embeddings can be compared directly.
demo_sentences = (
    'This is just a sample sentence, to show the text embeddings of any given sentence',
    "The sentences can include even html tags, characters and numbers like- &nbsp; &, 65, <br>, * ",
    "The sentences can include even html tags characters and numbers like nbsp 65 br ",
)

for sentence in demo_sentences:
    print(embed([sentence]).numpy(), end='\n\n')

print('"Observe the similarity of embeddings of the last two sents." ^')

<br><b><i>Dataset creation</i></b><br><br>

In [0]:
df2=df.copy()                                     # Keep a separate copy of the DataFrame for possible later use

In [0]:
# Read both columns without mutating `df`. The previous `df.pop('sentiment')`
# removed the column in place, so re-running this cell raised a KeyError and
# any later use of df['sentiment'] (e.g. dispwc()) stopped working.
labels = df['sentiment'].values
reviews = df['review'].values

In [0]:
# Encode the string labels as integers through a pandas categorical:
# every distinct label becomes a category, and `.cat.codes` is its integer code.
labels_ser = pd.Series(labels).astype('category')

newlabel = labels_ser.cat.codes.to_numpy()        # integer code of every label, as a NumPy array
newlabel

In [0]:
# Show the review texts followed by their integer labels
print(reviews, newlabel, sep='\n')

In [0]:
# Split reviews and labels into train / validation / test partitions, keeping the existing order.

total = len(reviews)
trainratio, validratio, testratio = 0.7, 0.15, 0.15

sep1 = int(trainratio * total)      # number of training examples
sep2 = int(validratio * total)      # number of validation examples

# Cut both arrays at the same two boundaries so reviews and labels stay aligned;
# everything after the train and validation slices becomes the test set.
boundaries = [sep1, sep1 + sep2]
reviews_tr, reviews_va, reviews_te = np.split(reviews, boundaries)
labels_tr, labels_va, labels_te = np.split(newlabel, boundaries)

labels_te.shape

In [0]:
# Wrap each partition in a tf.data.Dataset so it can be fed to the network in batches

batchsize = 200


def _batched(features, targets):
  """Return a shuffled (buffer of 1000), batched dataset of (feature, target) pairs."""
  pairs = tf.data.Dataset.from_tensor_slices((features, targets))
  return pairs.shuffle(1000).batch(batchsize)


train_ds = _batched(reviews_tr, labels_tr)
valid_ds = _batched(reviews_va, labels_va)
test_ds = _batched(reviews_te, labels_te)

train_ds

In [0]:
# Peek at the (review, label) pairs of a dataset built from the first ten examples

samp_ds = tf.data.Dataset.from_tensor_slices((reviews[:10], newlabel[:10]))

for samp in samp_ds:
  print(samp)

<br><b><i>Building the Network</i></b>

We used the Keras API to build a neural network model.<br>
The input pipeline and the network are built as shown below.<br>
<br>

In [0]:
# `url` is the TF-Hub module address set in the text-embedding cell
# (https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1).

# Input layer: takes raw review strings and turns each one into its embedding vector.
# trainable=True means the embedding weights are trained along with the rest of the network.
hub_layer = hub.KerasLayer(url, input_shape=[], dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer)                                          # the embedding layer defined above
model.add(tf.keras.layers.Dense(20, activation='relu'))       # hidden Dense layer with 20 nodes, ReLU activation
model.add(tf.keras.layers.Dense(20, activation='relu'))       # hidden Dense layer with 20 nodes, ReLU activation
model.add(tf.keras.layers.Dense(1, activation='tanh'))        # single output node; tanh gives a value in (-1, 1)


In [0]:
# Configure the learning process: optimizer, loss and the metric reported during training.

model.compile(
    optimizer='adam',
    loss='squared_hinge',
    # The output layer is tanh, so the class boundary is 0 (the same cut-off the
    # manual prediction cells use). The bare 'accuracy' string resolves to binary
    # accuracy with its default 0.5 threshold, which counts outputs in (0, 0.5]
    # as the negative class. Naming the metric 'accuracy' keeps the history keys
    # 'accuracy' / 'val_accuracy' unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
)

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<b>The structure of the model can be seen below.</b>



![alt text](https://drive.google.com/uc?id=12zhO5I2DDjkpDNASj9qCA-n5V3E6jce5)

In [0]:
# Print the layer-by-layer summary of the model built above
model.summary()

<br><b><i><u>Training the network</u></i></b><br>
Note that we use the validation data to check the model's performance at the end of <i>each</i> epoch.<br>
<br>

In [0]:
numepoch = 15    # number of passes over the training data

# Train on train_ds and evaluate on valid_ds after every epoch.
# The returned History object keeps the per-epoch metrics.
history = model.fit(train_ds, validation_data=valid_ds, epochs=numepoch, verbose=1)

In [0]:
# Compare training and validation accuracy per epoch to spot under- or over-fitting

tracc = history.history['accuracy']
valacc = history.history['val_accuracy']
epoch_axis = range(1, numepoch + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_ylim(0.5, 1)                                     # accuracy axis limited to 0.5 - 1.0
ax.plot(epoch_axis, tracc, label='Train')
ax.plot(epoch_axis, valacc, label='Valid')
ax.set_title('Accuracy')
ax.set_xlabel('Epochs   -->')

ax.legend(loc='upper left')
plt.show()

# argmax is zero-based, epochs are counted from 1
print('\n "The epoch with highest validation accuracy is ',np.argmax(valacc)+1,'"')

<br><b><i><u>Evaluating the model</u></i></b><br>

<br>

In [0]:
# Evaluate on the test batches; evaluate() returns the loss followed by the compiled metric
loss, acc = model.evaluate(test_ds)
print(f'The Model achieved an accuracy of: {acc*100} %')

In [0]:
def predict(string):
  """Print the model's raw output for one review and the sentiment it implies.

  string: the review text. The global `model` is queried with a one-element
  batch. A score above 0 is reported as positive, anything else as negative,
  and the absolute score times 100 is printed as the strength. Returns nothing.
  """
  score = model.predict([string])[0][0]
  print('Model Output: ', score, '\n')

  print("The Review is POSITIVE!" if score > 0 else "The Review is NEGATIVE!")
  print('The strength of review is {}%'.format(abs(score*100)))

# Hand-written reviews of different tone for trying out predict()

# A mediocre review
rev1='As iam told that this movie is really good, i really didnt feel it was satisfactory. But compared to other films, this was not good'

# An excellent review
rev2='Oh My god!! This is the kind of movie i expect to watch. This Movie was so awesome and fantastic that I couldnt resist watching it again and again'

# A very negative review
rev3='Ughh! Do you even call this a movie? What type of sane minded human watches this? I dont have words to decribe how awful this movie is'

# Only the first review is scored here; pass rev2 or rev3 to try the others
predict(rev1)

# <i>Evaluation of the Model</i>

In [0]:
y_pred = model.predict(reviews_te)

# The raw predictions are continuous values, so they have to be mapped to the 0/1 classes used by labels_te
print("Predictions: \n",y_pred,end='\n\n')

# Outputs lie in (-1, 1): anything >= 0 becomes class 1, the rest class 0
pred_lab = [1 if lab >= 0 else 0 for lab in y_pred]

print('Binary predictions= ',pred_lab)

In [0]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and precision/recall trade places
# in the report, so the ground-truth labels are passed first.
print('Accuracy score: ',accuracy_score(labels_te,pred_lab),end='\n\n')

print('The Confusion Matrix: \n',confusion_matrix(labels_te,pred_lab),end='\n\n')

print('The classification report: \n',classification_report(labels_te,pred_lab,target_names=['Negative','Positive'],digits=2))