In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
dataset="drive/My Drive/IMDB Dataset.csv"

In [4]:
df=pd.read_csv(dataset)

In [5]:
from bs4 import BeautifulSoup
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
import re
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [6]:
df['review']=df['review'].apply(denoise_text)

In [7]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

In [8]:
def Convert_to_bin(text, remove_digits=True):
    if text=='positive':
      text= 1
    else:
      text=0
    return text

In [9]:
df['review']=df['review'].apply(remove_special_characters)

In [10]:
df['sentiment']=df['sentiment'].apply(Convert_to_bin)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X=df['review'].values
Y=df['sentiment'].values
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.3)

### Convolutional Neural Network

In mathematics (in particular, functional analysis) convolution is a mathematical operation on two functions (f and g) that produces a third function expressing how the shape of one is modified by the other. The term convolution refers to both the result function and to the process of computing it.

So, Convolutional is best for extracting special features and behaviour of feature values from the 2D pixels of images. Convolutional layers hava a set of kernels which helps to extract several important features from the data samples. Now here, in text classifications our feature matrices are 1Dimensional. So, here Conv1D is used. Basically it moves as a sliding window of size decieded by the user. We have chosen 5. 

Now, initially after embedding we get 100 Dimensional embedding. Next using 1D convolutions we try to make our feature set smaller and let the feature set dicover the best features relations for the classification. The maxpooling layer also helps to pick the features or words which have best performance. 


Convolutional layer is always used after an embedding layer after it provides its embedded feature vectors.

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [32]:
tokenizer = Tokenizer(num_words=10000)

In [33]:
tokenizer.fit_on_texts(X_train)

In [34]:
x_train = tokenizer.texts_to_sequences(X_train) 

In [35]:
x_test = tokenizer.texts_to_sequences(X_test)

In [36]:
vocab = len(tokenizer.word_index) + 1  

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [38]:
maxlen = 100

In [39]:
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D,Conv1D
from tensorflow.keras.optimizers import Adam
emb_dim=100

model= Sequential()
model.add(Embedding(input_dim=vocab, output_dim=emb_dim, input_length=maxlen))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPool1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPool1D(5))
model.add(Dense(16,activation="relu"))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

In [41]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          17739900  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 64)            32064     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 19, 64)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 15, 128)           41088     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 3, 128)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 3, 16)             2064      
_________________________________________________________________
dense_3 (Dense)              (None, 3, 1)             

In [44]:
history = model.fit(x_train, Y_train,epochs=20,verbose=True,batch_size=16)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
test_score=model.evaluate(x_test,Y_test)



In [55]:
test_score

[1.7621402740478516, 0.7641777396202087]

[1.7621402740478516, 0.7641777396202087]

[1.7621402740478516, 0.7641777396202087]

In [56]:
train_score=model.evaluate(x_train,Y_train)



In [58]:
train_score

[0.01113097183406353, 0.9938176274299622]

[0.01113097183406353, 0.9938176274299622]

[0.01113097183406353, 0.9938176274299622]

Train accuracy 0.99
Test accuracy  0.77

In [None]:
## Done

## Recurrent Neural Network application

**Why Recurrent Neural Networks**

Until now we have tried to extract some features from all the words in a sample at a time. So, all of them are non temporal approaches. Now, let's see how a person will judge a sentiment. He/she will not only considered what were the words used, humans will also consider how they are used, that is, in what context, and what are the preeceding and succeeding words? So, until now we have focused on what were the words used only, so, now let's look at the other part of the story. 

So, for this part we need Recurrent neural network to give a memory to our models. If we think about telling something about someones statements, we will generally listen to the whole statement word by word and then make a comment. This is what the Recurrent Neural networks will accomplish. It will look at each word on a temporal manner one by one and try to correlate to the context using the embedded feature vector of the word. 

Now as we know RNN suffers from the vanishing and exploding gradient problem we will be using LSTM. 

Now LSTM, operates on two things a hidden state that is sent from previous time stamp and a cell state that actually maintains the weight neutralizing the vanishing gradient effect.

Now, the LSTM layer basically has 4 components:
A Forget gate, An input gate, a cell state and a output gate


![LSTM](https://miro.medium.com/max/700/1*-kBdBYzR7lpimgb3AIRkOw.png)

![gates](https://miro.medium.com/max/700/1*yBXV9o5q7L_CvY7quJt3WQ.png)

![alt text](https://miro.medium.com/max/700/1*y-RI3y90IZpOUMnkCBrQxQ.png)

**LSTM also provide feature set on the last time stamp for the dense layer to use the feature set to produce results.**  Now we can see the above equations are the equations for the Gates of LSTM. Now here each gate acts like a neural network individually. So, they have their individual weight matrices that are optimized when the recurrent network model is trained. Using these weight matrices only the gates learn their tasks, like which data to foget and what part of the data is needed to be updated to the cell state. So, the gates optimize their weight matrices and decide the operations accoring to it. 

Now, lets see the use. 

--------------------------------------------------

Say we have a 100 dimensional vector space. a batch size of 16, each sample length = 10. and the number of nodes in each layer= 64.

INPUT SIZE = batch_size * Embedding 
so, here it is 16 x 300 matrix = i(t)

The time stamp 0 that is the first word of every sample enter. 

PREVIOUS HIDDEN STATE (0 vec for tiemstamp 0) = Batch size x Hidden Units 
So, Here it is 16 x 64 matrix.= h(t-1)

After concatenation, the matrix formed h(t-1)= 16 x 64 and i(t)= 16 x 300

So the h(t-1) + i(t) matrix is sent to all gates.

--------------------------------------------------
First the forget gate Weight matrix of the hidden state is of dimension 64 x 64 because in the hidden state for each of the 16 words of timestamp (t-1) there were 64 values from the 64 nodes from the RNN. 

So, actually our matrix from hidden state 16 rows which are records and for each record there are 64 columns or 64 features. 

y=w1x1+w2x2+........wnxn 

where the x's are the features or the column values. So, there must a maintained array of 64 weights for each node or unit of the network. Now there are 64 such units so total of (64 x 64) matrix.

Again, now for input, there are 16 rows or records for each of them 300 columns or 300 features. so, the weight matrix of one hidden unit must have 100 values. Total 64 units are there. So, dimension of the matrix. (100 x 64)

So at forget gate 

y= sigmoid((16 x 100) x (100 x 64) + (16 x 100)  x (100 x 64))

y=sigmoid (16 x 64) vector

--------------------------------------------------

Sigmoid gives the value between 0 and 1. If the value is close to 0 the value is forgotten else added to the cell state.

Now,the cell state is also of the same dimension (16 x 64) as it is also having the weights of the 16 sample word's by 64 nodes So, they can easily be added.



Next is the input gate it decides what part of the data should enter means the actual tanh. It deciedes whether the cell state should be updated. 

These gates matrices are also same as the forget gates matrices with (64 x 64) values in the last hidden layer and (300 x 64) values for the input.

So, they also give sigmoid(16 x 64) as a result

One thing to notice here is there is a tanh layer also. The tanh is here to squeeze the value between 1 to -1 to deal with the exploding and vanishing gradient. So, it basically works like and regularized value that represnts the value of the cell state on that timestep. The sigmoid is the switch. 

So, after that the obtained vectors are just multiplied to obtain 1 result.

--------------------------------------------------

**One thing to notice about this is, though the wieght matrices are of same dimensions they are not same. They belong to different gates and their values and optimzations are all different.**

--------------------------------------------------

Now in the next step the cell step is updated 

It is basically a simple sum. 

The new c is formed by removing the unwanted information from the last step + accomplishments of the current time step. 


--------------------------------------------------

Next, comes the output gate. This decides what should be next steps hidden layer be. 


For this the new cell state is passed through a tanh gate and the h(t-1) + i(t) is passed through another sigmoid. Both of the results are multiplied. That's the next.

--------------------------------------------------


Now, these weights get updated at every timestep with every word and after the 10th word or timestamp. We get a matrix of size 16 x 64, which are basically the weight values of the nodes corresponding to each sample. But what we don't see are the weight matrices of the gates but they are also optimized. These 64 values basically represent the weights of a sample in the batch. 

For all the samples we obtain a value. These values act as feature set for the dense layers to perform their operations.







In [13]:
file_path="drive/My Drive/glove.6B/"

In [14]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

In [15]:
x_train = tokenizer.texts_to_sequences(X_train) 
x_test = tokenizer.texts_to_sequences(X_test)

In [16]:
from keras.preprocessing.sequence import pad_sequences
maxlen=100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

Using TensorFlow backend.


In [17]:
emb_dim=50
vocab=len(tokenizer.word_index)+1
emb_mat= np.zeros((vocab,emb_dim))

In [18]:
with open(file_path+'glove.6B.50d.txt') as f:
        for line in f:
            word, *emb = line.split()
            if word in tokenizer.word_index:
              ind=tokenizer.word_index[word]
              emb_mat[ind]=np.array(emb,dtype="float32")[:emb_dim]

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D, LSTM
from tensorflow.keras.optimizers import Adam
emb_dim=50
maxlen=100
model= Sequential()
model.add(Embedding(input_dim=vocab, output_dim=emb_dim,weights=[emb_mat], input_length=maxlen,trainable=False))
model.add(MaxPool1D())
model.add(LSTM(64, return_sequences = False))
model.add(Dense(16,activation="relu"))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           8876350   
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 50)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense (Dense)                (None, 16)                1040      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 8,906,847
Trainable params: 30,497
Non-trainable params: 8,876,350
_________________________________________________________________


In [22]:
history = model.fit(x_train, Y_train,epochs=50,verbose=True,batch_size=16)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
test_score=model.evaluate(x_test,Y_test)



In [24]:
test_score

[1.6902750730514526, 0.7759333252906799]

In [25]:
train_score=model.evaluate(x_train,Y_train)



In [26]:
train_score

[0.03299878537654877, 0.9885714054107666]

In [27]:
## Done