"""Akash_Maurya_multi_hot_sentiment .ipynb
Automatically generated by Colaboratory.
# Text classification
## Sentiment analysis
It is a natural language processing problem where text is understood and the underlying intent is predicted. Here, you need to predict the sentiment of movie reviews as either positive or negative in Python using the Keras deep learning library.
## Data description
The dataset is the Large Movie Review Dataset often referred to as the IMDB dataset.
The [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/) (often referred to as the IMDB dataset) contains 25,000 highly polar movie reviews (good or bad) for training and the same amount again for testing. The problem is to determine whether a given moving review has a positive or negative sentiment. Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers).
## Loading dataset
First, we will load complete dataset and analyze some properties of it.
"""
import numpy as np
import h5py  # required for saving models in HDF5 format via model.save()
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
# imdb.load_data calls np.load internally, but still assumes the default
# arguments of an older numpy version, so np.load is temporarily patched
# to pass allow_pickle=True.
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# keep a reference to the original np.load
np_load_old = np.load
# modify the default parameters of np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
# call load_data with allow_pickle implicitly set to True
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)
# restore np.load for future normal usage
np.load = np_load_old
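# Quick sanity check: with num_words=10000, rarer words are replaced by the
# out-of-vocabulary index (2 by default), so no review should contain an index
# of 10000 or above.
print("Max word index in training data:", max(max(seq) for seq in X_train))  # expect 9999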
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
print("Size of X")
print(X.shape)
print("Size of X_train")
print(X_train.shape)
print("Size of y")
print(y.shape)
print("Size of y_train")
print(y_train.shape)
"""## **Let's see some of reviews.**"""
print("## **Let's see some of reviews.**")
word_to_id = keras.datasets.imdb.get_word_index()
id_to_word = {value:key for key,value in word_to_id.items()}
for i in range(15,17):
print("******************** REVIEW_EXAMPLE ************************")
print(' '.join(id_to_word.get(id - 3, '?')for id in X_train[i] ))
"""## Summarize the data
1) Find out the number of classes in label (*y* array)? <br>
2) Find out number of unique words in dataset *X*? <br>
3) Calculate the list of review length , report mean and standard deviation. <br>
"""
def summarize_data():
    """
    Output:
        classes: list, list of unique classes in y
        no_of_words: int, number of unique words in dataset X
        list_of_review_lengths: list, list of lengths of each review
        mean_review_length: float, mean of list_of_review_lengths
        std_review_length: float, standard deviation of list_of_review_lengths
    """
    import statistics
    classes = np.unique(y)
    no_of_words = len(np.unique(np.concatenate(X)))
    list_of_review_lengths = [len(review) for review in X]
    mean_review_length = statistics.mean(list_of_review_lengths)
    std_review_length = statistics.stdev(list_of_review_lengths)
    return classes, no_of_words, list_of_review_lengths, mean_review_length, std_review_length
# call the function summarize_data
classes, no_of_words, list_of_review_lengths, mean_review_length, std_review_length = summarize_data()
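# Report the summary statistics. For the standard IMDB download the mean review
# length is around 234 words, though exact values depend on the dataset version.
print("Classes:", classes)
print("Number of unique words:", no_of_words)
print("Mean review length: %.2f" % mean_review_length)
print("Std of review length: %.2f" % std_review_length)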
"""## One hot encode the output data"""
def one_hot(y):
"""
Inputs:
y: numpy array with class labels
Outputs:
y_oh: numpy array with corresponding one-hot encodings
"""
oh = []
for i in range(0, len(y)):
if y[i] == 0:
oh.append([1, 0])
else:
oh.append([0, 1])
y_oh = np.array(oh)
return y_oh
#call the function one_hot
y_train = one_hot(y_train)
y_test = one_hot(y_test)
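# For reference: Keras ships an equivalent helper. Assuming a Keras 2.x install,
# keras.utils.to_categorical would produce the same [1, 0] / [0, 1] rows for
# binary labels:
# y_train = keras.utils.to_categorical(y_train, num_classes=2)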
"""### Multi-hot encode the input data
All sequences are of different length and our vocabulory size is 10K. <br>
**To Do**
1) Intialize vector of dimension 10,000 with value 0. <br>
2) For those tokens in a sequence which are present in Vocabulary make that position as 1 and keep all other positions filled with 0. <br>
For example, lets take Vocabulary = ['I': 0, ':1, 'eat: 2:' mango: 3, 'fruit':4, 'happy':5, 'you':6] <br>
We have two sequnces and
Multi-hot encoding of both sequences will be of dimension: 7 (vocab size).<br>
1) *Mango is my favourite fruit* becomes *Mango ? ? ? fruit* after removing words which are not in my vocabulary. Hence multi hot encoding will have two 1's corresponding to mango and fruit i.e, [0, 0, 0, 1, 1, 0, 0] <br>
Similarly, <br>
2) *I love to eat mango* = *I ? ? eat mango* = [1, 1, 0, 1, 0, 0, 0]
"""
def multi_hot_encode(sequences, dimension):
    """
    Input:
        sequences: list of sequences in X_train or X_test
        dimension: int, size of the vocabulary
    Output:
        results: numpy matrix of shape (len(sequences), dimension)
    """
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        # set every position that appears in the sequence to 1
        results[i, seq] = 1
    return results
# call the function multi_hot_encode
x_train = multi_hot_encode(X_train, 10000)
x_test = multi_hot_encode(X_test, 10000)
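# Sanity check on the toy example from the docstring above (hypothetical data,
# not part of the IMDB set): vocabulary size 7, sequences already converted to
# word indices.
toy_sequences = [[3, 4], [0, 1, 2, 3]]  # "mango fruit" and "I love eat mango"
print(multi_hot_encode(toy_sequences, 7))
# expected:
# [[0. 0. 0. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0. 0. 0.]]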
"""## Split the data into train and validation"""
from sklearn.model_selection import train_test_split
x_strat, x_dev, y_strat, y_dev = train_test_split(x_train, y_train,test_size=0.40,random_state=0, stratify=y_train)
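# quick check of the split sizes (a 60/40 train/validation split of the
# 25,000 training reviews)
print("Train:", x_strat.shape, "Validation:", x_dev.shape)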
"""## Build Model
Build a multi layered feed forward network in keras.
### Create the model
"""
def create_model():
    """
    Output:
        model: A compiled keras model
    """
    model = Sequential()
    # each of the 10,000 input positions (holding a 0 or a 1) is mapped to a
    # 32-dimensional embedding, then flattened into a single feature vector
    model.add(Embedding(10000, 32, input_length=10000))
    model.add(Flatten())
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model
model = create_model()
print(model.summary())
"""### Fit the Model"""
def fit(model):
"""
Action:
Fit the model created above using training data as x_strat and y_strat
and validation_data as x_dev and y_dev, verbose=2 and store it in 'history' variable.
evaluate the model using x_test, y_test, verbose=0 and store it in 'scores' list
Output:
scores: list of length 2
history_dict: output of history.history where history is output of model.fit()
"""
history = model.fit(x_strat, y_strat, validation_data=(x_dev, y_dev), epochs=15, batch_size=128, verbose=1)
scores = model.evaluate(x_test, y_test, verbose=0)
history_dict = history.history
return scores,history_dict
scores,history_dict = fit(model)
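# Optional: visualize the training curves stored in history_dict. The accuracy
# key name depends on the Keras version ('acc' in older releases, 'accuracy'
# in newer ones), so pick whichever is present.
import matplotlib.pyplot as plt
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
plt.plot(history_dict[acc_key], label='train accuracy')
plt.plot(history_dict['val_' + acc_key], label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()
print("Test loss: %.4f, test accuracy: %.4f" % (scores[0], scores[1]))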
model.save("Your_Model.h5")
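# The saved model can later be reloaded and used for prediction; a minimal
# sketch (requires the h5py package for the HDF5 format):
# from keras.models import load_model
# restored_model = load_model("Your_Model.h5")
# predictions = restored_model.predict(x_test[:5])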