<a href="https://colab.research.google.com/github/cagBRT/SentimentTextAnalysis/blob/master/Sentiment_Text_Analysis_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount your Google Drive on this CoLab Notebook**
Execute the following code cell<br>
Click on the given link<br>
Select your user name<br>
Click **Allow**<br>
Copy the authorization code<br>
Paste the authorization code into the user input box. <br>
Your Google Drive is now mounted to this notebook.

In [None]:
# Mount Google Drive at /gdrive (asks for authorization when run).
from google.colab import drive
drive.mount('/gdrive')
# Make the mounted drive the current working directory.
%cd /gdrive

In [None]:
# Clone the entire repo.
# Start from /content so the clone lands at /content/cloned-repo.
%cd /content/
!git clone  https://github.com/cagBRT/SentimentTextAnalysis.git cloned-repo
# Enter the clone and list its files.
%cd cloned-repo
!ls

# **Check that your drive is mounted**
1. On the menu bar, click the **folder icon**<br>
2. Click on the **folder icon with the up arrow**
3. Click on **gdrive**
4. Click on **My Drive**
5. Check for the file called **wiki-news-300d-1M.vec**<br>
If the file is there, you have correctly installed the necessary files for this notebook. <br>




In [None]:
from IPython.display import Image
def page(num):
    """Return a 600px-wide IPython ``Image`` for slide number ``num``.

    The file is looked up as ``images/sentTextAna<num>.png`` relative to the
    current working directory.
    """
    image_path = "images/sentTextAna" + str(num) + ".png"
    return Image(image_path, width=600)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """Plot training and validation accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (what ``model.fit`` returns). It must
            contain 'loss' and 'val_loss' plus an accuracy metric recorded
            either as 'accuracy'/'val_accuracy' or as 'acc'/'val_acc'.

    Raises:
        KeyError: If one of the required series is missing, e.g. the model
            was fitted without validation data.
    """
    metrics = history.history
    # The key follows the metric string given to compile(): models in this
    # notebook use both metrics=['accuracy'] and metrics=["acc"].
    acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    x = range(1, len(acc) + 1)  # epochs are numbered from 1

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epoch')
    plt.legend()

# **Import the libraries**

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Install TensorFlow
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
from tensorflow import keras

In [None]:
import pandas as pd

In [None]:
from keras.models import Sequential
from keras import layers
from keras.callbacks import EarlyStopping

# **Examine the data**<br>
The data is from three sources: <br>
> yelp reviews<br>
> amazon reviews<br>
> movie reviews<br>

The data has the structure: <br>
>"review text" label source<br>

**review text is called**: sentence<br>
**label**: 0 = negative review, 1 = positive review<br>
**source**: yelp, amazon, imdb

In [None]:
# The labelled review files are opened with relative paths below,
# so make the cloned repo the working directory first.
%cd /content/cloned-repo/

In [None]:
# Build a single dataframe that holds the reviews from all three sources.
filepath_dict = {
    'yelp': 'yelp_labelled.txt',
    'amazon': 'amazon_cells_labelled.txt',
    'imdb': 'imdb_labelled.txt',
}

df_list = []
for source, filepath in filepath_dict.items():
    # Each file is tab-separated: the review text followed by its label.
    df = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    # Tag every row with the name of the source it came from.
    df_list.append(df.assign(source=source))

df = pd.concat(df_list)
print(df.iloc[0])
print("dataframe shape: ",df.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the rows that come from yelp.
df_yelp = df.loc[df['source'] == 'yelp']

sentences_yelp = df_yelp['sentence'].to_numpy()
y_yelp = df_yelp['label'].to_numpy()

# 75/25 train/test split. The integer random_state seeds the shuffle,
# so the same split is produced on every run.
(sentences_train_yelp, sentences_test_yelp,
 y_train_yelp, y_test_yelp) = train_test_split(
    sentences_yelp, y_yelp, random_state=1000, test_size=0.25)

# Show the first sentence of the training set.
print(sentences_train_yelp[0])

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the rows that come from amazon.
df_amazon = df.loc[df['source'] == 'amazon']

sentences_amazon = df_amazon['sentence'].to_numpy()
y_amazon = df_amazon['label'].to_numpy()

# 75/25 train/test split. The integer random_state seeds the shuffle,
# so the same split is produced on every run.
(sentences_train_amazon, sentences_test_amazon,
 y_train_amazon, y_test_amazon) = train_test_split(
    sentences_amazon, y_amazon, random_state=1000, test_size=0.25)

# Show the first sentence of the training set.
print(sentences_train_amazon[0])

In [None]:
from keras.preprocessing.text import Tokenizer

# Tokenizer for the yelp reviews, capped at 3000 words.
tokenizer_yelp = Tokenizer(num_words=3000) #keep 3000 words

# Build the internal vocabulary from the training sentences only
# (the test sentences are not seen here).
# Must be run before running texts_to_sequences
tokenizer_yelp.fit_on_texts(sentences_train_yelp)

In [None]:
# Tokenizer for the amazon reviews, capped at 3000 words.
tokenizer_amazon = Tokenizer(num_words=3000) #keep 3000 words

# Build the internal vocabulary from the training sentences only
# (the test sentences are not seen here).
# Must be run before running texts_to_sequences
tokenizer_amazon.fit_on_texts(sentences_train_amazon)

The number assigned to each word depends on its frequency of use across all the sentences. <br>
For example:<br>
>'the' is 1<br>
'and' is 2<br>
'was' is 3<br>


In [None]:
# Encode each yelp training review as a list of integer word indices
# (index sequences, not embedding vectors yet).
X_train_yelp = tokenizer_yelp.texts_to_sequences(sentences_train_yelp)

In [None]:
# Encode each amazon training review as a list of integer word indices
# (index sequences, not embedding vectors yet).
X_train_amazon = tokenizer_amazon.texts_to_sequences(sentences_train_amazon)
# Show one review next to its encoded form.
print(sentences_train_amazon[3],X_train_amazon[3])

In [None]:
# Encode the yelp test reviews with the tokenizer fitted on the training set.
X_test_yelp = tokenizer_yelp.texts_to_sequences(sentences_test_yelp)
# Size is taken from the tokenizer's full word_index.
vocab_size_yelp = len(tokenizer_yelp.word_index) + 1  # Adding 1 because of reserved 0 index

print("vocab size=", vocab_size_yelp)

In [None]:
# Encode the amazon test reviews with the tokenizer fitted on the training set.
X_test_amazon= tokenizer_amazon.texts_to_sequences(sentences_test_amazon)
# Size is taken from the tokenizer's full word_index.
vocab_size_amazon = len(tokenizer_amazon.word_index) + 1  # Adding 1 because of reserved 0 index

print("vocab size=", vocab_size_amazon)

# **Pad the sequence of words**

In [None]:
from keras.utils import pad_sequences

# Every review becomes a fixed-length vector of this many word indices;
# longer reviews are cut down to this length.
maxlen = 100

# Reviews shorter than maxlen are filled with 0s at the end ('post').
X_train_yelp = pad_sequences(X_train_yelp, maxlen=maxlen, padding='post')
X_test_yelp = pad_sequences(X_test_yelp, maxlen=maxlen, padding='post')

# Shapes of the feature matrices, then of the label vectors.
print(X_train_yelp.shape,X_test_yelp.shape)
print(y_train_yelp.shape,y_test_yelp.shape, "\n")

# Show one raw review together with its padded index vector.
index=5
print("The review:\n",sentences_train_yelp[index])
print("\nThe final feature vector:\n",X_train_yelp[index])

In [None]:
# Every review becomes a fixed-length vector of this many word indices;
# longer reviews are cut down to this length.
maxlen = 100

# Reviews shorter than maxlen are filled with 0s at the end ('post').
X_train_amazon = pad_sequences(X_train_amazon, maxlen=maxlen, padding='post')
X_test_amazon = pad_sequences(X_test_amazon, maxlen=maxlen, padding='post')

# Shapes of the feature matrices, then of the label vectors.
print(X_train_amazon.shape,X_test_amazon.shape)
print(y_train_amazon.shape,y_test_amazon.shape, "\n")

# Show one raw review together with its padded index vector.
index=5
print("The review:\n",sentences_train_amazon[index])
print("\nThe final feature vector:\n",X_train_amazon[index])

# **Use a precomputed embedding space**


Can performance be improved using a precomputed embedding space that utilizes a much larger corpus? <br>
It is possible to precompute word embeddings by simply training them on a large corpus of text. Among the most popular methods are Word2Vec developed by Google and GloVe (Global Vectors for Word Representation) developed by the Stanford NLP Group.<br>

Word2Vec achieves this by employing neural networks and GloVe achieves this with a co-occurrence matrix and by using matrix factorization. <br>
In both cases you are dealing with dimensionality reduction:  <br>
>Word2Vec is more accurate  <br>
GloVe is faster to compute.


In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Each data line of the file is expected to be a word followed by its
    whitespace-separated vector components. Only the first ``embedding_dim``
    components of each vector are kept.

    Args:
        filepath: Path to the UTF-8 text file of pretrained vectors.
        word_index: Mapping from word to its integer row index (row 0 is
            left reserved, matching the ``+ 1`` in the matrix height).
        embedding_dim: Number of leading vector components to keep.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array. Rows of words
        not found in the file stay all-zero.
    """
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Skip blank lines and lines too short to hold a full vector.
            # This covers the "<count> <dim>" header that .vec files start
            # with, which would otherwise be broadcast into a whole row if
            # its first token happened to be in the vocabulary.
            if len(parts) <= embedding_dim:
                continue
            word = parts[0]
            if word in word_index:
                idx = word_index[word]
                # Slice before converting so only the kept components are parsed.
                embedding_matrix[idx] = np.array(
                    parts[1:embedding_dim + 1], dtype=np.float32)

    return embedding_matrix

In [None]:
import numpy as np

# Keep 50 components per word; create_embedding_matrix slices each
# vector read from the file down to embedding_dim.
embedding_dim = 50
# Look up pretrained vectors for the yelp vocabulary in the file on Google Drive.
embedding_matrix_yelp = create_embedding_matrix(
    '/gdrive/My Drive/wiki-news-300d-1M.vec',
    tokenizer_yelp.word_index, embedding_dim)
# One row per vocabulary index (plus the reserved row 0), embedding_dim columns.
print(embedding_matrix_yelp.shape)

In [None]:
# Same lookup for the amazon vocabulary (reads the same vector file again).
embedding_matrix_amazon = create_embedding_matrix(
    '/gdrive/My Drive/wiki-news-300d-1M.vec',
    tokenizer_amazon.word_index, embedding_dim)
# One row per vocabulary index (plus the reserved row 0), embedding_dim columns.
print(embedding_matrix_amazon.shape)

Percentage of vocabulary covered by the pretrained model

In [None]:
# Inner call: non-zero entries per row. Outer call: rows with at least one
# non-zero entry, i.e. words that were given a pretrained vector.
nonzero_elements_yelp = np.count_nonzero(np.count_nonzero(embedding_matrix_yelp, axis=1))
# Covered fraction; the denominator includes the reserved 0 index.
nonzero_elements_yelp / vocab_size_yelp

In [None]:
# Inner call: non-zero entries per row. Outer call: rows with at least one
# non-zero entry, i.e. words that were given a pretrained vector.
nonzero_elements_amazon = np.count_nonzero(np.count_nonzero(embedding_matrix_amazon, axis=1))
# Covered fraction; the denominator includes the reserved 0 index.
nonzero_elements_amazon / vocab_size_amazon

# **Convolutional Neural Network (CNN)**

In [None]:
# Width of the embedding learned by this model (no pretrained weights are
# passed to the Embedding layer).
embedding_dim = 100

# Embedding -> 1D convolution -> global max pooling -> small dense head
# ending in a single sigmoid unit for the 0/1 label.
model_yelp = Sequential([
    layers.Embedding(vocab_size_yelp, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu', name="c1"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model_yelp.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_yelp.summary()


In [None]:
# Width of the embedding learned by this model (no pretrained weights are
# passed to the Embedding layer).
embedding_dim = 100

# Embedding -> 1D convolution -> global max pooling -> small dense head
# ending in a single sigmoid unit for the 0/1 label.
model_amazon = Sequential()
# The embedding table must be sized from the amazon tokenizer's vocabulary,
# because the amazon sequences are encoded with tokenizer_amazon's indices.
model_amazon.add(layers.Embedding(vocab_size_amazon, embedding_dim, input_length=maxlen))
model_amazon.add(layers.Conv1D(128, 5, activation='relu',name="c1"))
model_amazon.add(layers.GlobalMaxPooling1D())
model_amazon.add(layers.Dense(10, activation='relu'))
model_amazon.add(layers.Dense(1, activation='sigmoid'))
model_amazon.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model_amazon.summary()

In [None]:
# Train for 10 epochs in batches of 10; the test split is passed as
# validation data so the history records validation metrics per epoch.
history_yelp = model_yelp.fit(
    X_train_yelp, y_train_yelp,
    validation_data=(X_test_yelp, y_test_yelp),
    batch_size=10,
    epochs=10,
    verbose=False,
)

# Report accuracy on the training split first, then on the test split.
for report_template, features, targets in (
        ("Training Accuracy: {:.4f}", X_train_yelp, y_train_yelp),
        ("Testing Accuracy:  {:.4f}", X_test_yelp, y_test_yelp)):
    loss_yelp, accuracy_yelp = model_yelp.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy_yelp))

plot_history(history_yelp)

In [None]:
# Train for 10 epochs in batches of 10; the test split is passed as
# validation data so the history records validation metrics per epoch.
history_amazon = model_amazon.fit(
    X_train_amazon, y_train_amazon,
    validation_data=(X_test_amazon, y_test_amazon),
    batch_size=10,
    epochs=10,
    verbose=False,
)

# Report accuracy on the training split first, then on the test split.
for report_template, features, targets in (
        ("Training Accuracy: {:.4f}", X_train_amazon, y_train_amazon),
        ("Testing Accuracy:  {:.4f}", X_test_amazon, y_test_amazon)):
    loss_amazon, accuracy_amazon = model_amazon.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy_amazon))

plot_history(history_amazon)

# **HyperParameter Tuning**

In [None]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the CNN text classifier used in the hyperparameter search.

    Args:
        num_filters: Number of filters in the Conv1D layer.
        kernel_size: Width of the Conv1D kernel.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        maxlen: Length of the padded input sequences.

    Returns:
        A compiled ``Sequential`` model ending in a single sigmoid unit,
        trained with binary cross-entropy and tracking the "acc" metric.
    """
    network = Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=["acc"],
    )
    return network

**Embedding dimension**

In [None]:
# Candidate values for each create_model argument. Lists with a single
# entry keep that argument fixed; num_filters and kernel_size are varied.
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[5000],
                  embedding_dim=[50],
                  maxlen=[100])

# **HyperParameter Random Search of each text set**
Use the random search method of hyperparameter tuning (`RandomizedSearchCV`) to improve the model performance. <br>


In [None]:
# Re-read the three labelled files into one dataframe so the search below
# starts from freshly loaded data.
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

df_full_list = []
for source, filepath in filepath_dict.items():
    df_full = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df_full['source'] = source  # Add another column filled with the source name
    # Collect the frame just read (not the dataframe left over from an earlier cell).
    df_full_list.append(df_full)
# Concatenate the frames gathered in this cell.
df = pd.concat(df_full_list)
print(df.iloc[0])
print("dataframe shape: ",df.shape)

# **This will take approximately 20 minutes**

In [None]:
# SciKeras supplies the KerasClassifier wrapper imported in the next cell.
!pip install scikeras

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = '/gdrive/My Drive/output.txt'

# Run a randomized hyperparameter search for each source (yelp, amazon, imdb)
for source, frame in df.groupby('source'):
    print('Running grid search for data set :', source)
    # Use only the rows of the current source, not the whole dataframe.
    sentences = frame['sentence'].values
    y = frame['label'].values

    # Train-test split
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Tokenize words (vocabulary is built from the training sentences only)
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)

    # Adding 1 because of reserved 0 index
    vocab_size = len(tokenizer.word_index) + 1

    # Pad sequences with zeros
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # Candidate values for the search; single-entry lists keep a value fixed
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen])

    # Every create_model argument that the search sets must be declared on
    # the wrapper, otherwise set_params rejects it as an invalid parameter.
    # The values given here are only starting defaults; vocab_size comes
    # from this source's tokenizer rather than a hard-coded number.
    model = KerasClassifier(model=create_model,
                            epochs=epochs, batch_size=10,
                            verbose=False,
                            num_filters=param_grid['num_filters'][0],
                            kernel_size=param_grid['kernel_size'][0],
                            vocab_size=vocab_size,
                            embedding_dim=embedding_dim,
                            maxlen=maxlen)

    # Try 5 random combinations, each scored with 4-fold cross-validation
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=1, n_iter=5)
    grid_result = grid.fit(X_train, y_train)

    # Evaluate testing set
    test_accuracy = grid.score(X_test, y_test)

    # Save and evaluate results (appended, so earlier runs are kept)
    with open(output_file, 'a') as f:
        s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
        output_string = s.format(
            source,
            grid_result.best_score_,
            grid_result.best_params_,
            test_accuracy)
        print(output_string)
        f.write(output_string)

In [None]:
# Display the saved results image (path is relative to the working directory).
Image("images/CNN Results.png" , width=600)

# **HyperParameter Tuning on all the datasets together**

In [None]:
#create a dataframe containing all three sources
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

df_full_list = []
for source, filepath in filepath_dict.items():
    df_source = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df_source['source'] = source  # Add another column filled with the source name
    # Collect the frame just read (not a dataframe left over from an earlier cell).
    df_full_list.append(df_source)

# Concatenate the frames gathered in this cell.
df_full = pd.concat(df_full_list)
print(df_full.iloc[2000])
print("dataframe shape: ",df_full.shape)

In [None]:
# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = '/gdrive/My Drive/output.txt'
# Label written to the report; this run covers every source at once.
dataset_label = 'combined (yelp + amazon + imdb)'

print('Running grid search for data set :\n', df_full)
sentences = df_full['sentence'].values
y = df_full['label'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

# Tokenize words (vocabulary is built from the training sentences only)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Candidate values for the search; single-entry lists keep a value fixed
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])

# Every create_model argument that the search sets must be declared on the
# wrapper, otherwise set_params rejects it as an invalid parameter. The
# values given here are only starting defaults.
model = KerasClassifier(model=create_model,
                        epochs=epochs, batch_size=10,
                        verbose=False,
                        num_filters=param_grid['num_filters'][0],
                        kernel_size=param_grid['kernel_size'][0],
                        vocab_size=vocab_size,
                        embedding_dim=embedding_dim,
                        maxlen=maxlen)

# Try 5 random combinations, each scored with 4-fold cross-validation
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

# Save and evaluate results (appended, so earlier runs are kept)
with open(output_file, 'a') as f:
    s = ('Running {} data set\nBest Accuracy : '
         '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
    # Use the explicit label, not the loop variable `source` left over
    # from an earlier cell, which would name a single source.
    output_string = s.format(
        dataset_label,
        grid_result.best_score_,
        grid_result.best_params_,
        test_accuracy)
    print(output_string)
    f.write(output_string)