# CSCI 635 - Introduction to Machine learning - Supervised learning

This notebook trains neural networks to perform sentiment analysis on video game reviews.

In [None]:
# Importing the libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import math
from nltk.corpus import stopwords
import string
import re
%matplotlib inline

In [None]:
# Importing the dataset
# The CSV is read from a path relative to the working directory; later cells
# use its "reviewText" and "overall" columns.

dataset = pd.read_csv("Video_Games_CSV.csv")

# Preview the first rows of the loaded frame.
dataset.head()


The utility functions used to preprocess the review text are defined below:

In [None]:
def clean_noncharacters(text):
    """
    A utility function to remove non alphabetical characters from the text.

    Every character listed in ``string.punctuation`` is dropped, then every
    run of ASCII digits is removed. Whitespace is left untouched.
    """
    # Removing punctuations
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(punctuation_table)
    # Removing numerics
    return re.sub('[0-9]+', '', without_punctuation)


def clean_URL(text):
    """
    A utility function to remove URL links from the text.

    Removes every substring that starts with ``http://``, ``https://`` or
    ``www.`` and runs up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


# Lazily-built cache of the NLTK English stop words. Building the set once
# avoids reloading the corpus for every single review.
_ENGLISH_STOP_WORDS = None


def clean_stopwords(text):
    """
    A utility function to remove stopwords from the text.

    The text is split on whitespace, every token that is an English stop word
    is dropped, and the remaining tokens are joined with single spaces.
    Matching is case-insensitive (the stop-word list is compared against the
    lower-cased token) so that capitalised words such as "The" or "I" are
    removed as well; the casing of the surviving tokens is not modified.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    kept_words = [w for w in text.split() if w.lower() not in _ENGLISH_STOP_WORDS]
    return " ".join(kept_words)

In [None]:
# Apply above preprocessing methods to the dataset
#
# Order matters: URLs must be removed BEFORE punctuation is stripped, because
# stripping punctuation deletes the "://" and "." that the URL pattern relies
# on, after which no URL can be recognised any more.
#
# Missing reviews are replaced by an empty string first; converting them with
# str() would otherwise inject the literal word "nan" into the corpus.

reviews = dataset["reviewText"].fillna("").astype(str)
dataset["reviewText"] = (
    reviews
    .apply(clean_URL)
    .apply(clean_noncharacters)
    .apply(clean_stopwords)
)

dataset.head(5)

As studied previously, not all reviews have a numeric overall rating; some contain text in that column instead. We therefore exclude those reviews from the training process:

In [None]:
# Removing the rows with "overall" values not in range 1.0-5.0
#
# The column can hold a mix of strings and floats (pandas may parse a column
# that contains both numbers and text chunk by chunk), so every value is
# compared through its string form: both the string "5.0" and the float 5.0
# are kept, while textual or missing ratings are dropped.

valid_ratings = ["1.0", "2.0", "3.0", "4.0", "5.0"]
dataset = dataset[dataset['overall'].astype(str).isin(valid_ratings)]

Remove all the columns that play no role in training the network.

In [None]:
# Keep only the review text and its rating; .copy() gives an independent
# frame so columns can be added to it without touching `dataset`.

kept_columns = ["reviewText", "overall"]
dataset_short = dataset.loc[:, kept_columns].copy()

We assign a polarity value to each review which determines the overall sentiment of the review.

The mapping is as below:

- Overall rating > 3: Positive sentiment (+1)
- Overall rating == 3: Neutral sentiment (0)
- Overall rating < 3: Negative sentiment (-1)

In [None]:
def apply_polarity(row):
    """
    Map the "overall" rating of a row to a sentiment polarity.

    The rating is parsed as a float and truncated to an integer, then:
    above 3 -> +1 (positive), exactly 3 -> 0 (neutral), below 3 -> -1 (negative).
    """
    rating = int(float(row["overall"]))
    if rating < 3:
        return -1
    return 1 if rating > 3 else 0

In [None]:
# Label every review row-wise with its sentiment polarity.
dataset_short['polarity'] = dataset_short.apply(apply_polarity, axis=1)

In [None]:
# Preview the first 20 rows to check the new "polarity" column against "overall".
dataset_short.head(20)

A utility function to plot the learning curves from a neural network training history.

In [None]:
def plot_graphs(history, string):
    """
    Plot a training metric together with its validation counterpart.

    history -- object whose ``history`` attribute maps metric names to
               per-epoch values (as returned by ``model.fit``).
    string  -- name of the metric to plot; the validation curve is read from
               the key ``'val_' + string``.
    """
    curve_names = [string, 'val_' + string]
    for curve in curve_names:
        plt.plot(history.history[curve])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(curve_names)
    plt.show()

Preprocessing the data so that it can be used to train a neural network:

In [None]:
# Importing the tokenizer and pad_sequences libraries

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Setting the random seed so that rows shuffle in the same way in every session.
# tf.random.set_seed only seeds TensorFlow; the pandas shuffle below draws from
# a different generator, so the same seed is passed to it explicitly.

random_seed = 100
tf.random.set_seed(random_seed)


# vocab_size - maximum number of words kept by the tokenizer (passed as num_words)
# trunc_type - whether to truncate a sequence from the end ('post') or the start ('pre')
#              when it is longer than max_length
# padding_type - whether to pad a short sequence at the end ('post') or the start ('pre')
# oov_tok - token that replaces out-of-vocabulary words
# training_size - number of rows used for training (60% of the data)
# max_length - maximum length of a sentence sequence

vocab_size = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV_TOKEN>"
training_size = int(len(dataset_short) * 0.6)
max_length = 100


# Shuffle the rows (seeded, so the train/test split is reproducible)
dataset_short = dataset_short.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# Splitting the train and the test sentences list.

all_reviews = [str(review) for review in dataset_short["reviewText"].tolist()]
train_reviews = all_reviews[:training_size]
test_reviews = all_reviews[training_size:]

# Splitting the train and the test labels list.

all_polarities = dataset_short["polarity"].tolist()
train_rating = all_polarities[:training_size]
test_rating = all_polarities[training_size:]

In [None]:
# Build the vocabulary from the training reviews only; the same fitted
# tokenizer is then used to encode both the training and the test reviews.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_reviews)

# Padding/truncation settings shared by both splits.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Reviews -> integer sequences
train_sequences = tokenizer.texts_to_sequences(train_reviews)
test_sequences = tokenizer.texts_to_sequences(test_reviews)

# Integer sequences -> fixed-length padded sequences
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Converting all the data to numpy arrays

from tensorflow.keras.utils import to_categorical

train_padded = np.array(train_padded)
test_padded = np.array(test_padded)
train_rating = np.array(train_rating)
test_rating = np.array(test_rating)


# One-hot encode the polarity labels.
#
# The labels are -1 / 0 / +1, which are not valid class indices as they stand.
# Taking them modulo the number of classes makes the mapping explicit instead
# of relying on negative indexing inside to_categorical:
#     0 -> column 0 (neutral), +1 -> column 1 (positive), -1 -> column 2 (negative)
# The prediction-decoding cells use the same column order.

num_classes = 3
train_rating_encoded = to_categorical(train_rating % num_classes, num_classes=num_classes)
test_rating_encoded = to_categorical(test_rating % num_classes, num_classes=num_classes)

# Sentiment analysis using dense network

In [None]:

embedding_dim = 100

# Dense baseline: embed the tokens, average the embeddings over the sequence,
# then classify through two small hidden layers into 3 softmax outputs.
dnn_model = tf.keras.Sequential()
dnn_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
dnn_model.add(tf.keras.layers.GlobalAveragePooling1D())
dnn_model.add(tf.keras.layers.Dense(10, activation="relu"))
dnn_model.add(tf.keras.layers.Dense(5, activation="relu"))
dnn_model.add(tf.keras.layers.Dense(3, activation="softmax"))

# Compile the model (labels are one-hot encoded, hence categorical cross-entropy)
dnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Uncomment the line below for the hyperparameter-tuned run
# num_epochs = 25
num_epochs = 10


# Learning rate scheduler callback function
def scheduler(num_epochs):
    """
    Return the learning rate to use for a given epoch.

    num_epochs -- despite its name, this is the index of the current epoch
                  handed to the function by LearningRateScheduler, not the
                  total number of epochs (it shadows the global of that name).

    The rate is 0.001 for the first three epochs (indices 0-2) and
    0.0001 * exp(0.1 * (10 - epoch)) afterwards. A plain Python float is
    returned (math.exp rather than tf.math.exp), because the callback expects
    a float and not a tensor.
    """
    if num_epochs < 3:
        return 0.001
    return 0.0001 * math.exp(0.1 * (10 - num_epochs))

lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)


# Early stopping on both the training loss and the validation loss.
# Change the patience values to 2 for the hyperparameter tuned model.
early_stopping_callback_loss = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1)
early_stopping_callback_val_loss = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)


# Add lr_scheduler_callback to this list to enable learning rate scheduler decay.
dnn_callbacks = [early_stopping_callback_loss, early_stopping_callback_val_loss]

dnn_history = dnn_model.fit(
    train_padded,
    train_rating_encoded,
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=dnn_callbacks,
    verbose=1,
)

for metric_name in ("accuracy", "loss"):
    plot_graphs(dnn_history, metric_name)

In [None]:
test_predictions = dnn_model.predict(test_padded)

# Assigning the suitable class to the test data from 3 softmax class probabilities.
#
# Output column -> polarity: 0 -> neutral (0), 1 -> positive (+1), 2 -> negative (-1).
# argmax always selects exactly one class per row, even when two probabilities
# tie, so the prediction list always has the same length as the test labels.

index_to_polarity = np.array([0, 1, -1])
test_pred_classes = index_to_polarity[np.argmax(test_predictions, axis=1)].tolist()

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the dense model, scaled by 100 to print a percentage.
dnn_accuracy_percent = accuracy_score(test_rating, test_pred_classes) * 100
print("The accuracy of the model is: {}".format(dnn_accuracy_percent))

# Sentiment Analysis using LSTM based network.

In [None]:
embedding_dim = 100

# Recurrent model: embed the tokens, summarise the sequence with an LSTM,
# then classify through one hidden layer into 3 softmax outputs.
lstm_model = tf.keras.Sequential()
lstm_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
lstm_model.add(tf.keras.layers.LSTM(18))
lstm_model.add(tf.keras.layers.Dense(10, activation="relu"))
lstm_model.add(tf.keras.layers.Dense(3, activation="softmax"))

# Compile the model (labels are one-hot encoded, hence categorical cross-entropy)
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Uncomment the line below for the hyperparameter-tuned run
# num_epochs = 25
num_epochs = 10


# Learning rate scheduler callback function
def scheduler(num_epochs):
    """
    Return the learning rate to use for a given epoch.

    num_epochs -- despite its name, this is the index of the current epoch
                  handed to the function by LearningRateScheduler, not the
                  total number of epochs (it shadows the global of that name).

    The rate is 0.001 for the first three epochs (indices 0-2) and
    0.0001 * exp(0.1 * (10 - epoch)) afterwards. A plain Python float is
    returned (math.exp rather than tf.math.exp), because the callback expects
    a float and not a tensor.
    """
    if num_epochs < 3:
        return 0.001
    return 0.0001 * math.exp(0.1 * (10 - num_epochs))

lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)


# Early stopping on both the training loss and the validation loss.
# Change the patience values to 2 for the hyperparameter tuned model.
early_stopping_callback_loss = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1)
early_stopping_callback_val_loss = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)


# The learning rate scheduler is enabled for this model; remove
# lr_scheduler_callback from the list to train with a constant rate.
lstm_callbacks = [
    early_stopping_callback_loss,
    early_stopping_callback_val_loss,
    lr_scheduler_callback,
]

lstm_history = lstm_model.fit(
    train_padded,
    train_rating_encoded,
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=lstm_callbacks,
    verbose=1,
)

for metric_name in ("accuracy", "loss"):
    plot_graphs(lstm_history, metric_name)

In [None]:
test_predictions = lstm_model.predict(test_padded)

# Assigning the suitable class to the test data from 3 softmax class probabilities.
#
# Output column -> polarity: 0 -> neutral (0), 1 -> positive (+1), 2 -> negative (-1).
# argmax always selects exactly one class per row, even when two probabilities
# tie, so the prediction list always has the same length as the test labels.

index_to_polarity = np.array([0, 1, -1])
test_pred_classes = index_to_polarity[np.argmax(test_predictions, axis=1)].tolist()

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the LSTM model, scaled by 100 to print a percentage.
lstm_accuracy_percent = accuracy_score(test_rating, test_pred_classes) * 100
print("The accuracy of the model is: {}".format(lstm_accuracy_percent))