# Data preprocessing

### Directly train your models by using the following data in your network models:

train_padded, train_rating - Training sequences and overall ratings respectively <br />
test_padded, test_rating - Testing sequences and overall ratings respectively

In [58]:
# Importing the libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import math
from nltk.corpus import stopwords
import string
import re
%matplotlib inline

In [59]:
# Importing the dataset

dataset = pd.read_csv("Video_Games_CSV.csv")

dataset.head()


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,"06 14, 2011"


In [60]:
"""
A utility function to remove non alphabetical characters from the text.

"""

def clean_noncharacters(text):
    # Removing punctuations
    text  = "".join([char for char in text if char not in string.punctuation])
    # Removing numerics
    text = re.sub('[0-9]+', '', text)
    return text


"""
A utility function to remove URL links from the text.

"""

def clean_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)


"""
A utility function to remove stopwords from the text.

"""

def clean_stopwords(text):
    stop_words = set(stopwords.words('english'))
    res = [w for w in text.split() if not w in stop_words]
    res_string = " ".join(str(x) for x in res)
    return res_string

In [61]:
# Apply above preprocessing methods to the dataset

dataset["reviewText"] = dataset["reviewText"].apply(lambda x : clean_noncharacters(str(x)))
dataset["reviewText"] = dataset["reviewText"].apply(lambda x : clean_URL(str(x)))
dataset["reviewText"] = dataset["reviewText"].apply(lambda x : clean_stopwords(str(x)))

dataset.head(10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing game struggle games windows live bu...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If like rally cars get game funIt oriented Eur...,4.0,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",st shipment received book instead gamend shipm...,1.0,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]",I got version instead PS version turned mistak...,3.0,"awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I Dirt Xbox okay game I started playing games ...,4.0,DIRT 3,1308009600,"06 14, 2011"
5,A2UTRVO4FDCBH6,700099867,A.R.G.,"[0, 0]",Overall well done racing game good graphics ti...,4.0,"Good racing game, terrible Windows Live Requir...",1368230400,"05 11, 2013"
6,AN3YYDZAS3O1Y,700099867,Bob,"[11, 13]",Loved playing Dirt I thought graphics good Pur...,5.0,A step up from Dirt 2 and that is terrific!,1313280000,"08 14, 2011"
7,AQTC623NCESZW,700099867,Chesty Puller,"[1, 4]",I cant tell piece dog game Like everything els...,1.0,Crash 3 is correct name AKA Microsoft,1353715200,"11 24, 2012"
8,A1QJJU33VNC4S7,700099867,D@rkFX,"[0, 1]",I initially gave one star crashing constantly ...,4.0,A great game ruined by Microsoft's account man...,1352851200,"11 14, 2012"
9,A2JLT2WY0F2HVI,700099867,D. Sweetapple,"[1, 1]",I still havent figured one Did everything inst...,2.0,Couldn't get this one to work,1391817600,"02 8, 2014"


In [62]:
# Removing the rows with "overall" values not in range 1.0-5.0

dataset = dataset[dataset['overall'].apply(lambda x: x in ["1.0", "2.0", "3.0", "4.0", "5.0"])]

In [63]:
# Shortening the dataset by removing all the columns except reviewText and overall

dataset_short = dataset[["reviewText","overall"]].copy()

In [64]:

dataset_short.head()

Unnamed: 0,reviewText,overall
0,Installing game struggle games windows live bu...,1.0
1,If like rally cars get game funIt oriented Eur...,4.0
2,st shipment received book instead gamend shipm...,1.0
3,I got version instead PS version turned mistak...,3.0
4,I Dirt Xbox okay game I started playing games ...,4.0


In [65]:
# Importing the tokenizer and pad_sequences libraries

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Setting the random seed so that rows shuffle in the same way in every session.

tf.random.set_seed(100)


# vocab_size - size of the vocabulary (unique words in the data corpus)
# trunc_type - whether to truncate the sentence from behind or start (in case if sentence length
#                                                                     greater than vector size.)
# padding_type - whether to pad the short sentences from behind or start
# oov_tok - replace the out of vocab word with a token
# training_size - size of training data
# max_length - maximum length of a sentence sequence

vocab_size = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV_TOKEN>"
training_size = int(len(dataset_short)*0.6)
max_length = 100


# Shuffle the rows
dataset_short = dataset_short.sample(frac=1).reset_index(drop=True)

# Splitting the train and the test sentences list.

temp = [str(x) for x in dataset_short["reviewText"].tolist()]
train_reviews = temp[:training_size]
test_reviews = temp[training_size:]

# Splitting the train and the test labels list.

temp2 = [int(float(x)) for x in dataset_short["overall"].tolist()]
train_rating = temp2[:training_size]
test_rating = temp2[training_size:]

In [66]:
# Tokenize the words

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_reviews)

# word_index = tokenizer.word_index

# Convert sentence reviews to sequences
train_sequences = tokenizer.texts_to_sequences(train_reviews)
# Convert sequences to padded sequences
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


# Convert sentence reviews to sequences
test_sequences = tokenizer.texts_to_sequences(test_reviews)
# Convert sequences to padded sequences
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [67]:
# Converting all the data to numpy arrays

train_padded = np.array(train_padded)
train_rating = np.array(train_rating)

test_padded = np.array(test_padded)
test_rating = np.array(test_rating)