In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import re
from nltk.corpus import stopwords
import pickle
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Embedding
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
stopWord=stopwords.words("english")
stopWord.remove('not')

In [4]:
df=pd.read_csv("Reviews.csv")

In [5]:
df.dropna(inplace=True,subset=["ProductId","Text","Score"])

In [6]:
df.drop("Id",axis=1,inplace=True)

In [7]:
reviews_df=df[["Text","Score"]]

In [8]:
def cleaning_noise(String):
    tags = re.compile('<.*?>')
    string_without_tags = re.sub(tags, " ", String)
    string_without_url = re.sub(r'[\S]*\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?|[\S]*@gmail','',string_without_tags)
    string_without_punc=re.sub(r'[^a-zA-Z]',' ',string_without_url)
    noise_free_review=[i for i in string_without_punc.split() if i not in stopWord ]
    noise_free_review=' '.join(noise_free_review)
    return noise_free_review

In [9]:
reviews_df["Text"]=reviews_df["Text"].apply(cleaning_noise)

In [10]:
reviews_df=reviews_df[reviews_df["Text"].apply(lambda x:len(x.split(" "))in range(1,250))]

In [11]:
corpus=reviews_df["Text"].values

In [12]:
def change_score(score):
    if(score<3):
        return 0
    else:
        return 1

In [13]:
y_data=reviews_df["Score"].apply(change_score).values

In [14]:
y_data

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [17]:
num_words=102354  #num_words = len(data.word_index) + 1

In [18]:
data=Tokenizer(num_words=num_words)
data.fit_on_texts(corpus)
x_data=data.texts_to_sequences(corpus)
x_data=pad_sequences(x_data,padding="pre")

In [20]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
x_data

array([[    0,     0,     0, ...,  8361,    12,    36],
       [    0,     0,     0, ...,  7874,    12,  5364],
       [    0,     0,     0, ...,  2146,  6690, 12790],
       ...,
       [    0,     0,     0, ...,     3,    38,    17],
       [    0,     0,     0, ...,   793,  3863,   126],
       [    0,     0,     0, ...,   702,  1164,  2464]])

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.40, random_state = 0)

In [24]:
model = Sequential()
model.add(Embedding(num_words,8))
model.add(LSTM(units = 50,return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(units = 50,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units = 50))
model.add(Dropout(0.2))
model.add(Dense(units = 1,activation='sigmoid'))
model.compile(optimizer="adam",loss='binary_crossentropy',metrics=["accuracy"])

In [25]:
Val_ACCURACY_THRESHOLD = 0.95
ACCURACY_THRESHOLD = 0.98
class myCallback(Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('val_accuracy') > Val_ACCURACY_THRESHOLD):
            self.model.stop_training = True
        elif(logs.get('accuracy') > ACCURACY_THRESHOLD):
            self.model.stop_training = True
callbacks = myCallback()


checkpointer = ModelCheckpoint(filepath="model.hdf5",monitor='val_accuracy',save_best_only=True)

In [26]:
model.fit(x_train,y_train,epochs = 5,batch_size=512,validation_data=(x_test,y_test),callbacks=[callbacks,checkpointer])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x204896c9b80>

In [27]:
with open("model.json", "w") as json_file:
    json_file.write(model.to_json())