In [1]:
#Import Data

data_path = 'reviews_Office_Products_5.json'

import json
import pandas as pd
Amazon_Data =[]
for line in open(data_path, 'r'):
    Amazon_Data.append(json.loads(line))

df = pd.DataFrame(Amazon_Data)
print(df)

           reviewerID        asin  \
0      A32T2H8150OJLU  B00000JBLH   
1      A3MAFS04ZABRGO  B00000JBLH   
2      A1F1A0QQP2XVH5  B00000JBLH   
3       A49R5DBXXQDE5  B00000JBLH   
4      A2XRMQA6PJ5ZJ8  B00000JBLH   
...               ...         ...   
53253  A1ODOGXEYECQQ8  B00KYA0RC2   
53254  A2XX2A4OJCDNLZ  B00KYA0RC2   
53255  A3LGT6UZL99IW1  B00KYA0RC2   
53256  A1XJOSJN6FHFO0  B00KYA0RC2   
53257   AAEVGE52KL0DJ  B00KYA0RC2   

                                           reviewerName helpful  \
0                                                   ARH  [3, 4]   
1                                      Let it Be "Alan"  [7, 9]   
2                                                Mark B  [3, 3]   
3                                          R. D Johnson  [7, 8]   
4                                   Roger J. Buffington  [0, 0]   
...                                                 ...     ...   
53253                                            Nuknuk  [0, 0]   
53254              

In [2]:
#Change types of variables to prepare for analysis

df['overall']=df['overall'].astype(int)
df['reviewText']=df['reviewText'].astype(str)
df = df.dropna()
df = df[['reviewText','overall']]

df.head(2)

Unnamed: 0,reviewText,overall
0,"I bought my first HP12C in about 1984 or so, a...",5
1,WHY THIS BELATED REVIEW? I feel very obliged t...,5


In [3]:
#Remove neutral ratings (3) and convert negative review to 0s and positibe reviews to 1s and drop overall rating

df = df[df['overall']!=3]

def sentiment(df):
    if(df['overall'] >=4): return 1
    elif(df['overall'] <=2): return 0

df['sentiment'] = df.apply(sentiment, axis =1)

df = df.drop(['overall'],axis=1)
df.head(5)

Unnamed: 0,reviewText,sentiment
0,"I bought my first HP12C in about 1984 or so, a...",1
1,WHY THIS BELATED REVIEW? I feel very obliged t...,1
2,I have an HP 48GX that has been kicking for mo...,0
3,I've started doing more finance stuff recently...,1
4,For simple calculations and discounted cash fl...,1


In [4]:
#Presence of Unusual Characters

review = df['reviewText']
characters =[]
for comment in review:
    for character in comment:
        if character not in characters:
            characters.append(character)
print(characters)

['I', ' ', 'b', 'o', 'u', 'g', 'h', 't', 'm', 'y', 'f', 'i', 'r', 's', 'H', 'P', '1', '2', 'C', 'n', 'a', '9', '8', '4', ',', 'd', 'e', 'v', 'l', '0', 'w', '.', 'c', 'p', 'S', 'G', 'W', 'k', '!', '(', '+', ')', 'T', 'x', "'", ';', 'Y', 'B', 'E', 'L', 'A', 'D', 'R', 'V', '?', 'O', '5', 'z', '6', '3', 'j', '-', 'q', '7', '"', 'N', '&', 'M', 'U', 'F', 'K', 'X', 'Q', 'J', '*', '$', '%', '#', '>', '/', ':', '[', ']', 'Z', '`', '=', '@', '~', '_', '|', '^', '}', '\\', '{', '\x1c', '\x08']


In [5]:
#Vocabulary Size & Tokenization

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['reviewText'])
print("Vocabulary size: " , len(tokenizer.word_index)+1)
#print(tokenizer.word_index)


Vocabulary size:  53620


In [6]:
#Proposed Embedded Length

53258**(1/4)

15.191344528674602

In [7]:
#Lengths

import numpy as np
review_len = []
for char_len in review:
    review_len.append(len(char_len.split(' ')))
review_max = np.max(review_len)
review_min = np.min(review_len)
review_med = np.median(review_len)

print("Maximum:", review_max , "Minimum:", review_min, "Median:", review_med)

Maximum: 5799 Minimum: 1 Median: 101.0


In [8]:
#Convert to lower case

for description in df.reviewText:
    description = description.lower()

In [9]:
#Padding

from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = tokenizer.texts_to_sequences(df)
padded = pad_sequences(sequences, maxlen=5799, padding = 'post', truncating = 'post')

padded [1]

array([12698,     0,     0, ...,     0,     0,     0])

In [10]:
#Test/Train Split

import numpy as np
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split

X = np.array(df) 
y = df.sentiment.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state= 20 , stratify = y)

print("Test: ", X_test.shape)
print("Train: ", X_train.shape)

Test:  (9554, 2)
Train:  (38215, 2)


In [11]:
#Convert to Array

X_train = np.asarray(X_train).astype(str)
y_train = np.asarray(y_train).astype(str)
X_test = np.asarray(X_test).astype(str)
y_test = np.asarray(y_test).astype(str)


In [12]:
#Export Files

pd.DataFrame(X_train).to_csv("X_train.csv")
pd.DataFrame(y_train).to_csv("y_train.csv")
pd.DataFrame(X_test).to_csv("X_test.csv")
pd.DataFrame(y_test).to_csv("y_test.csv")

In [18]:
#Creating Neural Network

import tensorflow
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.models import Sequential
from keras import layers
early_stop = EarlyStopping(patience=3)

# model = tensorflow.keras.Sequential(
# [
#     tensorflow.keras.layers.Embedding(53620, 15, input_length=2),
#     tensorflow.keras.layers.GlobalAveragePooling1D(),
#     tensorflow.keras.layers.Dense(100, activation='relu'),
#     tensorflow.keras.layers.Dense(50, activation='relu'),
#     tensorflow.keras.layers.Dense(1, activation= 'sigmoid')
# ])


model1 = Sequential()
model1.add(layers.Embedding(53620, 20)) #The embedding layer
model1.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model1.add(layers.Dense(3,activation='softmax'))

model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 20)          1072400   
                                                                 
 lstm_1 (LSTM)               (None, 15)                2160      
                                                                 
 dense_4 (Dense)             (None, 3)                 48        
                                                                 
Total params: 1,074,608
Trainable params: 1,074,608
Non-trainable params: 0
_________________________________________________________________


In [19]:
history = model1.fit(X_train, y_train, epochs = 20, validation_split = .2, callbacks=[early_stop],verbose= True)

Epoch 1/20


ValueError: in user code:

    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\losses.py", line 1930, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "C:\Users\avitr\anaconda3\lib\site-packages\keras\backend.py", line 5247, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)

    ValueError: `logits` and `labels` must have the same shape, received ((None, 3) vs (None, 1)).


In [None]:
#Model Evaluation

#history = model.fit(X_train, y_train, batch_size=32, epochs=num_epochs,validation_split=.3,callbacks=[early_stop], verbose=True)

num_epochs = 20

score = model.evaluate(X_test, y_test, verbose = 0)
print(f'Test loss: {score[0]}/ Test accuracy: {score[1]}')

model.save('SentimentAnalysisModel.h5')

my_model = load_model('SentimentAnalysisModel.h5')

predictions = mymodel.predict(X_test)

i=13

print("Predicted review text: ", X_test[i], "\n")
print("Predicted: ", "Negative" if predictions[i][0] >=.5 else "Positive", "review")
print("Actual", "Negative" if y_test[i][1]==0 else "Positive", "review" )

In [None]:
#import matplotlib.pyplot as plt
#def plot_graphs(history, string):
#    plt.plot(history.history[string])
#    plt.plot(history.history['val_'+ string])
#    plt.xlabel("Epochs")
#    plt.ylabel(string)
#    plt.legend([string, 'val_'+string])
#    plt.show()
#plot_graphs(history, "accuracy")
#plot_graphs(history, "loss")