In [0]:
# create a notebook for LSTM prediction model 

# 1. Load the chosen dataset and inspect the vocabulary size
# 2. Choose a MAX_VOCAB_SIZE in case the vocabulary turns out to be too large (keep only the MAX_VOCAB_SIZE most frequent entries to limit the curse of dimensionality)
# 3. Choose a MAX_SEQUENCE_LENGTH used to vectorize each review (Note: the splitter usually limits reviews to 400 + summary)

# DO NOT EDIT/ DELETE THIS BLOCK;



In [0]:
# Mount Google Drive at /content/drive so the project folder is reachable.
# The google.colab package is Colab-specific; skip this cell when running locally.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Set the working directory to the folder into which the zipped dataset was uploaded.
# Every dataset and result path used further down is relative to this directory,
# so this step is required both on Colab and locally.
# Colab (Drive mounted at /content/drive):
%cd /content/drive/My\ Drive/SNLP\ Project
# Local checkout alternative:
# %cd Source/repos/Sentiment-Analysis-using-Deep-Learning

/content/drive/My Drive/SNLP Project


In [3]:
# List the contents of the current working directory to verify that we are
# inside the project folder (the dataset folders and ModelResults/ should show up).
%ls

 [0m[01;34mamz_all_beauty[0m/                    meta_lstm_small_200dim.tsv
 [01;34mamz_all_electronics[0m/               [01;34mModelResults[0m/
 CleantData_Apr-03-2020_01-31.zip   [01;34mModelVisualization[0m/
 dataset_dumps.json                'Project Ideas.gdoc'
 Electronics_5.json.gz             'Project Proposal.gdoc'
 [01;34mFinalModelResults[0m/                [01;34m'Report MetaData'[0m/
[01;34m'Final Report'[0m/                    'Sentiment Analysis - Deep Learning.pptx'
 [01;34mLexical_analysis[0m/                  vecs_lstm_small_200dim.tsv


In [4]:
import numpy as np
import pandas as pd

# Dataset selection. Paths are relative to the working directory and follow the
# pattern folder_datestring/file_datestring.zip.
Train_ZipCSVFileName = 'amz_all_electronics/Data_Balanced_20000_Apr-03-2020_06-52/Train_20000_Apr-03-2020_06-52.zip'
Test_ZipCSVFileName = 'amz_all_electronics/Data_Balanced_20000_Apr-03-2020_06-52/Test_20000_Apr-03-2020_06-52.zip'

# Read each zipped CSV straight into a DataFrame.
df_train = pd.read_csv(Train_ZipCSVFileName)
df_test = pd.read_csv(Test_ZipCSVFileName)

# Print the column/dtype summary of the train split first, then the test split.
for frame in (df_train, df_test):
    frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         100000 non-null  float64
 1   reviewTime      100000 non-null  object 
 2   reviewerID      100000 non-null  object 
 3   asin            100000 non-null  object 
 4   reviewText      100000 non-null  object 
 5   summary         100000 non-null  object 
 6   unixReviewTime  100000 non-null  int64  
 7   reviewText_len  100000 non-null  int64  
 8   summary_len     100000 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 6.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         10000 non-null  float64
 1   reviewTime      10000 non-null  object 
 2   reviewerID      10000 non-null 

In [5]:

def binarize_ratings(reviews):
    """Drop neutral reviews and collapse the star rating into two classes.

    Rows whose ``overall`` rating equals 3 are removed. Ratings 2 are mapped
    to 1 (negative) and ratings 4 are mapped to 5 (positive); ratings 1 and 5
    are already at their target value.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame with a numeric ``overall`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched). The original row index is
        kept, not reset.
    """
    polar = reviews[reviews['overall'] != 3].copy()
    polar['overall'] = polar['overall'].replace({2: 1, 4: 5})
    return polar


# Apply the same label preparation to both splits.
df_train = binarize_ratings(df_train)
df_test = binarize_ratings(df_test)

# Length statistics of the remaining training reviews. In the recorded run the
# mean is about 148 and the max is 399, so a [summary + review] text limit of
# 400 comfortably covers every review.
df_train['reviewText_len'].describe()
	

count    80000.000000
mean       148.534925
std        109.884928
min          1.000000
25%         49.000000
50%        129.000000
75%        228.000000
max        399.000000
Name: reviewText_len, dtype: float64

In [6]:
# Only the last expression of a cell is echoed, so the first summary has to be
# printed explicitly; otherwise it is computed and silently discarded.
print(df_train['overall'].describe())
df_test['summary_len'].describe()

count    8000.000000
mean       14.786125
std         7.736512
min         2.000000
25%         9.000000
50%        11.000000
75%        20.000000
max        34.000000
Name: summary_len, dtype: float64

In [0]:
import os
import uuid

# Vocabulary cap: only the MAX_VOCAB_SIZE most frequent words are kept by the
# tokenizer; it is also the input dimension of the embedding layer.
MAX_VOCAB_SIZE = 50000
# Number of tokens kept per sample (summary + review) after padding/truncation.
MAX_SEQUENCE_LENGTH = 250
# Width of each learned word-embedding vector.
EMBEDDING_DIM = 200

# Every run writes into its own uniquely named results folder.
folderGUID = uuid.uuid4().hex

# Pure-Python directory creation: portable across shells/OSes, creates the
# ModelResults parent if it is missing and does not fail when re-run.
os.makedirs("ModelResults/v3_" + folderGUID, exist_ok=True)

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# Each training sample is the summary and the review body joined by a literal
# DELIM marker; the vocabulary is fitted on the training split only.
train_corpus = df_train['summary'] + ' DELIM ' + df_train['reviewText']

tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters='#$%&()*+<=>@[\\]^_`{|}~\t\n',
    lower=True,
)
tokenizer.fit_on_texts(train_corpus)

# word_index maps every token seen during fitting to its integer id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 29679 unique tokens.


In [10]:

# Preview the concatenated "summary DELIM review" strings that get vectorized.
combined_preview = df_train['summary'].values + ' DELIM ' + df_train['reviewText'].values
print(combined_preview)

['one star DELIM volum button were dead on arriv be awar of possibl defect'
 'plug fall out DELIM work great when i tape up the plug to the port i see whi this was for sale other than that it work fine'
 'do not buy DELIM did not work im sure the seller knew this befor he ship it out date and cannot be use at all those are word from the manufactur'
 ... 'five star DELIM the number of star say it all'
 'five star DELIM good product thank'
 'five star DELIM yet anoth awesom tripp lite product i put this into a rack case with a power amplifi']


In [11]:
# Turn every "summary DELIM review" string into a list of token ids, then
# pad/truncate each list to exactly MAX_SEQUENCE_LENGTH entries.
train_texts = df_train['summary'].values + ' DELIM ' + df_train['reviewText'].values
train_sequences = tokenizer.texts_to_sequences(train_texts)
X = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)


Shape of data tensor: (80000, 250)


In [12]:
# One-hot encode the two remaining rating values (one column per value, in
# sorted order). The dtype is pinned because pandas >= 2.0 returns boolean
# columns by default, and the custom metrics multiply y_true by y_pred.
Y = pd.get_dummies(df_train['overall'], dtype='float32').values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (80000, 2)


Functions for metrics measurement (recall, precision, F1)

In [13]:
# Use the backend that belongs to tensorflow.keras: the model in this notebook
# is built from tensorflow.keras layers, and mixing it with the standalone
# `keras` package is not supported.
from tensorflow.keras import backend as K

# NOTE: all three metrics sum over every element of the batch, i.e. over all
# label columns at once. With two-column one-hot labels and a softmax output,
# each sample contributes exactly one "positive" to y_true and one to the
# rounded y_pred, so recall, precision and F1 all come out equal to accuracy
# (as the recorded evaluation output shows). Use a per-class report for
# class-wise numbers.


def recall_m(y_true, y_pred):
    """Batch recall: true positives / actual positives.

    Both inputs are clipped to [0, 1] and rounded to 0/1 before counting.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon avoids a division by zero when the batch holds no positives
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision_m(y_true, y_pred):
    """Batch precision: true positives / predicted positives.

    Both inputs are clipped to [0, 1] and rounded to 0/1 before counting.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon avoids a division by zero when nothing is predicted positive
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_m(y_true, y_pred):
    """Batch F1 score: harmonic mean of precision_m and recall_m."""
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    # epsilon keeps the ratio finite when precision and recall are both zero
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

Using TensorFlow backend.


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

# Architecture: learned word embeddings -> one LSTM layer -> 2-way softmax.
model = Sequential([
    Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, input_length=X.shape[1]),
    LSTM(200, dropout=0.2),
    Dense(2, activation='softmax'),
])

# One-hot labels with a softmax output pair with categorical cross-entropy.
# (For exactly two classes the value equals the previous binary cross-entropy,
# but this pairing stays correct if more classes are ever added.)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', f1_m, precision_m, recall_m],
)

# run for a small number of epochs, then save
epochs = 3

# No validation data is passed, so `history` only contains training metrics.
history = model.fit(X, Y, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [0]:
# Persist the trained model inside this run's results folder.
# Background on resuming training from a saved model:
# https://stackoverflow.com/questions/45424683/how-to-continue-training-for-a-saved-and-then-loaded-keras-model

run_id = str(folderGUID)
filePath = "ModelResults/v3_" + run_id + "/model.h5"
model.save(filePath)
print("Saved model to disk : " + run_id)



Saved model to disk : 304198720d8b4005829fd39ef76638b0


In [16]:
# Run additional training if necessary & remember to re-save the model afterwards
# (this cell does not save anything itself).
# Optional: resume from a model stored by an earlier run instead of the in-memory one.
# NOTE(review): the model is compiled with custom metric functions, so load_model
# will presumably need them passed via custom_objects - confirm before enabling.
#from tensorflow.keras.models import load_model
#filePath = 'ModelResults/5eab2088768f4439824f997f86e64122/model.h5'
# Load the model
#model = load_model(filePath)

# Train more on the current model. This rebinds `history`, so the loss/accuracy
# plots further down only show the epochs of this most recent fit() call.
history = model.fit(X, Y, epochs=epochs)



Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:


# Vectorize the test split with the tokenizer fitted on the training data,
# using the same "summary DELIM review" layout and sequence length.
test_texts = df_test['summary'].values + ' DELIM ' + df_test['reviewText'].values
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_test.shape)

# One-hot labels; dtype pinned because pandas >= 2.0 returns boolean columns
# by default and the custom metrics multiply y_true by y_pred.
Y_Test = pd.get_dummies(df_test['overall'], dtype='float32').values
print('Shape of label tensor:', Y_Test.shape)

# evaluate() returns the loss followed by the metrics in compile() order.
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, Y_Test)

for metric_name, metric_value in (
    ("loss", loss),
    ("accuracy", accuracy),
    ("f1 score", f1_score),
    ("precision", precision),
    ("recall", recall),
):
    print(metric_name, metric_value)

Shape of data tensor: (8000, 250)
Shape of label tensor: (8000, 2)
loss 0.13258370757102966
accuracy 0.9523749947547913
f1 score 0.9523749947547913
precision 0.9523749947547913
recall 0.9523749947547913


In [0]:
import matplotlib.pyplot as plt


# `history` comes from model.fit() called without validation data, so both
# curves below are TRAINING curves (the accuracy line is not a test metric).
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.xlabel('epoch')
plt.legend()
plt.show()

plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.xlabel('epoch')
plt.legend()
plt.show()


In [0]:
from sklearn.metrics import classification_report

# Ground-truth ratings of the test split (1.0 = negative, 5.0 = positive).
y_labels = df_test['overall'].values

# Per-review class probabilities from the softmax layer.
y_pred_softmax = model.predict(X_test)

# Map the winning column back to the rating values used in y_labels:
# argmax 0 -> 1.0 (negative class), argmax 1 -> 5.0 (positive class).
y_pred = [1.0 if np.argmax(local_pred) == 0 else 5.0 for local_pred in y_pred_softmax]

report = classification_report(y_labels, y_pred, target_names=['Classified Neg','Classified Pos'])
print(report)

[[0.28729853 0.71270144]]
1
