In [457]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

### Import libraries

In [458]:
%matplotlib inline
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, InputLayer, Dense, Activation,Dropout
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler
import keras
from tensorflow.keras import backend as K
from scipy.linalg import eigh
from sklearn import decomposition

In [459]:
df = pd.read_csv("../yelp.csv")

In [460]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [489]:
x = df['text'].values
y = df['stars'].values

In [462]:
df['stars'].unique()

array([5, 4, 2, 3, 1], dtype=int64)

In [463]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [464]:
X_train.shape, y_train.shape, X_test.shape

((8000,), (8000,), (2000,))

In [465]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [466]:
y_test

array([3, 4, 2, ..., 3, 3, 4], dtype=int64)

### Numerical encoding

In [524]:
# cleaning data by removing punctuation
def text_process(text):
    remove_punctuation = [c for c in text if c not in string.punctuation]
    remove_punctuation = ''.join(remove_punctuation)
    return [word for word in remove_punctuation.split() if word.lower() not in stopwords.words('english')]

In [467]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec.fit(X_train)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)
X_train.size

665975

### PCA Implementation

In [468]:
pca_dim = TruncatedSVD(n_components=3)
pca_dim.fit(X_train)
X_train = pca_dim.transform(X_train)
pca_dim = TruncatedSVD(n_components=3)
pca_dim.fit(X_test)
X_test = pca_dim.transform(X_test)

### Preprocessing Data

In [469]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_train.shape

(8000, 3)

In [470]:
y_train.shape

(8000,)

In [471]:
y_train

array([4, 3, 2, ..., 3, 4, 4], dtype=int64)

In [472]:
# Encoding the labels and making them as the class value and finally converting them as categorical values.
y_train = keras.utils.to_categorical(y_train, num_classes=5)

### Building NN Model

In [473]:
# without pca
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dense(256, activation = "sigmoid"))
model.add(Dense(128, activation = "sigmoid"))
model.add(Dense(5, activation='softmax'))

opt = SGD(lr=0.1) 
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [474]:
model_history = model.fit(X_train, y_train, epochs=10)

Train on 8000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model Evaluation - Without PCA

In [486]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

In [508]:
# converting the reviews into a vector
vocab = CountVectorizer(analyzer=text_process).fit(x)
x = vocab.transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [523]:
# simple MLP
mlp = MLPClassifier(hidden_layer_sizes=(2,), activation='logistic', learning_rate='constant', learning_rate_init=0.1)
mlp.fit(x_train,y_train)
output = mlp.predict(x_test)
print("Test Prediction Accuracy: ", round(accuracy_score(y_test, output)*100,2), '%')

Test Prediction Accuracy:  78.11 %


### Model Evaluation - With PCA

In [478]:
prediction_hot = model.predict(X_test)
prediction = []

# convert from one-hot to prediction
for line in prediction_hot:
    prediction.append(np.argmax(line))

correct = 0
incorrect = 0

for i in prediction:
    if prediction[i] == y_test[i]:
        correct += 1
    else:
        incorrect += 1

accuracy = correct / y_test.shape[0]

print('Test Prediction Accuracy: ', accuracy * 100, '%')

Test Prediction Accuracy:  76.95 %
