# <u>Yelp Rating Prediction Using Tensorflow</u>

## **Modeling (relu and adam):**

### *Libraries*

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

#import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
#import sys
#import sklearn as sk
import pandas as pd
import os
#import sklearn.feature_extraction.text as sk_text
#import re

from collections.abc import Sequence
from sklearn import preprocessing
#import shutil
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc  
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense#, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn import metrics as mt




### *Functions*

In [2]:
# from labs

# One-hot encode a text column (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue).
def encode_text_dummy(df, name):
    """Replace column `name` of `df` with one indicator column per distinct value.

    Each new column is called "<name>-<value>". The original column is dropped.
    `df` is modified in place and nothing is returned.
    """
    one_hot = pd.get_dummies(df[name])
    for category, indicator in one_hot.items():
        df[f"{name}-{category}"] = indicator
    df.drop(columns=name, inplace=True)


# Encode text values as integer class codes (one code per distinct value).
def encode_text_index(df, name):
    """Overwrite column `name` of `df` with integer codes from a LabelEncoder.

    `df` is modified in place. Returns the encoder's `classes_` array, i.e. the
    original values in code order, so a code can be mapped back to its label.
    Note: scikit-learn's LabelEncoder assigns codes 0..n_classes-1.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Standardise a numeric column to z-scores.
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column `name` of `df` with (value - mean) / sd, in place.

    `mean` and `sd` default to the column's own mean and standard deviation
    (pandas `.std()`); pass them explicitly to reuse statistics computed elsewhere.
    """
    center = df[name].mean() if mean is None else mean
    spread = df[name].std() if sd is None else sd
    df[name] = (df[name] - center) / spread


# Fill the missing values of a column with that column's median.
def missing_median(df, name):
    """Replace missing entries in column `name` of `df` with the column median, in place."""
    df[name] = df[name].fillna(df[name].median())


# Fill the missing values of a column with a caller-supplied default.
def missing_default(df, name, default_value):
    """Replace missing entries in column `name` of `df` with `default_value`, in place."""
    column = df[name]
    df[name] = column.fillna(default_value)


# Split a Pandas dataframe into the float32 x, y arrays that TensorFlow needs
def to_xy(df, target):
    """Return `(x, y)` as float32 NumPy arrays built from `df`.

    x holds every column except `target`, in their original order.
    y depends on the dtype of the target column:
      * int64 / int32 -> classification: one-hot matrix from `pd.get_dummies`
      * anything else -> regression: the target values as a 1-D array
    """
    feature_cols = [col for col in df.columns if col != target]

    # Work out the dtype of the target column (first entry if a sequence comes back).
    target_dtype = df[target].dtypes
    if isinstance(target_dtype, Sequence):
        target_dtype = target_dtype[0]

    features = df[feature_cols].values.astype(np.float32)
    if target_dtype in (np.int64, np.int32):
        # Classification
        labels = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        # Regression
        labels = df[target].values.astype(np.float32)
    return features, labels

# Format a duration in seconds as H:MM:SS.ss
def hms_string(sec_elapsed):
    """Return `sec_elapsed` (seconds) as "h:mm:ss.ss" with zero-padded minutes and seconds."""
    hours = int(sec_elapsed / (60 * 60))
    minutes = int((sec_elapsed % (60 * 60)) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"


# Regression chart: expected vs. predicted values on one line plot.
def chart_regression(pred, y, sort=True):
    """Plot expected values `y` and predictions `pred` as two lines.

    pred -- 1-D sequence of predictions, same length as the flattened `y`
    y    -- array of expected values; it is flattened before plotting
    sort -- when True, rows are ordered by the expected value first, so the
            'expected' line is monotonic and the prediction line can be compared to it
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    for column, legend_label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=legend_label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Drop every row whose value lies sd or more standard deviations from the column mean
def remove_outliers(df, name, sd):
    """Remove outlier rows from `df` in place, judged on column `name`.

    A row is dropped when |value - mean| >= sd * std, so values exactly on the
    threshold are removed as well.
    """
    deviation = np.abs(df[name] - df[name].mean())
    cutoff = sd * df[name].std()
    outlier_rows = df.index[deviation >= cutoff]
    df.drop(outlier_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` -> `normalized_low` and
    `data_high` -> `normalized_high`.

    data_low / data_high -- the input range. Each one defaults independently to
    the column's own minimum / maximum, so either bound may be supplied alone.
    """
    # Default each bound separately: previously data_high was only filled in when
    # data_low was missing, so passing just data_low failed on None and passing
    # just data_high was silently overwritten.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
    
# Plot an ROC curve. pred - the predicted scores, y - the expected (true) labels.
def plot_roc(pred,y):
    """Draw the ROC curve for scores `pred` against true labels `y`.

    The area under the curve is shown in the legend, and the dashed diagonal
    marks the chance line from (0, 0) to (1, 1).
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')

    # Axis limits first, then the text decorations.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

### *Load dataframe from file*

In [3]:
# Read the prepared feature table from disk. The preview below shows a `stars`
# column followed by one numeric column per word.
df = pd.read_csv("data/df_data.csv")
# Last expression of the cell: display the first rows as a sanity check.
df.head()

Unnamed: 0,stars,able,absolutely,actually,almost,also,always,amazing,amount,another,...,wont,work,working,worth,would,wouldnt,wrong,year,yet,youre
0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168147,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128253,...,0.0,0.0,0.0,0.0,0.090665,0.0,0.0,0.0,0.0,0.0
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.117374,0.0,0.0,0.139791,0.0,0.0,0.0


### *Use dataframe to configure x, y*

In [4]:
# `stars` is the label; every other column of df becomes a feature in x.
x, y = to_xy(df, target='stars')

### *x info*

In [None]:
# Feature matrix: report its dimensions, then print the array itself.
print(f'shape: {x.shape}')
print(x)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### *y info*

In [None]:
# Label matrix: report its dimensions, then print the array itself.
print(f'shape: {y.shape}')
print(y)

array([[0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

### *Separate test and train sets*

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

### *Train model*

In [None]:
# Imported here so this cell stays runnable on its own; Keras documents
# clear_session() for the "many models created in a loop" pattern used below.
from tensorflow.keras import backend as K

# A single checkpoint callback is reused by every run and writes to one file.
# NOTE(review): presumably shared so the file ends up holding the lowest-val_loss
# weights seen across all runs -- confirm this is the intent.
checkpointer = ModelCheckpoint(filepath="dnn/relu_adam_best_weights.hdf5", verbose=0, save_best_only=True)

# NOTE(review): validation_data below is the test split, so early stopping and
# checkpoint selection are tuned on the same rows that the results section scores.
# Consider carving a separate validation set out of x_train / y_train.
for i in range(5):
    print('sequence ', i)

    # Release the Keras global state left by the previous run's model before
    # building the next one, so five models do not accumulate in one kernel.
    K.clear_session()

    # create sequential model: two relu hidden layers, softmax over the classes
    model = Sequential()
    model.add(Dense(30, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Fresh early-stopping monitor per run: stop once val_loss has failed to
    # improve by at least 1e-3 for 2 consecutive epochs.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')

    model.fit(x_train, y_train, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer], verbose=2, epochs=100)

sequence  0




Epoch 1/100

144062/144062 - 156s - loss: 0.8790 - val_loss: 0.8679 - 156s/epoch - 1ms/step
Epoch 2/100


  saving_api.save_model(


144062/144062 - 139s - loss: 0.8659 - val_loss: 0.8663 - 139s/epoch - 964us/step
Epoch 3/100
144062/144062 - 135s - loss: 0.8630 - val_loss: 0.8649 - 135s/epoch - 935us/step
Epoch 4/100
144062/144062 - 135s - loss: 0.8617 - val_loss: 0.8628 - 135s/epoch - 937us/step
Epoch 5/100
144062/144062 - 135s - loss: 0.8608 - val_loss: 0.8645 - 135s/epoch - 939us/step
Epoch 6/100
144062/144062 - 134s - loss: 0.8602 - val_loss: 0.8616 - 134s/epoch - 928us/step
Epoch 7/100
144062/144062 - 133s - loss: 0.8598 - val_loss: 0.8613 - 133s/epoch - 923us/step
Epoch 8/100
144062/144062 - 133s - loss: 0.8595 - val_loss: 0.8605 - 133s/epoch - 923us/step
Epoch 9/100
144062/144062 - 132s - loss: 0.8592 - val_loss: 0.8618 - 132s/epoch - 915us/step
Epoch 10/100
144062/144062 - 132s - loss: 0.8589 - val_loss: 0.8609 - 132s/epoch - 915us/step
Epoch 10: early stopping
sequence  1


MemoryError: Unable to allocate 6.87 GiB for an array with shape (4609973, 400) and data type float32

### *Load best model*

In [None]:
# After the loop `model` is whichever network was built last; overwrite its
# weights with the ones stored in the checkpoint file.
print('Training finished...Loading the best model')
model.load_weights(filepath='dnn/relu_adam_best_weights.hdf5')

Training finished...Loading the best model


### *Optional start point*

In [None]:
# if starting here:  load library, functions, configure x,y, and train/test split cells before running this cell
# The snippet is kept disabled inside a string; remove the triple quotes to run it.
# It uses the same variable it defines (`path`) and points at the checkpoint file
# written by the training cell, which is the only model file this notebook saves.
'''path = "./dnn/"
model = load_model(os.path.join(path, "relu_adam_best_weights.hdf5"))'''

'path = "./dnn/"\nmodel = load_model(os.path.join(save_path, "relu_adam.hdf5"))'

### *Model prediction*

In [None]:

# Score the held-out rows; each output row holds one value per class column of y.
prediction = model.predict(x_test)
print(f"Shape: {prediction.shape}")
print(prediction)

Shape: (1536658, 5)
[[0.00987801 0.01590877 0.03959442 0.2132281  0.7213908 ]
 [0.0122886  0.04311397 0.13297042 0.43753764 0.37408945]
 [0.8951734  0.04999074 0.01055799 0.00858875 0.03568922]
 ...
 [0.00456017 0.01131928 0.02995954 0.13104063 0.8231203 ]
 [0.02549307 0.06252662 0.14071798 0.35420138 0.4170609 ]
 [0.01199265 0.01230736 0.01686402 0.13762634 0.8212096 ]]


## **Results:**

### *Predictions vs. Actual*

In [None]:
# Collapse each row to the index of its largest entry: the most probable class for
# the predictions, the hot column for the one-hot labels.
# Note: these are 0-based column positions in the one-hot encoding, not raw star values.
predicted_stars = prediction.argmax(axis=1)
true_stars = y_test.argmax(axis=1)
print(f"Predictions: {predicted_stars}")
print(f"Actual: {true_stars}")

Predictions: [4 3 0 ... 4 4 4]
Actual: [4 2 0 ... 4 4 4]


### *Accuracy*

In [None]:
# Fraction of test rows whose predicted class index equals the true one.
accuracy = mt.accuracy_score(y_true=true_stars, y_pred=predicted_stars)
print(f"Accuracy score: {accuracy}")

Accuracy score: 0.5505401982744371


### *Precision*

In [None]:
# Per-class precision, averaged with each class weighted by its support.
precision = mt.precision_score(y_true=true_stars, y_pred=predicted_stars, average="weighted")
print(f"Precision score: {precision}")


Precision score: 0.6130214042304802


### *Recall score*

In [None]:
# Per-class recall, averaged with each class weighted by its support.
recall = mt.recall_score(y_true=true_stars, y_pred=predicted_stars, average="weighted")
print(f"Recall score: {recall}")

Recall score: 0.6465147091935877


### *F1 score*

In [None]:
# Per-class F1, averaged with each class weighted by its support.
f1 = mt.f1_score(y_true=true_stars, y_pred=predicted_stars, average="weighted")
print(f"F1 score: {f1}")

F1 score: 0.6182259225200087


### *Log loss*

In [None]:
# Log loss is computed on the predicted probabilities against the one-hot labels,
# not on the arg-maxed class indices.
log_score = mt.log_loss(y_true=y_test, y_pred=prediction)
print(f"Log loss score: {log_score}")

Log loss score: 0.8546070865311683


### *Classification report*

In [None]:
# Per-class precision / recall / F1 and support; rows are labelled by class index.
report = mt.classification_report(y_true=true_stars, y_pred=predicted_stars)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.79      0.73    215721
           1       0.41      0.24      0.30    122629
           2       0.45      0.28      0.35    159154
           3       0.50      0.36      0.42    335342
           4       0.72      0.89      0.80    703812

    accuracy                           0.65   1536658
   macro avg       0.55      0.51      0.52   1536658
weighted avg       0.61      0.65      0.62   1536658



### *Mean Squared Error (MSE)*

In [None]:
# scikit-learn's signature is (y_true, y_pred); pass the labels first, as the other
# metric cells do. The value is unchanged because squared error is symmetric.
# This is computed over the one-hot labels and predicted probabilities, so it
# measures probability error, not error in star units.
mse = mt.mean_squared_error(y_test, prediction)
print('Final score (MSE): {}'.format(mse))


Final score (MSE): 0.09246926009654999


### *Root Mean Squared Error (RMSE)*

In [None]:
# Square root of the MSE. Arguments follow scikit-learn's (y_true, y_pred) order,
# matching the other metric cells; the value is unchanged because squared error
# is symmetric.
score = np.sqrt(mt.mean_squared_error(y_test, prediction))
print('Final score (RMSE): {}'.format(score))

Final score (RMSE): 0.3040875792503357


### *Actual vs. predicted*

In [None]:
# Side-by-side table of true and predicted class indices, one row per test sample.
df_y = pd.DataFrame({'actual': true_stars})
df_pred = pd.DataFrame({'predicted': predicted_stars})
# Both frames carry the same default row index, so joining on it lines rows up 1:1.
result = df_y.join(df_pred)
result

Unnamed: 0,actual,predicted
0,4,4
1,2,3
2,0,0
3,3,4
4,2,3
...,...,...
1536653,3,4
1536654,4,4
1536655,4,4
1536656,4,4


### *Receiver Operating Characteristic (ROC) curve*

In [None]:
# One-vs-rest ROC for the class stored in column 1 of the one-hot encoding.
# plot_roc takes (pred, y): the scores first, then the matching binary labels.
# The labels must therefore be the same single column of y_test, not the full
# 2-D one-hot matrix.
roc_pred = prediction[:,1] # Only positive class (1)
plot_roc(roc_pred, y_test[:,1])

### *Regression lift chart*

In [None]:
# Both inputs end up flattened (chart_regression flattens y itself), so every
# (sample, class) probability is charted against its one-hot 0/1 target.
chart_regression(pred=prediction.flatten(), y=y_test, sort=True)