# <u>Yelp Rating Prediction Using Tensorflow</u>

## **Modeling (relu and adam):**

### *Libraries*

In [12]:
import matplotlib.pyplot as plt
%matplotlib inline

#import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
#import sys
#import sklearn as sk
import pandas as pd
import os
#import sklearn.feature_extraction.text as sk_text
#import re

from collections.abc import Sequence
from sklearn import preprocessing
#import shutil
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense#, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn import metrics as mt

### *Functions*

In [2]:
# from labs

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

### *Load dataframe from file*

In [3]:
df = pd.read_csv("data/df_data.csv")
df.head()

Unnamed: 0,stars,able,absolutely,actually,almost,also,always,amazing,amount,another,...,wont,work,working,worth,would,wouldnt,wrong,year,yet,youre
0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168147,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128253,...,0.0,0.0,0.0,0.0,0.090665,0.0,0.0,0.0,0.0,0.0
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.117374,0.0,0.0,0.139791,0.0,0.0,0.0


### *Use dataframe to configure x, y*

In [4]:
x, y = to_xy(df, 'stars')

### *x info*

In [5]:
x.shape
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### *y info*

In [6]:
y.shape
y

array([[0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

### *Separate test and train sets*

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

### *Train model*

In [8]:
model = Sequential()

model.add(Dense(75, input_dim=x.shape[1], activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')

checkpointer = ModelCheckpoint(filepath="dnn/relu_adam_best_weights.hdf5", verbose=0, save_best_only=True)

model.fit(x_train, y_train, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer], verbose=2, epochs=100)



Epoch 1/100

144062/144062 - 187s - loss: 0.8690 - val_loss: 0.8573 - 187s/epoch - 1ms/step
Epoch 2/100


  saving_api.save_model(


144062/144062 - 156s - loss: 0.8530 - val_loss: 0.8544 - 156s/epoch - 1ms/step
Epoch 3/100
144062/144062 - 152s - loss: 0.8490 - val_loss: 0.8509 - 152s/epoch - 1ms/step
Epoch 4/100
144062/144062 - 153s - loss: 0.8470 - val_loss: 0.8515 - 153s/epoch - 1ms/step
Epoch 5/100
144062/144062 - 154s - loss: 0.8458 - val_loss: 0.8546 - 154s/epoch - 1ms/step
Epoch 5: early stopping


<keras.src.callbacks.History at 0x1ec7d698690>

### *Save model*

In [14]:
path = './data/'
save_path = './dnn/'
filename_write = os.path.join(save_path, "relu_adam.hdf5")
model.save(filename_write)
print("Wrote file to {}".format(filename_write))

Wrote file to ./dnn/network.hdf5


  saving_api.save_model(


### *Optional start point*

In [None]:
# if starting here:  load library, functions, configure x,y, and train/test split cells before running this cell
#model = load_model(os.path.join(save_path, "network.hdf5"))

### *Model prediction*

In [24]:

prediction = model.predict(x_test)
print("Shape: {}".format(prediction.shape))
print(prediction)

Shape: (1536658, 5)
[[1.2794351e-04 1.1522076e-03 1.8389789e-02 2.7837113e-01 7.0195901e-01]
 [9.8365286e-05 4.4485969e-03 1.2245468e-01 7.2730273e-01 1.4569570e-01]
 [9.8133677e-01 1.6811771e-02 1.2203933e-03 2.5283132e-04 3.7824718e-04]
 ...
 [7.4981917e-06 5.8018257e-05 2.8445653e-04 1.8143792e-02 9.8150623e-01]
 [3.4997662e-04 1.1936321e-03 7.5022802e-03 1.4703481e-01 8.4391934e-01]
 [1.0673405e-02 1.6957387e-02 1.9746313e-02 1.0694156e-01 8.4568131e-01]]


## **Results:**

### *Predictions vs. Actual*

In [26]:
predicted_stars = np.argmax(prediction, axis=1)
true_stars = np.argmax(y_test, axis=1)
print("Predictions: {}".format(predicted_stars))
print("Actual: {}".format(true_stars))

Predictions: [4 3 0 ... 4 4 4]
True: [4 2 0 ... 4 4 4]


### *Accuracy*

In [27]:
accuracy = mt.accuracy_score(true_stars, predicted_stars)
print("Accuracy score: {}".format(accuracy))

Accuracy score: 0.6465147091935877


### *Precision*

In [28]:
precision = mt.precision_score(true_stars, predicted_stars, average="weighted")
print("Precision score: {}".format(precision))


Precision score: 0.6130214042304802


### *Recall score*

In [29]:
recall = mt.recall_score(true_stars, predicted_stars, average="weighted")
print("Recall score: {}".format(recall))

Recall score: 0.6465147091935877


### *F1 score*

In [30]:
f1 = mt.f1_score(true_stars, predicted_stars, average="weighted")
print("F1 score: {}".format(f1))

F1 score: 0.6182259225200087


### *Log score*

In [31]:
log_score = mt.log_loss(y_test, prediction)
print("Log loss score: {}".format(log_score))

Log loss score: 0.8546070865311683


### *Classification report*

In [32]:
print(mt.classification_report(true_stars, predicted_stars))

              precision    recall  f1-score   support

           0       0.68      0.79      0.73    215721
           1       0.41      0.24      0.30    122629
           2       0.45      0.28      0.35    159154
           3       0.50      0.36      0.42    335342
           4       0.72      0.89      0.80    703812

    accuracy                           0.65   1536658
   macro avg       0.55      0.51      0.52   1536658
weighted avg       0.61      0.65      0.62   1536658



### *Mean Squared Error (MSE)*

In [33]:
mse = mt.mean_squared_error(prediction, y_test)
print('Final score (MSE): {}'.format(mse))


Final score (MSE): 0.09246926009654999


### *Root Mean Squared Error (RMSE)*

In [34]:
score = np.sqrt(mt.mean_squared_error(prediction, y_test))
print('Final score (RMSE): {}'.format(score))

Final score (RMSE): 0.3040875792503357


### *Actual vs. predicted*

In [35]:
df_y = pd.DataFrame(true_stars, columns=['actual'])
df_pred = pd.DataFrame(predicted_stars, columns=['predicted'])
result = pd.concat([df_y, df_pred], axis=1)
result

Unnamed: 0,actual,predicted
0,4,4
1,2,3
2,0,0
3,3,4
4,2,3
...,...,...
1536653,3,4
1536654,4,4
1536655,4,4
1536656,4,4
