## Random Forest Classifier model to predict if rating will be >= 4

In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read in the dataset
sf_df = pd.read_csv(Path('../Resources/Restaurants_reviews_SF.csv'), encoding = "ISO-8859-1")
sf_df.head()

Unnamed: 0,ID,Name,Image,Url,Review count,Category,Rating,Price,Latitude,Longitude,Address,City,State,Country,Phone,Review
0,HHtpR0RslupSQ99GIIwW5A,Marufuku Ramen,https://s3-media4.fl.yelpcdn.com/bphoto/ouK2Vm...,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4122,Ramen,4.5,$$,37.785116,-122.432008,1581 Webster St,San Francisco,CA,US,(415) 872-9786,"['Long long overdue review, but I loved this p..."
1,f-m7-hyFzkf0HSEeQ2s-9A,Fog Harbor Fish House,https://s3-media2.fl.yelpcdn.com/bphoto/by8Hh6...,https://www.yelp.com/biz/fog-harbor-fish-house...,8324,Seafood,4.5,$$,37.808988,-122.410297,39 Pier,San Francisco,CA,US,(415) 969-2010,['Great food and not a long wait. Perfect on ...
2,XAYwAF_83becwNnSJDFkpA,Dumpling House,https://s3-media2.fl.yelpcdn.com/bphoto/DNfqq1...,https://www.yelp.com/biz/dumpling-house-san-fr...,360,Dim Sum,4.5,$$,37.763552,-122.432762,335 Noe St,San Francisco,CA,US,(415) 829-2789,"[""Love coming here - I've gotten so many of th..."
3,J7_-faNq_Ag9qTOlDn81Pw,Starbelly,https://s3-media2.fl.yelpcdn.com/bphoto/G1SweY...,https://www.yelp.com/biz/starbelly-san-francis...,2115,Comfort Food,4.0,$$,37.76402,-122.43253,3583 16th St,San Francisco,CA,US,(415) 252-7500,['Starbelly has been one of my go to comfort r...
4,QueFVMcMlT-6aZFv2M47mg,Bottega,https://s3-media2.fl.yelpcdn.com/bphoto/IawDcF...,https://www.yelp.com/biz/bottega-san-francisco...,379,Italian,4.5,$$,37.75472,-122.4212,1132 Valencia St,San Francisco,CA,US,(415) 655-9048,['What a great place to grab lunch/dinner with...


## Preprocess data

In [3]:
# View a list of all of the columns
sf_df.columns

Index(['ID', 'Name', 'Image', 'Url', 'Review count', 'Category', 'Rating',
       'Price', 'Latitude', 'Longitude', 'Address', 'City', 'State', 'Country',
       'Phone', 'Review'],
      dtype='object')

In [4]:
# Drop columns that are not relevant to the analysis / will confuse the model
sf_df = sf_df.drop(columns=[
    'ID',
    'Image',
    'Url',
    'Latitude',
    'Longitude',
    'Address', 
    'City',
    'State',
    'Country',
    'Phone'
])
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p..."
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th..."
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...


In [5]:
# Copy the Rating column into new Rating binary column
sf_df['Rating binary'] = sf_df['Rating']
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",4.5
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,4.5
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",4.5
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,4.0
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,4.5


In [6]:
# Convert 'Rating binary' column to binary values
sf_df.loc[sf_df["Rating binary"] < 4, "Rating binary"] = 0
sf_df.loc[sf_df["Rating binary"] >= 4, "Rating binary"] = 1
sf_df

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",1.0
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,1.0
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",1.0
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,1.0
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,1.0
...,...,...,...,...,...,...,...
619,LaoTable,1339,Laotian,4.0,$$,,1.0
620,Wooly Pig,777,Sandwiches,4.5,$$,,1.0
621,Boudin,4161,Bakeries,4.0,$$,,1.0
622,Tara Indian Cuisine,421,Indian,4.0,$$,,1.0


In [7]:
# Check column data types
sf_df.dtypes

Name              object
Review count       int64
Category          object
Rating           float64
Price             object
Review            object
Rating binary    float64
dtype: object

In [8]:
# Convert 'Rating binary' column from float64 to int to get rid of the decimal
sf_df = sf_df.astype({'Rating binary' : int})
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,1


In [9]:
# Drop the Rating column since the Rating binary column is what we will use for the analysis
sf_df = sf_df.drop(columns=['Rating'])
sf_df.head()

Unnamed: 0,Name,Review count,Category,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,$$,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,$$,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,$$,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,$$,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,$$,['What a great place to grab lunch/dinner with...,1


In [10]:
# Value_counts the Price column for preprocessing
price_counts = sf_df.Price.value_counts()
print(price_counts)

$$      501
$$$      67
$        35
$$$$     21
Name: Price, dtype: int64


In [11]:
# Convert 'Price' column to numeric values
sf_df.loc[sf_df["Price"] == "$", "Price"] = 1
sf_df.loc[sf_df["Price"] == "$$", "Price"] = 2
sf_df.loc[sf_df["Price"] == "$$$", "Price"] = 3
sf_df.loc[sf_df["Price"] == "$$$$", "Price"] = 4
sf_df

Unnamed: 0,Name,Review count,Category,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,2,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,2,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,2,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,2,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,2,['What a great place to grab lunch/dinner with...,1
...,...,...,...,...,...,...
619,LaoTable,1339,Laotian,2,,1
620,Wooly Pig,777,Sandwiches,2,,1
621,Boudin,4161,Bakeries,2,,1
622,Tara Indian Cuisine,421,Indian,2,,1


In [12]:
# Drop columns that are not relevant to the analysis / will confuse the model
sf_df = sf_df.drop(columns=['Name','Review'])
sf_df.head()

Unnamed: 0,Review count,Category,Price,Rating binary
0,4122,Ramen,2,1
1,8324,Seafood,2,1
2,360,Dim Sum,2,1
3,2115,Comfort Food,2,1
4,379,Italian,2,1


### Define variables and Split the data into train and test sets

In [13]:
# Create our features
X = sf_df.copy()
X = X.drop(columns="Rating binary", axis=1)
X = pd.get_dummies(X)
X.head()

Unnamed: 0,Review count,Category_American (New),Category_American (Traditional),Category_Arabic,Category_Asian Fusion,Category_Bakeries,Category_Barbeque,Category_Bars,Category_Brazilian,Category_Breakfast & Brunch,...,Category_Thai,Category_Ukrainian,Category_Uzbek,Category_Vegetarian,Category_Vietnamese,Category_Wine Bars,Price_1,Price_2,Price_3,Price_4
0,4122,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,8324,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,360,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2115,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,379,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# Create our target
y = sf_df["Rating binary"].ravel()
y[:5]

array([1, 1, 1, 1, 1])

In [15]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Check the shape of X_train
X_train.shape

(468, 104)

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Create, Train, and Evaluate the model

In [17]:
# Define the logistic regression model
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [18]:
# Train the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# Evaluate the model
predictions = rf_model.predict(X_test_scaled)

In [20]:
# Calculate the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,5
Actual 1,8,142


In [21]:
# Calculate the Random Forest Classifer model accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Display the results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,5
Actual 1,8,142


Accuracy Score : 0.9166666666666666
Classification Report
              precision    recall  f1-score   support

           0       0.11      0.17      0.13         6
           1       0.97      0.95      0.96       150

    accuracy                           0.92       156
   macro avg       0.54      0.56      0.54       156
weighted avg       0.93      0.92      0.92       156



In [23]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([5.82988708e-01, 2.00497050e-02, 2.73046351e-02, 3.09510716e-05,
       1.54115691e-05, 7.97331124e-03, 1.66626477e-04, 1.68402426e-04,
       2.30477748e-04, 9.30950957e-03, 3.57478374e-02, 5.01680662e-05,
       0.00000000e+00, 3.35617985e-04, 1.73137673e-04, 9.18502824e-03,
       2.19994052e-02, 1.04186946e-05, 3.22548567e-06, 3.39230727e-02,
       1.14906688e-04, 1.49612358e-02, 8.64177477e-04, 1.71169387e-02,
       8.69122984e-04, 9.36210553e-05, 0.00000000e+00, 9.03195122e-05,
       2.51730405e-05, 0.00000000e+00, 4.47920079e-04, 3.94971751e-02,
       0.00000000e+00, 2.16002291e-06, 6.43059862e-05, 0.00000000e+00,
       3.27902476e-05, 1.22831294e-05, 1.56773981e-05, 2.70275521e-04,
       0.00000000e+00, 1.07125967e-04, 0.00000000e+00, 6.19730352e-05,
       3.86726605e-05, 4.83395804e-05, 7.96863167e-06, 4.42388335e-05,
       4.10440762e-05, 1.37029332e-05, 3.40702448e-05, 1.25378803e-04,
       2.97755753e-03, 1.19492566e-05, 1.11493741e-02, 6.86877887e-05,
      