## Support Vector Machine model to predict if rating will be >= 4

In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read in the dataset
sf_df = pd.read_csv(Path('../Resources/Restaurants_reviews_SF.csv'), encoding = "ISO-8859-1")
sf_df.head()

Unnamed: 0,ID,Name,Image,Url,Review count,Category,Rating,Price,Latitude,Longitude,Address,City,State,Country,Phone,Review
0,HHtpR0RslupSQ99GIIwW5A,Marufuku Ramen,https://s3-media4.fl.yelpcdn.com/bphoto/ouK2Vm...,https://www.yelp.com/biz/marufuku-ramen-san-fr...,4122,Ramen,4.5,$$,37.785116,-122.432008,1581 Webster St,San Francisco,CA,US,(415) 872-9786,"['Long long overdue review, but I loved this p..."
1,f-m7-hyFzkf0HSEeQ2s-9A,Fog Harbor Fish House,https://s3-media2.fl.yelpcdn.com/bphoto/by8Hh6...,https://www.yelp.com/biz/fog-harbor-fish-house...,8324,Seafood,4.5,$$,37.808988,-122.410297,39 Pier,San Francisco,CA,US,(415) 969-2010,['Great food and not a long wait. Perfect on ...
2,XAYwAF_83becwNnSJDFkpA,Dumpling House,https://s3-media2.fl.yelpcdn.com/bphoto/DNfqq1...,https://www.yelp.com/biz/dumpling-house-san-fr...,360,Dim Sum,4.5,$$,37.763552,-122.432762,335 Noe St,San Francisco,CA,US,(415) 829-2789,"[""Love coming here - I've gotten so many of th..."
3,J7_-faNq_Ag9qTOlDn81Pw,Starbelly,https://s3-media2.fl.yelpcdn.com/bphoto/G1SweY...,https://www.yelp.com/biz/starbelly-san-francis...,2115,Comfort Food,4.0,$$,37.76402,-122.43253,3583 16th St,San Francisco,CA,US,(415) 252-7500,['Starbelly has been one of my go to comfort r...
4,QueFVMcMlT-6aZFv2M47mg,Bottega,https://s3-media2.fl.yelpcdn.com/bphoto/IawDcF...,https://www.yelp.com/biz/bottega-san-francisco...,379,Italian,4.5,$$,37.75472,-122.4212,1132 Valencia St,San Francisco,CA,US,(415) 655-9048,['What a great place to grab lunch/dinner with...


## Preprocess data

In [3]:
# View a list of all of the columns
sf_df.columns

Index(['ID', 'Name', 'Image', 'Url', 'Review count', 'Category', 'Rating',
       'Price', 'Latitude', 'Longitude', 'Address', 'City', 'State', 'Country',
       'Phone', 'Review'],
      dtype='object')

In [4]:
# Drop columns that are not relevant to the analysis / will confuse the model
sf_df = sf_df.drop(columns=[
    'ID',
    'Image',
    'Url',
    'Latitude',
    'Longitude',
    'Address', 
    'City',
    'State',
    'Country',
    'Phone'
])
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p..."
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th..."
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...


In [5]:
# Copy the Rating column into new Rating binary column
sf_df['Rating binary'] = sf_df['Rating']
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",4.5
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,4.5
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",4.5
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,4.0
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,4.5


In [6]:
# Convert 'Rating binary' column to binary values
sf_df.loc[sf_df["Rating binary"] < 4, "Rating binary"] = 0
sf_df.loc[sf_df["Rating binary"] >= 4, "Rating binary"] = 1
sf_df

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",1.0
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,1.0
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",1.0
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,1.0
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,1.0
...,...,...,...,...,...,...,...
619,LaoTable,1339,Laotian,4.0,$$,,1.0
620,Wooly Pig,777,Sandwiches,4.5,$$,,1.0
621,Boudin,4161,Bakeries,4.0,$$,,1.0
622,Tara Indian Cuisine,421,Indian,4.0,$$,,1.0


In [7]:
# Check column data types
sf_df.dtypes

Name              object
Review count       int64
Category          object
Rating           float64
Price             object
Review            object
Rating binary    float64
dtype: object

In [8]:
# Convert 'Rating binary' column from float64 to int to get rid of the decimal
sf_df = sf_df.astype({'Rating binary' : int})
sf_df.head()

Unnamed: 0,Name,Review count,Category,Rating,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,4.5,$$,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,4.5,$$,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,4.5,$$,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,4.0,$$,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,4.5,$$,['What a great place to grab lunch/dinner with...,1


In [9]:
# Drop the Rating column since the Rating binary column is what we will use for the analysis
sf_df = sf_df.drop(columns=['Rating'])
sf_df.head()

Unnamed: 0,Name,Review count,Category,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,$$,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,$$,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,$$,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,$$,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,$$,['What a great place to grab lunch/dinner with...,1


In [10]:
# Value_counts the Price column for preprocessing
price_counts = sf_df.Price.value_counts()
print(price_counts)

$$      501
$$$      67
$        35
$$$$     21
Name: Price, dtype: int64


In [11]:
# Convert 'Price' column to numeric values
sf_df.loc[sf_df["Price"] == "$", "Price"] = 1
sf_df.loc[sf_df["Price"] == "$$", "Price"] = 2
sf_df.loc[sf_df["Price"] == "$$$", "Price"] = 3
sf_df.loc[sf_df["Price"] == "$$$$", "Price"] = 4
sf_df

Unnamed: 0,Name,Review count,Category,Price,Review,Rating binary
0,Marufuku Ramen,4122,Ramen,2,"['Long long overdue review, but I loved this p...",1
1,Fog Harbor Fish House,8324,Seafood,2,['Great food and not a long wait. Perfect on ...,1
2,Dumpling House,360,Dim Sum,2,"[""Love coming here - I've gotten so many of th...",1
3,Starbelly,2115,Comfort Food,2,['Starbelly has been one of my go to comfort r...,1
4,Bottega,379,Italian,2,['What a great place to grab lunch/dinner with...,1
...,...,...,...,...,...,...
619,LaoTable,1339,Laotian,2,,1
620,Wooly Pig,777,Sandwiches,2,,1
621,Boudin,4161,Bakeries,2,,1
622,Tara Indian Cuisine,421,Indian,2,,1


In [12]:
# Drop columns that are not relevant to the analysis / will confuse the model
sf_df = sf_df.drop(columns=['Name','Review'])
sf_df.head()

Unnamed: 0,Review count,Category,Price,Rating binary
0,4122,Ramen,2,1
1,8324,Seafood,2,1
2,360,Dim Sum,2,1
3,2115,Comfort Food,2,1
4,379,Italian,2,1


### Define variables and Split the data into train and test sets

In [13]:
# Create our features
X = sf_df.drop(columns="Rating binary")
X = pd.get_dummies(X)


# Create our target
y = sf_df["Rating binary"]

In [14]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Check the shape of X_train
X_train.shape

(468, 104)

## Create, Train, and Evaluate the model

In [15]:
# Define the SVC model
model = SVC(kernel='linear')

In [16]:
# Train the model
model.fit(X_train,y_train)

SVC(kernel='linear')

In [17]:
# Evaluate the model
y_pred = model.predict(X_test)

results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,0
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [18]:
# Get the accuracy score from the model's predictions
accuracy_score(y_test, y_pred)

0.9487179487179487

In [19]:
# Get the classificaiton report from the model's predicions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.95      1.00      0.97       148

    accuracy                           0.95       156
   macro avg       0.47      0.50      0.49       156
weighted avg       0.90      0.95      0.92       156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
