## Logistic Regression model to predict if rating will be >= 4

In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read in the dataset
zomato_df = pd.read_csv(Path('../Resources/zomato.csv'), encoding = "ISO-8859-1")
zomato_df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


## Preprocess data

In [3]:
# View a list of all of the columns
zomato_df.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

In [4]:
# Drop columns that are not relevant to the analysis / will confuse the model
zomato_df = zomato_df.drop(columns=[
    'Restaurant ID',
    'Restaurant Name',
    'Country Code',
    'City',
    'Address', 
    'Locality', 
    'Locality Verbose', 
    'Longitude',
    'Latitude',
    'Average Cost for two',
    'Currency',
    'Rating color',
    'Rating text',
    'Votes'
])
zomato_df.head()

Unnamed: 0,Cuisines,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating
0,"French, Japanese, Desserts",Yes,No,No,No,3,4.8
1,Japanese,Yes,No,No,No,3,4.5
2,"Seafood, Asian, Filipino, Indian",Yes,No,No,No,4,4.4
3,"Japanese, Sushi",No,No,No,No,4,4.9
4,"Japanese, Korean",Yes,No,No,No,4,4.8


In [5]:
# Copy the Aggregate rating column into new Aggregate rating binary column
zomato_df['Aggregate rating binary'] = zomato_df['Aggregate rating']
zomato_df.head()

Unnamed: 0,Cuisines,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Aggregate rating binary
0,"French, Japanese, Desserts",Yes,No,No,No,3,4.8,4.8
1,Japanese,Yes,No,No,No,3,4.5,4.5
2,"Seafood, Asian, Filipino, Indian",Yes,No,No,No,4,4.4,4.4
3,"Japanese, Sushi",No,No,No,No,4,4.9,4.9
4,"Japanese, Korean",Yes,No,No,No,4,4.8,4.8


In [6]:
# Convert 'Aggregate rating binary' column to binary values
zomato_df.loc[zomato_df["Aggregate rating binary"] < 4, "Aggregate rating binary"] = 0
zomato_df.loc[zomato_df["Aggregate rating binary"] >= 4, "Aggregate rating binary"] = 1
zomato_df

Unnamed: 0,Cuisines,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Aggregate rating binary
0,"French, Japanese, Desserts",Yes,No,No,No,3,4.8,1.0
1,Japanese,Yes,No,No,No,3,4.5,1.0
2,"Seafood, Asian, Filipino, Indian",Yes,No,No,No,4,4.4,1.0
3,"Japanese, Sushi",No,No,No,No,4,4.9,1.0
4,"Japanese, Korean",Yes,No,No,No,4,4.8,1.0
...,...,...,...,...,...,...,...,...
9546,Turkish,No,No,No,No,3,4.1,1.0
9547,"World Cuisine, Patisserie, Cafe",No,No,No,No,3,4.2,1.0
9548,"Italian, World Cuisine",No,No,No,No,4,3.7,0.0
9549,Restaurant Cafe,No,No,No,No,4,4.0,1.0


In [7]:
# Check column data types
zomato_df.dtypes

Cuisines                    object
Has Table booking           object
Has Online delivery         object
Is delivering now           object
Switch to order menu        object
Price range                  int64
Aggregate rating           float64
Aggregate rating binary    float64
dtype: object

In [8]:
# Convert 'Aggregate rating binary' column from float64 to int to get rid of the decimal
zomato_df = zomato_df.astype({'Aggregate rating binary' : int})
zomato_df.head()

Unnamed: 0,Cuisines,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Aggregate rating binary
0,"French, Japanese, Desserts",Yes,No,No,No,3,4.8,1
1,Japanese,Yes,No,No,No,3,4.5,1
2,"Seafood, Asian, Filipino, Indian",Yes,No,No,No,4,4.4,1
3,"Japanese, Sushi",No,No,No,No,4,4.9,1
4,"Japanese, Korean",Yes,No,No,No,4,4.8,1


In [9]:
# Drop the 'Aggregate rating' column since the 'Aggregate rating binary' column is what we will
# use for the analysis
zomato_df = zomato_df.drop(columns=['Aggregate rating'])
zomato_df.head()

Unnamed: 0,Cuisines,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating binary
0,"French, Japanese, Desserts",Yes,No,No,No,3,1
1,Japanese,Yes,No,No,No,3,1
2,"Seafood, Asian, Filipino, Indian",Yes,No,No,No,4,1
3,"Japanese, Sushi",No,No,No,No,4,1
4,"Japanese, Korean",Yes,No,No,No,4,1


### Define variables and Split the data into train and test sets

In [10]:
# Create our features
X = zomato_df.drop(columns="Aggregate rating binary")
X = pd.get_dummies(X)


# Create our target
y = zomato_df["Aggregate rating binary"]

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Check the shape of X_train
X_train.shape

(7163, 1833)

In [12]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Create, Train, and Evaluate the model

In [13]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)


# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic Regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

 Logistic Regression model accuracy: 0.865


Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,1
7,0,0
8,0,0
9,0,0
