In [1]:
import pandas as pd
import numpy as np

# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/cleaned/cleaned_dataset.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Price range,Aggregate rating,Votes,Primary Cuisine,City_Freq
0,6,Makati City,121.027535,14.565443,"French, Japanese, Desserts",1100,1,0,3,4.8,314,French,2
1,6,Makati City,121.014101,14.553708,Japanese,1200,1,0,3,4.5,591,Japanese,2
2,6,Mandaluyong City,121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,1,0,4,4.4,270,Seafood,4
3,6,Mandaluyong City,121.056475,14.585318,"Japanese, Sushi",1500,0,0,4,4.9,365,Japanese,4
4,6,Mandaluyong City,121.057508,14.58445,"Japanese, Korean",1500,1,0,4,4.8,229,Japanese,4


In [2]:
# An aggregate rating of exactly 0.0 marks a restaurant with no rating.
# Keep only the rated rows and renumber the index from zero.
is_rated = df["Aggregate rating"].ne(0.0)
df = df.loc[is_rated].reset_index(drop=True)
df.shape

(7403, 13)

In [3]:
# Label-encode Country Code: every distinct value is replaced by its
# integer category code.
df["Country Code"] = pd.Categorical(df["Country Code"]).codes

In [4]:
# Handle Primary Cuisine: any cuisine that occurs fewer than 10 times is
# relabelled "Other"; the result is stored in a new column so the original
# "Primary Cuisine" values are left untouched.
cuisine_counts = df["Primary Cuisine"].value_counts()
rare_cuisines = cuisine_counts.loc[cuisine_counts.lt(10)].index

is_rare = df["Primary Cuisine"].isin(rare_cuisines)
df["Cuisine_Grouped"] = df["Primary Cuisine"].mask(is_rare, "Other")

In [5]:
# One-hot encoding (not frequency encoding): expand "Cuisine_Grouped" into
# one indicator column per category. drop_first=True omits the indicator for
# the first category, so k categories yield k-1 columns.
df = pd.get_dummies(
    data=df,
    columns=["Cuisine_Grouped"],
    drop_first=True,
)

In [6]:
# Build the modelling frame by removing the columns considered less relevant.
# errors="ignore" lets the list name columns that are absent from this frame
# without raising.
df_model = df.drop(
    columns=[
        "City",
        "Address",
        "Locality",
        "Locality Verbose",
        "Latitude",
        "Longitude",
        "Restaurant ID",
        "Restaurant Name",
        "Cuisines",
        "Primary Cuisine",
    ],
    errors="ignore",
)

In [7]:
# Sanity check: list the remaining columns with their non-null counts and dtypes.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7403 entries, 0 to 7402
Data columns (total 53 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country Code                   7403 non-null   int8   
 1   Average Cost for two           7403 non-null   int64  
 2   Has Table booking              7403 non-null   int64  
 3   Has Online delivery            7403 non-null   int64  
 4   Price range                    7403 non-null   int64  
 5   Aggregate rating               7403 non-null   float64
 6   Votes                          7403 non-null   int64  
 7   City_Freq                      7403 non-null   int64  
 8   Cuisine_Grouped_Asian          7403 non-null   bool   
 9   Cuisine_Grouped_BBQ            7403 non-null   bool   
 10  Cuisine_Grouped_Bakery         7403 non-null   bool   
 11  Cuisine_Grouped_Bengali        7403 non-null   bool   
 12  Cuisine_Grouped_Beverages      7403 non-null   b

In [9]:
# Separate the target ("Aggregate rating") from the features, then hold out
# 20% of the rows as a test set. random_state is fixed so the split is
# reproducible.
from sklearn.model_selection import train_test_split

X = df_model.drop(columns=["Aggregate rating"])
Y = df_model.loc[:, "Aggregate rating"]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

((5922, 52), (1481, 52))

In [11]:
# Persist the full model-ready frame as CSV (without the index), creating
# the output folder first if it does not exist yet.
from pathlib import Path

processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
df_model.to_csv(processed_dir / "model_data.csv", index=False)
print("Saved ../data/processed/model_data.csv")

Saved ../data/processed/model_data.csv


In [12]:
# Persist each train/test split as its own joblib file, creating the
# ../models folder first if it does not exist yet.
import joblib

Path("../models").mkdir(parents=True, exist_ok=True)

split_files = {
    "../models/X_train.joblib": X_train,
    "../models/X_test.joblib": X_test,
    "../models/Y_train.joblib": Y_train,
    "../models/Y_test.joblib": Y_test,
}
for split_path, split_data in split_files.items():
    joblib.dump(split_data, split_path)
print("Saved X/Y train/test splits in models/")

Saved X/Y train/test splits in models/


In [14]:
# Store the training feature names, in column order, next to the saved splits.
features = list(X_train.columns)
joblib.dump(features, "../models/feature_list.joblib")
print("Saved features")

Saved features
