# Swiggy Restaurant Recommendation System  
## Feature Engineering and Encoding

### Objective
The objective of this notebook is to prepare the cleaned restaurant dataset
for recommendation modeling by encoding categorical features and ensuring
all values are numerical.


In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
import pickle


In [2]:
# Read the cleaned restaurant dataset from disk
cleaned_data_path = "../data/processed/cleaned_data.csv"
df = pd.read_csv(cleaned_data_path)

# Show the first rows as a quick sanity check
df.head()


Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,4.0,0.0,200.0,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50.0,200.0,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100.0,100.0,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20.0,250.0,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,4.0,0.0,250.0,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [3]:
# (rows, columns) of the loaded dataset
df.shape

(148398, 11)

### Feature Selection

The following features are selected for recommendation modeling:

- Categorical Features: city, cuisine
- Numerical Features: rating, rating_count, cost


In [5]:
# Columns used as model inputs, grouped by how they are treated:
# categorical columns are one-hot encoded, numerical columns are kept as-is.
categorical_features = ['city', 'cuisine']
numerical_features = ['rating', 'rating_count', 'cost']

# Take an explicit copy so df_selected owns its data. It is modified later
# (index reset), and a copy guarantees that never touches, or warns about,
# the originally loaded df.
df_selected = df[categorical_features + numerical_features].copy()
df_selected.head()


Unnamed: 0,city,cuisine,rating,rating_count,cost
0,Abohar,"Beverages,Pizzas",4.0,0.0,200.0
1,Abohar,"Sweets,Bakery",4.4,50.0,200.0
2,Abohar,Beverages,3.8,100.0,100.0
3,Abohar,"Fast Food,Indian",3.7,20.0,250.0
4,Abohar,"Italian-American,Fast Food",4.0,0.0,250.0


In [7]:
# One-hot encoder configuration:
#   - handle_unknown='ignore': a category not seen during fit is encoded as
#     all zeros at transform time instead of raising an error
#   - sparse_output=False: transform returns a dense NumPy array
encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)



In [8]:
# Fit the encoder on the categorical columns and one-hot encode them
encoded_cat = encoder.fit_transform(df_selected[categorical_features])

# Column labels generated by the encoder for the one-hot output
encoded_cat_columns = encoder.get_feature_names_out(categorical_features)

# Wrap the encoded array in a DataFrame that carries df_selected's own index,
# so every encoded row stays tied to its source row. Without this the frame
# would get a fresh 0..n-1 index and could silently misalign with df_selected
# whenever df_selected's index is not already 0..n-1.
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoded_cat_columns,
    index=df_selected.index,
)

encoded_cat_df.head()


Unnamed: 0,"city_Abids & Koti,Hyderabad",city_Abohar,"city_Adajan,Surat",city_Adilabad,city_Adityapur,city_Adoni,"city_Adyar,Chennai",city_Agartala,city_Agra,city_Ahmednagar,...,"cuisine_Vietnamese,Snacks",cuisine_Waffle,"cuisine_Waffle,Bakery","cuisine_Waffle,Beverages","cuisine_Waffle,Burgers","cuisine_Waffle,Chinese","cuisine_Waffle,Desserts","cuisine_Waffle,Fast Food","cuisine_Waffle,Ice Cream","cuisine_Waffle,Snacks"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Give both frames a fresh 0..n-1 index so they line up row-for-row when
# combined. Rebinding the names (instead of inplace=True) avoids mutating
# frames that earlier cells already displayed or derived from df.
encoded_cat_df = encoded_cat_df.reset_index(drop=True)
df_selected = df_selected.reset_index(drop=True)


In [10]:
# Assemble the final feature matrix: the numerical columns first,
# followed by the one-hot encoded categorical columns
feature_blocks = [df_selected[numerical_features], encoded_cat_df]
final_df = pd.concat(feature_blocks, axis="columns")

final_df.head()


Unnamed: 0,rating,rating_count,cost,"city_Abids & Koti,Hyderabad",city_Abohar,"city_Adajan,Surat",city_Adilabad,city_Adityapur,city_Adoni,"city_Adyar,Chennai",...,"cuisine_Vietnamese,Snacks",cuisine_Waffle,"cuisine_Waffle,Bakery","cuisine_Waffle,Beverages","cuisine_Waffle,Burgers","cuisine_Waffle,Chinese","cuisine_Waffle,Desserts","cuisine_Waffle,Fast Food","cuisine_Waffle,Ice Cream","cuisine_Waffle,Snacks"
0,4.0,0.0,200.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.4,50.0,200.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.8,100.0,100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.7,20.0,250.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,250.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# (rows, columns) of the combined numerical + one-hot feature matrix
final_df.shape


(148398, 2956)

In [12]:
# Write the encoded feature matrix to disk; index=False leaves the row index out of the file
encoded_data_path = "../data/processed/encoded_data.csv"
final_df.to_csv(encoded_data_path, index=False)


In [13]:
# Pickle the fitted encoder for later use in Streamlit
encoder_path = "../models/encoder.pkl"
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)
