In [2]:

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import re

In [3]:
import pandas as pd

# Read the raw Swiggy restaurant export and preview its first rows.
raw_csv_path = r"D:\python_programs\Swiggy Recommendation\swiggy.csv"
df = pd.read_csv(raw_csv_path)
print(df.head())


       id               name    city rating     rating_count   cost  \
0  567335     AB FOODS POINT  Abohar     --  Too Few Ratings  ₹ 200   
1  531342  Janta Sweet House  Abohar    4.4      50+ ratings  ₹ 200   
2  158203  theka coffee desi  Abohar    3.8     100+ ratings  ₹ 100   
3  187912          Singh Hut  Abohar    3.7      20+ ratings  ₹ 250   
4  543530      GRILL MASTERS  Abohar     --  Too Few Ratings  ₹ 250   

                      cuisine          lic_no  \
0            Beverages,Pizzas  22122652000138   
1               Sweets,Bakery  12117201000112   
2                   Beverages  22121652000190   
3            Fast Food,Indian  22119652000167   
4  Italian-American,Fast Food  12122201000053   

                                                link  \
0  https://www.swiggy.com/restaurants/ab-foods-po...   
1  https://www.swiggy.com/restaurants/janta-sweet...   
2  https://www.swiggy.com/restaurants/theka-coffe...   
3  https://www.swiggy.com/restaurants/singh-hut-n...  

In [4]:
# Clean the cost column: strip the currency symbol and every other character
# that is not a digit or a decimal point (e.g. "₹ 200" -> "200").
# The values stay as strings here; this cell does not convert them to numbers.
cost_text = df["cost"].astype(str).str.replace(r"[^\d.]", "", regex=True)

# astype(str) turns a missing cost into the literal "nan", which the regex then
# reduces to an empty string. Put NaN back for those entries (and for any value
# that contained no digits at all) so that isnull()/dropna() still treat them
# as missing instead of accepting "" as a valid cost.
has_cost = df["cost"].notna() & cost_text.ne("")
df["cost"] = cost_text.where(has_cost)

print(df["cost"].head())

0    200
1    200
2    100
3    250
4    250
Name: cost, dtype: object


In [5]:
# Turn the textual rating bucket (e.g. "50+ ratings") into a float.
# Text without any digits (e.g. "Too Few Ratings") becomes NaN.
# With two capture groups, extract returns a frame: column 0 holds the digits,
# column 1 holds an optional thousands suffix ("K"/"k") right after them.
count_parts = df["rating_count"].str.extract(r"(\d+)\s*([Kk])?", expand=True)
count_value = count_parts[0].astype(float)

# Scale by 1000 only where a "K" suffix was captured; a value such as
# "1K+ ratings" would otherwise be read as 1.0 instead of 1000.0.
# NOTE(review): the visible sample only shows plain counts ("20+", "50+",
# "100+"); for those the result is identical to extracting the digits alone.
df["rating_count"] = count_value.where(count_parts[1].isna(), count_value * 1000)


In [6]:
# Convert rating to numeric first; non-numeric placeholders such as "--"
# are coerced to NaN.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Fill each missing rating with the mean rating of the rows that share the
# same restaurant name. The built-in "mean" transform is used instead of a
# per-group Python lambda, so no Python function is invoked once per name.
# Names whose ratings are all missing have a NaN mean and therefore stay NaN;
# ratings that are already present are never modified by fillna.
name_mean_rating = df.groupby("name")["rating"].transform("mean")
df["rating"] = df["rating"].fillna(name_mean_rating)
print(df["rating"].head())

0    NaN
1    4.4
2    3.8
3    3.7
4    4.0
Name: rating, dtype: float64


In [7]:
# Split the comma-separated cuisine string into a list, then emit one row per
# cuisine. The original row index is kept, so it repeats for multi-cuisine rows.
df = df.assign(cuisine=df["cuisine"].str.split(",")).explode("cuisine")


In [8]:
# Count missing values per column.
df.isna().sum()



id                   0
name                86
city                 0
rating          130387
rating_count    148102
cost                 0
cuisine             99
lic_no             316
link                 0
address             86
menu                 0
dtype: int64

In [9]:
def _print_duplicate_summary(frame, heading):
    """Print the heading followed by the duplicate-row count and total row count of ``frame``."""
    print(heading)
    print("Duplicate rows:", frame.duplicated().sum())
    print("Total rows:", len(frame))


# Report the counts, remove fully duplicated rows, then report the counts again.
_print_duplicate_summary(df, "=== BEFORE ===")
df = df.drop_duplicates()
_print_duplicate_summary(df, "\n=== AFTER ===")

=== BEFORE ===
Duplicate rows: 0
Total rows: 256611

=== AFTER ===
Duplicate rows: 0
Total rows: 256611


In [10]:
def _fill_with_own_mean(ratings):
    """Return ``ratings`` with its missing entries replaced by the mean of the same series."""
    return ratings.fillna(ratings.mean())


# Ensure the rating column is numeric; unparseable entries become NaN.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Step 1: how many ratings are missing before imputation.
missing_before = df["rating"].isnull().sum()
print("Rating null BEFORE:", missing_before)

# Step 2: within each restaurant name, replace missing ratings by that name's mean.
ratings_by_name = df.groupby("name")["rating"]
df["rating"] = ratings_by_name.transform(_fill_with_own_mean)

# Fallback for names whose ratings were all missing: use the overall mean,
# computed after the per-name fill above.
overall_mean = df["rating"].mean()
df["rating"] = df["rating"].fillna(overall_mean)

# Step 3: how many ratings are still missing after imputation.
missing_after = df["rating"].isnull().sum()
print("Rating null AFTER:", missing_after)

# Step 4: drop every row that still has a missing value in any column.
df = df.dropna()

# Step 5: per-column missing-value summary of the cleaned frame.
print("\nNull values column-wise AFTER cleaning:")
remaining_nulls = df.isnull().sum()
print(remaining_nulls)


Rating null BEFORE: 130387
Rating null AFTER: 0

Null values column-wise AFTER cleaning:
id              0
name            0
city            0
rating          0
rating_count    0
cost            0
cuisine         0
lic_no          0
link            0
address         0
menu            0
dtype: int64


In [11]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
1,531342,Janta Sweet House,Abohar,4.4,50.0,200,Sweets,12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
1,531342,Janta Sweet House,Abohar,4.4,50.0,200,Bakery,12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100.0,100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20.0,250,Fast Food,22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
3,187912,Singh Hut,Abohar,3.7,20.0,250,Indian,22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json


In [12]:
# Show the dtype of every column before the explicit type conversion below.
df.dtypes

id                int64
name             object
city             object
rating          float64
rating_count    float64
cost             object
cuisine          object
lic_no           object
link             object
address          object
menu             object
dtype: object

In [13]:
# Column groups used for the dtype conversion in this cell.
categorical_cols = ["name", "city", "cuisine"]
numerical_cols = ["rating", "rating_count", "cost"]

# Dtypes before conversion.
print("=== BEFORE DATA TYPES ===")
print(df.dtypes)

# Each categorical column becomes a pandas category with its own set of levels.
for cat_col in categorical_cols:
    df[cat_col] = df[cat_col].astype("category")

# Each numerical column is parsed as a number; values that cannot be parsed become NaN.
for num_col in numerical_cols:
    df[num_col] = pd.to_numeric(df[num_col], errors="coerce")

# Dtypes after conversion.
print("\n=== AFTER DATA TYPES ===")
print(df.dtypes)


=== BEFORE DATA TYPES ===
id                int64
name             object
city             object
rating          float64
rating_count    float64
cost             object
cuisine          object
lic_no           object
link             object
address          object
menu             object
dtype: object

=== AFTER DATA TYPES ===
id                 int64
name            category
city            category
rating           float64
rating_count     float64
cost             float64
cuisine         category
lic_no            object
link              object
address           object
menu              object
dtype: object


In [14]:
# Write the cleaned frame to disk without the row index, then confirm the location.
save_path = r"D:\python_programs\Swiggy Recommendation\cleaned_data.csv"
df.to_csv(path_or_buf=save_path, index=False)

print(f"\nCleaned CSV saved at: {save_path}")


Cleaned CSV saved at: D:\python_programs\Swiggy Recommendation\cleaned_data.csv
