In [1]:
# pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

In [3]:
# Read the raw Swiggy restaurant listing from disk and preview its first three rows.
swiggy_csv = './dataset/swiggy.csv'
df_swiggy = pd.read_csv(swiggy_csv)
df_swiggy.head(n=3)

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json


In [4]:
# Clean the text columns on a copy so the raw frame `df_swiggy` stays untouched.
df = df_swiggy.copy()

# Every pattern below is literal text, not a regular expression. '+' is a regex
# metacharacter and the default of `regex` in Series.str.replace has changed across
# pandas releases, so regex=False is passed explicitly to keep the result
# independent of the installed pandas version.

# '--' is the placeholder seen in the rating column of the preview; map it to '0'.
df['rating'] = df['rating'].str.replace('--', '0', regex=False)

# Drop the currency symbol and the whitespace left around the amount.
df['cost'] = df['cost'].str.replace('₹', '', regex=False).str.strip()

# Turn labels such as '50+ ratings' or 'Too Few Ratings' into a bare digit string.
# NOTE(review): 'K' -> '000' is only right for whole-number values such as '1K+';
# a value like '1.5K' would need different handling — confirm against the data.
df['rating_count'] = (
    df['rating_count']
    .str.replace('+', '', regex=False)
    .str.replace('K', '000', regex=False)
    .str.replace('Too Few Ratings', '0', regex=False)
    .str.replace('ratings', '', regex=False)
    .str.strip()
)

In [5]:
# Drop incomplete rows *before* casting. Filling a missing `cost` with 0 would keep
# those restaurants with a price of 0, and `cost` is what the regression target is
# derived from. Rows with a missing `rating_count` are removed here as well; they
# would otherwise become 0 and be discarded by the `> 0` filter below anyway.
df = df.dropna().copy()

# Convert the cleaned strings to numeric dtypes.
df['rating'] = df['rating'].astype(float)
df['rating_count'] = df['rating_count'].astype(int)
df['cost'] = df['cost'].astype(int)

# Keep only restaurants with at least one counted rating. The trailing .copy()
# yields an independent frame, so later column assignments do not act on a slice.
df = df[df['rating_count'] > 0].copy()
df.head(5)

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
1,531342,Janta Sweet House,Abohar,4.4,50,200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100,100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20,250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
5,158204,Sam Uncle,Abohar,3.6,20,200,Continental,22119652000052,https://www.swiggy.com/restaurants/sam-uncle-c...,"Sam Uncle, hanumangarh road near raja bajaj sh...",Menu/158204.json
6,156588,shere punjab veg,Abohar,4.0,100,150,North Indian,22120652000021,https://www.swiggy.com/restaurants/shere-punja...,"shere punjab veg, major surinder chowk near ve...",Menu/156588.json


In [6]:
# Derive the regression target and a rating-based popularity signal.
# NOTE(review): halving presumably assumes the listed cost covers two diners — confirm.
per_person_cost = df['cost'] / 2
# log1p dampens the influence of very large rating counts on the product.
popularity_score = df['rating'] * np.log1p(df['rating_count'])

df['cost_per_person'] = per_person_cost
df['popularity'] = popularity_score

In [7]:
# One encoder per categorical column; both are kept as named objects because they
# are persisted alongside the model at the end of the notebook.
le_city = LabelEncoder()
le_cuisine = LabelEncoder()

# Fit each encoder on its column and store the integer codes as '<column>_encoded'.
for column, encoder in (('city', le_city), ('cuisine', le_cuisine)):
    df[f'{column}_encoded'] = encoder.fit_transform(df[column])

In [8]:
# Name of the column the model predicts.
target = 'cost_per_person'

# Model inputs, in the column order used to build the design matrix.
features = [
    'city_encoded',
    'cuisine_encoded',
    'rating',
    'rating_count',
    'popularity',
]

In [9]:
# Build the design matrix and target vector, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split reproducible.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardise the features. The scaler learns its statistics from the training split
# only (fit_transform runs first) and then applies them unchanged to the test split.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

In [11]:
# Train a 100-tree random forest on the scaled training data; fit() returns the
# estimator itself, so construction and training are chained. The seed is fixed
# for reproducibility.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)
model

In [13]:
# Score the held-out split: predict once, then evaluate both metrics on the same
# predictions (mean squared error first, R^2 second).
y_pred = model.predict(X_test_scaled)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

In [14]:
# Report the evaluation metrics, one per line.
for report_line in (f"Mean Squared Error: {mse}", f"R-squared Score: {r2}"):
    print(report_line)

Mean Squared Error: 6411.235340040221
R-squared Score: 0.13276987632485082


In [15]:
# Persist the train/test arrays and the fitted objects so they can be reloaded
# without re-running the notebook.

# np.save does not create missing directories; make sure the target folder exists
# so this cell also works on a fresh checkout.
os.makedirs('./feature', exist_ok=True)

np.save('./feature/X_train_scaled.npy', X_train_scaled)
np.save('./feature/X_test_scaled.npy', X_test_scaled)
# y_train / y_test are pandas Series; np.save stores their values only (the index is not kept).
np.save('./feature/y_train.npy', y_train)
np.save('./feature/y_test.npy', y_test)

# The model, both label encoders and the scaler are written to the working directory.
joblib.dump(model, 'random_forest_model.joblib')
joblib.dump(le_city, 'le_city.pkl')
joblib.dump(le_cuisine, 'le_cuisine.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']