In [3]:
# Import necessary libraries
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [4]:
# Step 2: Load Dataset
# The CSV location is taken from the RESTAURANT_DATASET_PATH environment
# variable when it is set; otherwise the original local path is used as the
# default, so the notebook can run on other machines without editing this cell.
DATASET_PATH = os.environ.get(
    'RESTAURANT_DATASET_PATH',
    r'C:\Users\VSS\Desktop\ML projects\Dataset.csv',
)
data = pd.read_csv(DATASET_PATH)

# Preview the first rows to confirm the file loaded as expected
data.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Restaurant ID,Country Code,Longitude,Latitude,Average Cost for two,Price range,Aggregate rating,Votes
count,9551.0,9551.0,9551.0,9551.0,9551.0,9551.0,9551.0,9551.0
mean,9051128.0,18.365616,64.126574,25.854381,1199.210763,1.804837,2.66637,156.909748
std,8791521.0,56.750546,41.467058,11.007935,16121.183073,0.905609,1.516378,430.169145
min,53.0,1.0,-157.948486,-41.330428,0.0,1.0,0.0,0.0
25%,301962.5,1.0,77.081343,28.478713,250.0,1.0,2.5,5.0
50%,6004089.0,1.0,77.191964,28.570469,400.0,2.0,3.2,31.0
75%,18352290.0,1.0,77.282006,28.642758,700.0,2.0,3.7,131.0
max,18500650.0,216.0,174.832089,55.97698,800000.0,4.0,4.9,10934.0


In [6]:
# Predictor columns passed to the model, and the column the model learns to predict
features = [
    'Cuisines',
    'City',
    'Average Cost for two',
    'Price range',
    'Votes',
    'Has Table booking',
    'Has Online delivery',
]
target = 'Aggregate rating'

In [7]:
# Count the nulls in every column and express each count as a share of all rows
missing_values = data.isna().sum()
missing_percentage = missing_values / data.shape[0] * 100

# Each report is printed as a heading line followed by the per-column Series
print("Missing Values Count:", missing_values, sep="\n")
print("\nMissing Values Percentage:", missing_percentage, sep="\n")

Missing Values Count:
Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

Missing Values Percentage:
Restaurant ID           0.000000
Restaurant Name         0.000000
Country Code            0.000000
City                    0.000000
Address                 0.000000
Locality                0.000000
Locality Verbose        0.000000
Longitude               0.000000
Latitude                0.000000
Cuisines                0.094231
Average Cost for two    0.000000
Currency                0.00

In [8]:
# Report how many nulls each selected feature column contains.
# Nothing is modified here: the gaps are filled later by the SimpleImputer
# steps inside the preprocessing pipelines.
print(data.loc[:, features].isna().sum())

Cuisines                9
City                    0
Average Cost for two    0
Price range             0
Votes                   0
Has Table booking       0
Has Online delivery     0
dtype: int64


In [9]:
# Column groups for preprocessing: each group gets its own transformer
numeric_features = [
    'Average Cost for two',
    'Price range',
    'Votes',
]
categorical_features = [
    'Cuisines',
    'City',
    'Has Table booking',
    'Has Online delivery',
]

In [10]:
# Numeric columns: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [11]:
# Route each column group through its matching transformer pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [12]:
# Separate predictors from the target, then hold out 20% of the rows for
# evaluation; the fixed seed keeps the split reproducible between runs
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Chain preprocessing and the random forest into a single estimator so the
# same transformations are applied whenever the model is fitted or used
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [14]:
# Train the model
# Fits the whole pipeline (preprocessing followed by the regressor) on the training split only
model.fit(X_train, y_train)


In [15]:
# Make predictions
# Predict ratings for the held-out test split using the fitted pipeline
y_pred = model.predict(X_test)


In [16]:
# Score the test-set predictions against the true ratings
mse, r2 = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

for report_line in (f"Mean Squared Error: {mse}", f"R-squared: {r2}"):
    print(report_line)

Mean Squared Error: 0.10228937065257028
R-squared: 0.9550595463187104


In [17]:
# Analyze feature importance (Random Forest feature importance)
fitted_forest = model.named_steps['regressor']
fitted_preprocessor = model.named_steps['preprocessor']
# Look the encoder up by name ('cat' transformer, 'onehot' step) rather than by position
onehot_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']

importances = fitted_forest.feature_importances_
# The transformed columns are the numeric features followed by the one-hot
# columns, so the labels are assembled in that same order
encoded_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(encoded_names)
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print("Feature Importances:\n", feature_importances)

Feature Importances:
 Votes                                            0.950111
Average Cost for two                             0.008908
Price range                                      0.003042
City_New Delhi                                   0.002346
City_Noida                                       0.001736
                                                   ...   
Cuisines_Fast Food, South Indian                 0.000000
Cuisines_Tibetan                                 0.000000
Cuisines_Assamese                                0.000000
Cuisines_Tibetan, Chinese, North Indian          0.000000
Cuisines_Turkish, Arabian, Moroccan, Lebanese    0.000000
Length: 1674, dtype: float64



**The feature importance analysis shows that the Votes feature has the highest importance by far, with a value of 0.950, indicating it plays a significant role in predicting restaurant ratings. This makes sense, as more votes (or reviews) usually correlate with higher confidence in the aggregate rating.**

***Key Insights from Feature Importances:***
**Votes:** The most critical feature, accounting for about 95% of the model's total feature importance. This suggests that restaurants with more customer feedback tend to have more reliable ratings.

**Average Cost for Two & Price Range:** These have lower importance (0.0089 and 0.0030), suggesting that while price-related features do have an effect, they aren't as influential as the number of votes.

**City:** Cities like New Delhi and Noida have some influence, but their impact is quite small compared to votes.

**Cuisines:** Interestingly, many of the one-hot-encoded cuisine values have zero importance in the model. This might suggest that cuisine types don't have a direct effect on the ratings, or that their impact is overshadowed by other variables like votes.

In [18]:
# Persist the fitted pipeline (preprocessing + regressor) to disk so it can be
# reloaded later without retraining. joblib is imported with the other
# dependencies in the import cell at the top of the notebook.
joblib.dump(model, 'restaurant_rating_predictor.pkl')


['restaurant_rating_predictor.pkl']