## Table of Contents
1. [Introduction](#Introduction)
2. [Exploratory Data Analysis](#Exploratory-Data-Analysis)
3. [Modeling](#Modeling)
4. [Results](#Results)
5. [Conclusion](#Conclusion)


## Introduction

In [82]:
#importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [83]:
#reading the data

df=pd.read_csv("data/zomato_df_final_data.csv")

In [None]:
#inspect the first few rows of the data

print(df.head())



In [None]:
#check all the column names

print(df.columns)

## Exploratory Data Analysis

In [None]:
#checking the numerical and categorical variables

num_var=df.select_dtypes(include="number").columns
cat_var=df.select_dtypes(exclude="number").columns

print(num_var)
print(cat_var)

In [None]:
#checking for the NA values

print(df.isna().sum())

In [None]:
sns.heatmap(df.isnull(), yticklabels=False,cbar=False,cmap="viridis")

In [None]:
#detecting outliers in data

#list of numerical variables
numerical_cols=df[num_var]

#loop through each numerical columns to detect outliers

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3-Q1

    #define the bound for ouliers
    lower_bound= Q1 - 1.5*IQR
    upper_bound= Q3 + 1.5*IQR

    #outlier
    outliers=df[(df[col] < lower_bound) | (df[col]> upper_bound)]

    print(f"Outliers in {col}:")
    print(outliers)
    print("\n")

In [None]:
#visualizing outliers sing boxplot 

#creating subplot axes
nrows=2
ncols=3
fig, axes =plt.subplots(nrows,ncols, figsize=(15,10))

# Flatten the axes array to make it easier to iterate over
axes = axes.flatten()

# use to set style of background of plot
sns.set(style="whitegrid")

# Define properties for the outliers (fliers)
flierprops = dict(marker='o', markerfacecolor='red', markersize=8, linestyle='none')


for i, col in enumerate(numerical_cols):
    sns.boxplot(y=df[col], ax=axes[i], color="skyblue",flierprops=flierprops)
    axes[i].set_title(f'Boxplot of {col}')

plt.tight_layout
plt.show()


#### How many unique cuisines are served by Sydney restaurants?

In [None]:
# Ensure the 'cuisine' column is of string type
df['cuisine'] = df['cuisine'].astype(str)

# Now clean and split the cuisines
df['cuisine'] = df['cuisine'].str.strip("[]").str.replace("'", "").str.split(',')

# Use explode to flatten the list of cuisines into individual rows
df_exploded = df.explode('cuisine')

# Strip whitespace from cuisine names and count unique cuisines
df_exploded['cuisine'] = df_exploded['cuisine'].str.strip()
unique_cuisines = df_exploded['cuisine'].nunique()

# Optional: List the unique cuisines
cuisine_list = df_exploded['cuisine'].unique()

print(f"There are {unique_cuisines} unique cuisines served by Sydney restaurants.")


In [11]:
cuisine_counts=df_exploded["cuisine"].value_counts()

In [None]:
#plottong the top 15 cuisines offered by sydney restaurants

plt.figure(figsize=(12, 8))
colors = sns.color_palette("tab10", n_colors=25)
sns.barplot(y=cuisine_counts.index[:25], x=cuisine_counts.values[:25],palette=colors)  
plt.title('Top 25 Most Popular Cuisines')
plt.xlabel('Number of Restaurants')
plt.ylabel('Cuisine')
plt.show()

#### Which suburbs (top 3) have the highest number of restaurants?

In [None]:
df["subzone"].nunique()

In [None]:
restaurant_suburb=df["subzone"].value_counts()

#getting the top 3 suburbs with the highest number of restaurants
top_3_suburbs = restaurant_suburb.head(3)

#result
print("Top 3 suburbs with the highest number of restaurants:")
print(top_3_suburbs)

In [None]:
#visualizing the top 3 suburbs
plt.figure(figsize=(12,12))
sns.barplot(x=top_3_suburbs.index[:3],y=top_3_suburbs.values[:3],palette="tab10")
plt.title("Top 3 Suburbs with highest number of restaurants")
plt.xlabel("Suburbs")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

#### “Restaurants with ‘excellent’ ratings are mostly costly while those with ‘Poor’ ratings are rarely expensive”.

To answer this if the above statement is true or not, first we will make a boxplot to see the cost distribution for each rating category.

In [None]:
plt.figure(figsize=(10,8))

# Create a boxplot to show the distribution of costs by rating
sns.boxplot(x=df['rating_text'], y=df['cost'], palette="tab10")

plt.xlabel("Ratings")
plt.ylabel("Cost")
plt.title("Boxplot of Cost Distribution by Ratings")
plt.show()


In [None]:
plt.figure(figsize=(10, 8))

# Use histplot with stacking and explicitly define legend=True to make sure legend is created
sns.histplot(data=df, x="cost", hue="rating_text", kde=True, multiple="stack",bins=30)

# Limit x-axis for better focus on the range
plt.xlim(0, 200)

# Add title and labels
plt.title('Stacked Histogram of Cost by Rating', fontsize=16)
plt.xlabel('Cost', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()



#### Key observations:

-"**Excellent**" ratings: These restaurants tend to have a higher median cost, with the boxplot showing that many of these restaurants fall in the higher cost range. The interquartile range (IQR) is higher compared to other ratings, indicating that restaurants with "Excellent" ratings tend to be more expensive on average.There are outliers above 200, indicating some restaurants with "Excellent" ratings have very high costs, with some reaching around 500. 

-"**Poor**" ratings:The median cost for "Poor" rated restaurants is 50, which is higher than "Average" rated restaurants.The IQR is slightly larger compared to "Average" rated restaurants, meaning there’s more cost variation among "Poor" rated restaurants.A few outliers show some "Poor" rated restaurants can have higher costs, but most are still relatively affordable.

-"**Very Good**" ratings:The median cost for "Very Good" rated restaurants is lower than "Excellent" rated ones.The IQR is smaller, indicating less variation in restaurant costs. The box is mostly concentrated below 100, suggesting that most "Very Good" rated restaurants are moderately priced.There are fewer outliers compared to "Excellent" rated restaurants, but still a few higher-cost restaurants in this category.

-"**Good**" ratings:The median cost is lower than "Very Good" and "Excellent" but still close to 50.The IQR is small, indicating that most "Good" rated restaurants are affordable, with costs primarily under 100.There are a few outliers above 200, but most restaurants are in a lower price range.

-"**Average**" ratings:The median cost is close to 45, making it one of the lower-cost categories.The IQR is small, meaning that there’s little variation in the cost for "Average" rated restaurants.There are a few outliers, but the majority of restaurants are in the affordable range, under 100.

To further quantify, we can create a summary statistics: 

In [None]:
# Summary statistics for each rating category
cost_summary = df.groupby('rating_text')['cost'].agg(["mean","median"])

print("Median cost per rating category:")
print(cost_summary)


Based on the boxplot and summary statistics, I agree with the statement. Restaurants with "Excellent" ratings are generally more expensive, as reflected by the higher median cost and the presence of more high-cost outliers. In contrast, "Poor" rated restaurants tend to have lower costs, with fewer high-cost outliers, supporting the idea that they are generally less expensive.


### Univariate Analysis

#### Cost Variable

In [None]:
#creating histogram to see the distribution of the data
plt.figure(figsize=(9,8))
ax=sns.histplot(data=df,x="cost", bins=30, kde=True, color="orange",edgecolor="red")
ax.lines[0].set_color("green")
plt.title("Distibution of Cost Variable")
plt.xlabel("Cost")
plt.xlim(0, 200)
plt.show()

#### Key Interpretation:

- The histogram shows that the majority of restaurant costs are clustered between 20 and 50. 
- This can be observed from the tall bars in this range, with the highest peak around 25.
- The distribution is right-skewed, meaning there are fewer restaurants with higher costs (over 100), as indicated by the long tail on the right side.
- The green KDE curve (Kernel Density Estimate) overlays the histogram, providing a smooth representation of the probability density. It confirms that most restaurants have costs concentrated between 20 and 50.
- There are some outliers in the dataset, as seen from the bars on the right-hand side that go beyond 100, with a few even reaching as high as 200. This indicates a small proportion of restaurants with much higher costs, but they are very uncommon.

#### Rating Variable

In [None]:

# Creating a countplot to show the count of each rounded rating number
plt.figure(figsize=(10, 6))
sns.countplot(x='rating_number', data=df, palette='Set2')
plt.title('Count of Restaurants by Rating (0-5 Scale)')
plt.xlabel('Ratings')
plt.ylabel('Number of Restaurants')
plt.ylim(0, 1000) 
plt.xticks(rotation=45)
plt.show()


#### Key Interpretation:

- Most restaurants are rated between 3.0 and 3.5, suggesting that the majority are considered average to slightly above average in quality.
- Extremely low or high ratings are relatively rare, meaning most restaurants avoid being rated as exceptionally bad or outstanding.
- This distribution shows that customer ratings tend to cluster around the middle, and it's harder for restaurants to achieve exceptional or very poor ratings.

#### Type Variable

In [None]:
print(df["type"].isna().sum())

In [22]:
# Ensure the 'cuisine' column is of string type
df['type'] = df['type'].astype(str)

# Now clean and split the cuisines
df['type'] = df['type'].str.strip("[]").str.replace("'", "").str.split(',')

# Use explode to flatten the list of cuisines into individual rows
df_exploded_2 = df.explode('type')

# Strip whitespace from cuisine names and count unique cuisines
df_exploded_2['type'] = df_exploded_2['type'].str.strip()
#unique_cuisines = df_exploded['cuisine'].nunique()

# Optional: List the unique cuisines
#cuisine_list = df_exploded['cuisine'].unique()

type_count=df_exploded_2["type"].value_counts()

In [None]:
plt.figure(figsize=(10,8))
sns.barplot(x=type_count.values[:15],y=type_count.index[:15],palette="tab10")
plt.title("Barplot of types of restaurant in Sydney")
plt.xlabel("Count")
plt.ylabel("Types of restaurants")
plt.show()

## Modelling

In [84]:
features_model=df[["cost","cuisine","subzone","type","votes","cost_2","rating_number"]]

In [85]:
print(features_model.shape)

(10500, 7)


In [86]:
features_model=features_model.dropna(subset=["rating_number"])

In [87]:
print(features_model.shape)

(7184, 7)


#### Declare source and target variables

In [88]:
X=features_model.drop(["rating_number"], axis=1)
y=features_model["rating_number"]

In [89]:
print(X.shape)
print(y.shape)

(7184, 6)
(7184,)


#### Spliting the data into train and test set

In [90]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train.shape, X_test.shape

((5747, 6), (1437, 6))

### Feature Engineering

In [91]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

print(num_features)
print(cat_features)

Index(['cost', 'votes', 'cost_2'], dtype='object')
Index(['cuisine', 'subzone', 'type'], dtype='object')


#### Engineering missing values in numerical variables

In [92]:
print(X_train[num_features].dtypes)


cost      float64
votes     float64
cost_2    float64
dtype: object


In [93]:
#checking for missing values in train set
print(X_train[num_features].isnull().sum())

cost      83
votes      0
cost_2    83
dtype: int64


In [94]:
#checking for missing values in test set
print(X_test[num_features].isnull().sum())

cost      20
votes      0
cost_2    20
dtype: int64


In [95]:
# percentage of missing values in each variable
print(round(X_train[num_features].isnull().mean(), 2))

cost      0.01
votes     0.00
cost_2    0.01
dtype: float64


In [96]:
#imputaion of missing values with the median value

imputer = SimpleImputer(strategy="median")
X_train[num_features]=imputer.fit_transform(X_train[num_features])
X_test[num_features]=imputer.transform(X_test[num_features])

print("Result for training set in numerical variables")
print(X_train[num_features].isnull().sum())

print("Result for test set in numerical variables")
print(X_test[num_features].isnull().sum())

Result for training set in numerical variables
cost      0
votes     0
cost_2    0
dtype: int64
Result for test set in numerical variables
cost      0
votes     0
cost_2    0
dtype: int64


SimpleImputer with the median strategy has been used to handle missing values in the numerical columns. This approach ensures that missing values are replaced with the median calculated from the training data, which is robust to outliers.

I have fitted the imputer on the training data to avoid data leakage, meaning there is no use of any information from the test set while training. After fitting the imputer, same median values have been applied to both the training and test data, ensuring consistency in how missing values are handled.

#### Engineering missing values in categorical variables

In [97]:
#checking the missing values in of categorical variables in train set
print(X_train[cat_features].isnull().sum())

cuisine     0
subzone     0
type       19
dtype: int64


In [98]:
#checking the missing values in of categorical variables in test set
print(X_test[cat_features].isnull().sum())

cuisine    0
subzone    0
type       2
dtype: int64


SimpleImputer with the "most frequent" strategy was used to handle missing values in categorical variables. This filled missing values with the most commonly occurring category (mode) in each column.

The imputer was fitted on the training data and applied to both training and test sets, ensuring that the imputation was based on the training data only, preventing data leakage and maintaining consistency in handling missing values

In [99]:
X_train_clean = X_train.dropna(subset=['type'])
X_test_clean = X_test.dropna(subset=['type'])



In [100]:
# Check the new shape after dropping rows
print(f"New shape of X_train after dropping rows with NaN in 'type': {X_train_clean.shape}")
print(f"New shape of X_test after dropping rows with NaN in 'type': {X_test_clean.shape}")

New shape of X_train after dropping rows with NaN in 'type': (5728, 6)
New shape of X_test after dropping rows with NaN in 'type': (1435, 6)


In [101]:
#checking the missing values in of categorical variables in train set
print(X_train[cat_features].isnull().sum())

cuisine     0
subzone     0
type       19
dtype: int64


In [None]:
#checking the missing values in of categorical variables in test set
print(X_test[cat_features].isnull().sum())

#### Engineering outliers in numerical variables

In [78]:

# Function to replace outliers with maximum values based on the upper bound
def max_value(df_temp, variable, upper_bound):
    return np.where(df_temp[variable] > upper_bound, upper_bound, df_temp[variable])

# Dictionary to hold the calculated upper bounds for each column
cols_with_outliers = {}

# List of numerical columns in the dataset
numerical_cols = X_train.select_dtypes(include=[np.number]).columns

# Calculate the IQR for each numerical column and store the upper bounds
for col in numerical_cols:
    Q1 = X_train[col].quantile(0.25)
    Q3 = X_train[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define the upper bound as Q3 + 1.5 * IQR (standard IQR rule)
    upper_bound = Q3 + 1.5 * IQR
    
    # Store the upper bound in the dictionary
    cols_with_outliers[col] = upper_bound

# Apply the upper bounds to cap outliers in both X_train and X_test
for df_temp in [X_train, X_test]:
    for col in cols_with_outliers:
        df_temp[col] = max_value(df_temp, col, cols_with_outliers[col])

# Now the outliers in both X_train and X_test have been capped based on the IQR method


In [None]:
# we can also use seaborn library to plot elegant ones
df_custom = X_train[["cost","votes","cost_2"]]
plt.figure(figsize=(15,10))
ax = sns.boxplot(data=df_custom, orient="h", palette="Set2")

#### Encoding the categorical variables

In [None]:
import ast
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

# Sample DataFrame with categorical columns
# X_train and X_test assumed to have 'cuisine', 'subzone', and 'type' as columns

# Step 1: Convert 'cuisine' and 'type' from string to list format
# Using ast.literal_eval to safely convert strings to lists
X_train['cuisine'] = X_train['cuisine'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
X_train['type'] = X_train['type'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

X_test['cuisine'] = X_test['cuisine'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
X_test['type'] = X_test['type'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Now, the rest of your process can continue as is, e.g., normalizing the lists, binarizing, and encoding


# Step 2: Normalize the 'cuisine' and 'type' columns (lowercase and strip whitespace)
X_train['cuisine'] = X_train['cuisine'].apply(lambda lst: [item.strip().lower() for item in lst])
X_train['type'] = X_train['type'].apply(lambda lst: [item.strip().lower() for item in lst])

X_test['cuisine'] = X_test['cuisine'].apply(lambda lst: [item.strip().lower() for item in lst])
X_test['type'] = X_test['type'].apply(lambda lst: [item.strip().lower() for item in lst])

# Step 3: Apply MultiLabelBinarizer to 'cuisine' and 'type'
mlb_cuisine = MultiLabelBinarizer()
mlb_type = MultiLabelBinarizer()

# Fit on the train set and transform both the train and test sets
X_train_cuisine_encoded = pd.DataFrame(mlb_cuisine.fit_transform(X_train['cuisine']), columns=mlb_cuisine.classes_, index=X_train.index)
X_test_cuisine_encoded = pd.DataFrame(mlb_cuisine.transform(X_test['cuisine']), columns=mlb_cuisine.classes_, index=X_test.index)

X_train_type_encoded = pd.DataFrame(mlb_type.fit_transform(X_train['type']), columns=mlb_type.classes_, index=X_train.index)
X_test_type_encoded = pd.DataFrame(mlb_type.transform(X_test['type']), columns=mlb_type.classes_, index=X_test.index)

# Step 4: Apply OneHotEncoder to 'subzone'
onehot_encoder = OneHotEncoder(sparse=False, drop='first')  # Using drop='first' to avoid multicollinearity
subzone_encoded_train = onehot_encoder.fit_transform(X_train[['subzone']])
subzone_encoded_test = onehot_encoder.transform(X_test[['subzone']])

# Convert the one-hot encoded data to DataFrame
subzone_encoded_train_df = pd.DataFrame(subzone_encoded_train, columns=onehot_encoder.get_feature_names_out(['subzone']), index=X_train.index)
subzone_encoded_test_df = pd.DataFrame(subzone_encoded_test, columns=onehot_encoder.get_feature_names_out(['subzone']), index=X_test.index)

# Step 5: Concatenate the encoded columns back to the original data
X_train_encoded = pd.concat([X_train, X_train_cuisine_encoded, X_train_type_encoded, subzone_encoded_train_df], axis=1)
X_test_encoded = pd.concat([X_test, X_test_cuisine_encoded, X_test_type_encoded, subzone_encoded_test_df], axis=1)

# Optionally, drop the original 'cuisine', 'type', and 'subzone' columns if no longer needed
X_train_encoded.drop(columns=['cuisine', 'type', 'subzone'], inplace=True)
X_test_encoded.drop(columns=['cuisine', 'type', 'subzone'], inplace=True)

# Final shape check
print(f"Shape of X_train_encoded: {X_train_encoded.shape}")
print(f"Shape of X_test_encoded: {X_test_encoded.shape}")

# Check the first few rows to confirm
#print(X_train_encoded.head(3))
#print(X_test_encoded.head(3))


In [None]:

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the numeric columns in X_train, and transform in X_test
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# Check the processed datasets
print(X_train.shape)
print(X_test.shape)




### Linear Regression

In [50]:
model=LinearRegression()


In [None]:
model.fit(X_train,y_train)


In [None]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


In [None]:
model_train_mae=mean_squared_error(y_train, y_train_pred)
model_test_mae=mean_squared_error(y_test, y_test_pred)


In [None]:
print('Model performance for Training set')
print("Mean Squared Error:", format(model_train_mae))


In [None]:

# Calculate Root Mean Squared Error
rmse = np.sqrt(model_test_mae)


In [None]:

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)


In [None]:

# Print results
print("R-squared: %.4f" % r_squared)
print("Mean Squared Error: %.4f" % score)
print("Root Mean Squared Error: %.4f" % rmse) 
print("\n")
print("-----------------------------------------------")
print("\n")
print('Model performance for Testing set')
print("Mean Squared Error:", format(model_test_mae))