# Libraries and Modules

In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Models
## Linear Regression is used as a baseline model
from sklearn.linear_model import LinearRegression

## Random Forest, XGB and Gradient Boosting are experimented for better performance
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

## Support Vector Machine is useful for non-linear data
from sklearn.svm import SVR


# Model Parameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Model Evaluation
## For Regression Models we use MAE, MSE, RMSE, R2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Recommendations

Begin with Linear Regression as a baseline.

Experiment with Random Forest and XGBoost/LightGBM for better performance.

If you suspect non-linear relationships, try SVR or Neural Networks.

Use Ensemble methods for combining predictions from multiple models.

# Data Import

In [3]:
# Load the test and train CSV files from the local 'Dataset' folder.
# Forward slashes are accepted as path separators on Windows, Linux and macOS,
# whereas a backslash only acts as a separator on Windows.
df_test = pd.read_csv('Dataset/Test_dataset.csv')
df_train = pd.read_csv('Dataset/Train_dataset.csv')

# Exploratory Data Analysis (EDA)

## 1. Data Structure
- Columns: Check column data, compare test with train data
- Data Types: Check the data types of each column.
- Missing Values: Identify and handle missing values.
- Basic Statistics: Calculate basic statistics like mean, median, standard deviation, etc.

### Columns

In [36]:
# Collect the column labels of each dataset as sets so they can be diffed.
train_columns = set(df_train.columns)
test_columns = set(df_test.columns)

# Labels found only in train, and labels found only in test.
missing_in_test = train_columns.difference(test_columns)
missing_in_train = test_columns.difference(train_columns)

# Report the train-only columns (or that there are none).
if not missing_in_test:
    print("No columns are missing in test dataset.")
else:
    print("Columns present in train but missing in test:")
    print(missing_in_test)

# Report the test-only columns (or that there are none).
if not missing_in_train:
    print("No columns are missing in train dataset.")
else:
    print("Columns present in test but missing in train:")
    print(missing_in_train)

Columns present in train but missing in test:
{'Endorsed By', 'Annual Turnover'}
Columns present in test but missing in train:
{'Endoresed By'}


In [37]:
# The test set spells this header 'Endoresed By'; give it the spelling used in train.
df_test = df_test.rename(columns={'Endoresed By': 'Endorsed By'})

### Data types

In [75]:
# Column dtypes of the train set, leaving out 'Annual Turnover' (absent from test).
dtype_train = df_train.dtypes.drop('Annual Turnover')

# Test dtypes, put in the same column order as the train dtypes.
dtype_test = df_test.dtypes.reindex(dtype_train.index)

# Rows of this frame are the columns whose dtype differs between the datasets.
dtype_diff = dtype_train.compare(dtype_test)

# Report the outcome.
if dtype_diff.empty:
    print("All columns have matching data types in train and test datasets.")
else:
    print("Differences in data types between train and test datasets:")
    print(dtype_diff)

All columns have matching data types in train and test datasets.


In [43]:
# Frequency of each 'Resturant Tier' value in the training set.
df_train['Resturant Tier'].value_counts()

Resturant Tier
2.0    3191
1.0     253
Name: count, dtype: int64

In [74]:
# Frequency of each 'Ambience' score in the training set, listed from the highest score down.
df_train['Ambience'].value_counts().sort_index(ascending=False)

Ambience
10.0    167
9.0     348
8.0     665
7.0     596
6.0     679
5.0     368
4.0     349
3.0     150
2.0      96
1.0      29
0.0      21
Name: count, dtype: int64

In [53]:
# Cast both columns of the test set to float in a single call so they match the train dtypes.
df_test = df_test.astype({'Resturant Tier': float, 'Ambience': float})

# Casting train['Ambience'] and train['Resturant Tier'] to int instead was not possible: they contain NaN values.

- Note:

Now that the data types are aligned, we can inspect the df_train data types, and any changes made will be replicated on df_test. This step could have been done before checking for differences, but I didn't think of it until now and don't want to change it.

In [63]:
# Display the training frame (the notebook renders a truncated preview).
df_train

Unnamed: 0,Registration Number,Annual Turnover,Cuisine,City,Restaurant Location,Opening Day of Restaurant,Facebook Popularity Quotient,Endorsed By,Instagram Popularity Quotient,Fire Audit,Liquor License Obtained,Situated in a Multi Complex,Dedicated Parking,Open Sitting Available,Resturant Tier,Restaurant Type,Restaurant Theme,Restaurant Zomato Rating,Restaurant City Tier,Order Wait Time,Staff Responsivness,Value for Money,Hygiene Rating,Food Rating,Overall Restaurant Rating,Live Music Rating,Comedy Gigs Rating,Value Deals Rating,Live Sports Rating,Ambience,Lively,Service,Comfortablility,Privacy
0,60001,42000000,"indian,irish",Bangalore,Near Business Hub,14/02/09,84.30,Not Specific,95.80,1,1,1,0,1,2.0,Bar,Arabian,3,0,2,5,5,5,7,10.0,4.0,,,,8.0,8,6,6,6
1,60002,50000000,"indian,irish",Indore,Near Party Hub,29/09/08,85.40,Tier A Celebrity,85.00,1,1,1,1,0,2.0,Bar,Greek,3,0,6,6,6,7,8,9.0,,4.0,,,5.0,7,7,3,8
2,60003,32500000,"tibetan,italian",Chennai,Near Business Hub,30/07/11,85.00,Tier A Celebrity,68.20,1,1,1,1,0,2.0,Bar,90's,3,0,2,6,5,3,7,8.0,3.0,,,,7.0,10,5,2,8
3,60004,110000000,"turkish,nigerian",Gurgaon,Near Party Hub,30/11/08,85.60,Tier A Celebrity,83.60,0,1,1,1,1,1.0,Bar,Arabian,3,1,7,6,5,6,8,9.0,6.0,,,,7.0,7,4,3,5
4,60005,20000000,"irish,belgian",Manesar,Near Party Hub,22/02/10,,Tier A Celebrity,76.80,1,1,1,1,1,,Bar,Greek,3,0,10,5,6,4,8,6.0,,2.0,,,,6,2,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488,63489,40500000,"algerian,belgian",-1,Near Party Hub,20/03/09,69.10,Not Specific,62.11,1,1,0,1,1,2.0,Bar,Arabian,4,0,3,5,5,6,6,9.0,5.0,,,,7.0,7,6,6,8
3489,63490,32500000,"tibetan,greek",Bangalore,Near Party Hub,05/02/12,91.00,Not Specific,96.30,1,1,0,1,0,2.0,Bar,Greek,3,0,1,4,5,7,8,,4.0,,,,4.0,9,4,0,5
3490,63491,42500000,"indian,irish",Chennai,Near Party Hub,21/05/09,80.83,Not Specific,86.80,1,1,1,1,0,2.0,Bar,Nature,3,1,8,5,4,6,6,8.0,,,,3.0,6.0,8,3,3,7
3491,63492,53000000,"japanese,thai",Bangalore,Near Party Hub,22/06/08,79.40,Not Specific,86.00,1,1,1,1,1,2.0,Bar,Greek,3,1,5,5,3,5,10,7.0,3.0,2.0,,,7.0,6,3,3,6


In [64]:
def change_col_type(col_name, dtype):
    """Cast column ``col_name`` to ``dtype`` in df_train and then in df_test.

    Both module-level frames are updated by assigning the converted column
    back under the same name.
    """
    for frame in (df_train, df_test):
        frame[col_name] = frame[col_name].astype(dtype)

Note: _Resturant Tier_ appears to correspond to _Restaurant City Tier_ (to be verified).

In [None]:
# 'Opening Day of Restaurant' is stored as dd/mm/yy text (e.g. '14/02/09').
# Parsing with an explicit format avoids day/month guessing on ambiguous values
# such as '05/02/12' and raises if a value does not follow the pattern.
df_train['Opening Day of Restaurant'] = pd.to_datetime(
    df_train['Opening Day of Restaurant'], format='%d/%m/%y'
)
df_test['Opening Day of Restaurant'] = pd.to_datetime(
    df_test['Opening Day of Restaurant'], format='%d/%m/%y'
)

# Flag columns that are converted to booleans in both datasets.
col_bool = ['Fire Audit', 'Liquor License Obtained', 'Situated in a Multi Complex', 'Dedicated Parking', 'Open Sitting Available']

for col in col_bool:
    change_col_type(col, 'bool')

In [73]:
# Inspect the dtype of every column in the test set.
df_test.dtypes

Registration Number                       int64
Cuisine                                  object
City                                     object
Restaurant Location                      object
Opening Day of Restaurant        datetime64[ns]
Facebook Popularity Quotient            float64
Endorsed By                              object
Instagram Popularity Quotient           float64
Fire Audit                                 bool
Liquor License Obtained                    bool
Situated in a Multi Complex                bool
Dedicated Parking                          bool
Open Sitting Available                     bool
Resturant Tier                          float64
Restaurant Type                          object
Restaurant Theme                         object
Restaurant Zomato Rating                  int64
Restaurant City Tier                      int64
Order Wait Time                           int64
Staff Responsivness                       int64
Value for Money                         

- Note:

Since the column types of both datasets now match, a final inspection of only one of them is needed.

### Missing Values


In [95]:
# (rows, columns) of the training set.
df_train.shape

(3493, 34)

In [83]:
# Number of missing values per column in the test set.
df_test.isna().sum()

Registration Number                0
Cuisine                            0
City                               0
Restaurant Location                0
Opening Day of Restaurant          0
Facebook Popularity Quotient       0
Endorsed By                        0
Instagram Popularity Quotient      0
Fire Audit                         0
Liquor License Obtained            0
Situated in a Multi Complex        0
Dedicated Parking                  0
Open Sitting Available             0
Resturant Tier                     0
Restaurant Type                    0
Restaurant Theme                   0
Restaurant Zomato Rating           0
Restaurant City Tier               0
Order Wait Time                    0
Staff Responsivness                0
Value for Money                    0
Hygiene Rating                     0
Food Rating                        0
Overall Restaurant Rating         34
Live Music Rating                102
Comedy Gigs Rating               370
Value Deals Rating               385
L

- Note: 

A deep dive into the data reveals that missing values for _Live Music Rating_, _Comedy Gigs Rating_, _Value Deals Rating_ and _Live Sports Rating_ mean the venue does not offer those services. **New indicator columns will capture this difference**. 

The missing **values will be kept as NaN for the time being**, since some models can handle them and others can't; later on, the values will be changed depending on each model's requirements.

Rows with missing _Overall Restaurant Rating_ values will be **dropped**.

In [106]:
# Create a new column indicating whether the service is available
def create_has_service(col_name):
    '''
    Add a boolean "Has <service>" indicator column to df_train and df_test.

    The new column name is 'Has ' followed by col_name with the ' Rating'
    suffix removed (e.g. 'Live Music Rating' -> 'Has Live Music'). The same
    name is used for both datasets so their columns stay aligned.

    Parameters
    ----------
    col_name : str
        Name of an existing rating column present in both frames.

    The indicator is True where the rating is present and False where it is
    missing (NaN).
    '''
    # Build the name once so train and test cannot end up with different labels.
    has_col = 'Has ' + col_name.replace(' Rating', '')
    df_train[has_col] = df_train[col_name].notna()
    df_test[has_col] = df_test[col_name].notna()

In [109]:
# Rating columns for which an availability indicator column is created.
services_cols = [
    'Live Music Rating',
    'Comedy Gigs Rating',
    'Value Deals Rating',
    'Live Sports Rating',
]

# Add one indicator column per listed rating column.
for col in services_cols:
    create_has_service(col)

In [112]:
# Drop Overall Restaurant Rating Missing values
# Rows whose 'Overall Restaurant Rating' is NaN are removed in place from both frames.
# NOTE(review): this also removes rows from df_test; if a prediction is required for
# every test restaurant, confirm whether these rows should be imputed instead of dropped.
df_train.dropna(subset=['Overall Restaurant Rating'], inplace=True)
df_test.dropna(subset=['Overall Restaurant Rating'], inplace=True)

### Basic Statistics (describe)

## 2. Univariate Analysis
- Distribution of Numerical Features: Use histograms, box plots, and density plots.
- Distribution of Categorical Features: Use bar plots and count plots.

### Distribution of Numerical Features

### Distribution of Categorical Features

## 3. Bivariate Analysis
- Numerical vs Numerical: Use scatter plots and pair plots.
- Numerical vs Categorical: Use box plots and violin plots.
- Categorical vs Categorical: Use cross-tabulations and heatmaps.

## 4. Multivariate Analysis
- Pair Plots: Visualize relationships between multiple numerical features.
- Heatmaps: Visualize correlations between multiple features.

## 5. Outlier Detection
- Box Plots: Identify outliers in numerical features.
- Z-Score or IQR: Use statistical methods to detect outliers.

## 6. Feature Engineering
- Create New Features: Based on domain knowledge or interactions between existing features.
- Transform Features: Apply log transformation, scaling, or encoding.

## 7. Target Variable Analysis
- Distribution: Examine the distribution of the target variable.
- Relationships: Analyze the relationship between the target variable and other features.

Metric to measure

The measure of accuracy will be RMSE (Root mean square error)

The predicted Annual Turnover for each restaurant in the Test dataset will be compared with the actual Annual Turnover to calculate the RMSE value of the entire prediction. The lower the RMSE value, the better the model will be.