# Zomata Price Predictor

In [300]:
import pandas as pd

In [301]:
data = pd.read_csv('zomato.csv', encoding='latin-1')

In [302]:
data.head(3)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270


In [303]:
data.shape

(9551, 21)

# 1. Analysis on Name and Id cols.
#### We are sure that name, and id won't play role to determine price

In [304]:
data.drop(columns=['Restaurant ID', 'Restaurant Name'], inplace=True)

In [305]:
data.columns

Index(['Country Code', 'City', 'Address', 'Locality', 'Locality Verbose',
       'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency',
       'Has Table booking', 'Has Online delivery', 'Is delivering now',
       'Switch to order menu', 'Price range', 'Aggregate rating',
       'Rating color', 'Rating text', 'Votes'],
      dtype='object')

## 2. Analysis on Country Code and Currency cols
#### let's perform some EDA to see which colum could else be dropped.

In [306]:
data['Country Code'].nunique()

15

In [307]:
data['Country Code'].value_counts()


Country Code
1      8652
216     434
215      80
30       60
214      60
189      60
148      40
208      34
14       24
162      22
94       21
184      20
166      20
191      20
37        4
Name: count, dtype: int64

#### Since, almost all country (8652 out of 9551) is same . We'll only select rows with that country only

In [308]:
mask = data['Country Code'] == 1
data = data[mask]

In [309]:
data.shape

(8652, 19)

#### Now, we don't need cols : Country code and currency as well.

In [310]:
data.drop(columns=['Country Code', 'Currency'], inplace=True)

In [311]:
data.columns

Index(['City', 'Address', 'Locality', 'Locality Verbose', 'Longitude',
       'Latitude', 'Cuisines', 'Average Cost for two', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

## 3. Analysis on City column
#### let's see distribution along City columns

In [312]:
print(data['City'].nunique())
print(data['City'].value_counts())

43
City
New Delhi       5473
Gurgaon         1118
Noida           1080
Faridabad        251
Ghaziabad         25
Ahmedabad         21
Guwahati          21
Lucknow           21
Bhubaneshwar      21
Amritsar          21
Pune              20
Puducherry        20
Patna             20
Ludhiana          20
Ranchi            20
Surat             20
Vadodara          20
Nashik            20
Nagpur            20
Mysore            20
Mumbai            20
Varanasi          20
Mangalore         20
Agra              20
Kochi             20
Kolkata           20
Dehradun          20
Allahabad         20
Aurangabad        20
Bangalore         20
Bhopal            20
Chennai           20
Coimbatore        20
Goa               20
Indore            20
Jaipur            20
Kanpur            20
Vizag             20
Chandigarh        18
Hyderabad         18
Secunderabad       2
Panchkula          1
Mohali             1
Name: count, dtype: int64


#### Let's keep first 3 cities only

In [313]:
mask = data['City'].isin(['New Delhi', 'Gurgaon', 'Noida'])
data = data[mask]
data.shape

(7671, 17)

In [314]:
data.columns

Index(['City', 'Address', 'Locality', 'Locality Verbose', 'Longitude',
       'Latitude', 'Cuisines', 'Average Cost for two', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

## 4. Analysis on cols. where one replace others
***1. longitude,lattitude can replace address, locality <br>
   2. Agregare rating can replace rating color & rating text [IF Nlp , sentiment analysis on rating text]***

   ```inplace = True if not assigned to variable to make permanent change```

In [315]:
data = data.drop(columns=['Address', 'Locality','Locality Verbose','Rating color','Rating text']) 

In [316]:
data.columns

Index(['City', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two',
       'Has Table booking', 'Has Online delivery', 'Is delivering now',
       'Switch to order menu', 'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')

In [317]:
data['Switch to order menu'].value_counts()

Switch to order menu
No    7671
Name: count, dtype: int64

#### Since, all directs to single value --> can be removed !!

In [318]:
data.drop(columns=['Switch to order menu'], inplace=True)

In [319]:
 data.columns

Index(['City', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two',
       'Has Table booking', 'Has Online delivery', 'Is delivering now',
       'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')

## 5. Analysis on Cuisines column

In [320]:
data['Cuisines'].head()

1161                  Cafe, Beverages
1162            North Indian, Mughlai
1163            North Indian, Mughlai
1164    South Indian, Seafood, Kerala
1165                           Bakery
Name: Cuisines, dtype: object

In [321]:
print(data['Cuisines'].nunique())
print(data['Cuisines'].value_counts())

1143
Cuisines
North Indian                                      873
North Indian, Chinese                             447
Fast Food                                         320
Chinese                                           314
North Indian, Mughlai                             303
                                                 ... 
Healthy Food, Fast Food                             1
North Indian, Rajasthani, Gujarati                  1
North Indian, Chinese, Continental, Seafood         1
Mediterranean, Continental, Italian                 1
Chinese, North Indian, South Indian, Fast Food      1
Name: count, Length: 1143, dtype: int64


***Can neither be Removed -->look at the distribution <br> Nor keep it--> OHE forms many columns***

##### Remaining after 8th analysis

### 6. Analysis on Has table booking

In [322]:
data['Has Table booking'].value_counts()

Has Table booking
No     6640
Yes    1031
Name: count, dtype: int64

In [323]:
data['Has Table booking'].replace('Yes', 1, inplace=True)
data['Has Table booking'].replace('No', 0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Has Table booking'].replace('Yes', 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Has Table booking'].replace('No', 0, inplace=True)
  data['Has Table booking'].replace('No', 0, inplace=True)


In [324]:
data['Has Table booking'].value_counts()


Has Table booking
0    6640
1    1031
Name: count, dtype: int64

### 7. Analysis on Has Online Delivery

In [325]:
data['Has Online delivery'].value_counts()

Has Online delivery
No     5393
Yes    2278
Name: count, dtype: int64

In [326]:
data['Has Online delivery'].replace('Yes', 1, inplace=True)
data['Has Online delivery'].replace('No', 0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Has Online delivery'].replace('Yes', 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Has Online delivery'].replace('No', 0, inplace=True)
  data['Has Online delivery'].replace('No', 0, inplace=True)


In [327]:
data['Has Online delivery'].value_counts()

Has Online delivery
0    5393
1    2278
Name: count, dtype: int64

In [328]:
data.columns

Index(['City', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two',
       'Has Table booking', 'Has Online delivery', 'Is delivering now',
       'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')

## 8. Analysis regarding relations of ```Average Cost for two``` with other columns
  ***Is Average Cost for two --> Output cols ?*** **YES** <BR>
  Better if done little before

In [329]:
data.select_dtypes(include=['number']).corr()

Unnamed: 0,Longitude,Latitude,Average Cost for two,Has Table booking,Has Online delivery,Price range,Aggregate rating,Votes
Longitude,1.0,0.997721,0.071854,0.059863,0.13823,0.082031,0.221801,0.071482
Latitude,0.997721,1.0,0.070816,0.060228,0.137746,0.081059,0.221399,0.071196
Average Cost for two,0.071854,0.070816,1.0,0.643845,0.072198,0.848425,0.329785,0.297741
Has Table booking,0.059863,0.060228,0.643845,1.0,0.072631,0.661477,0.251972,0.224377
Has Online delivery,0.13823,0.137746,0.072198,0.072631,1.0,0.184855,0.339658,0.121491
Price range,0.082031,0.081059,0.848425,0.661477,0.184855,1.0,0.372624,0.325489
Aggregate rating,0.221801,0.221399,0.329785,0.251972,0.339658,0.372624,1.0,0.2915
Votes,0.071482,0.071196,0.297741,0.224377,0.121491,0.325489,0.2915,1.0


In [330]:
data.select_dtypes(include=['number']).corr()['Average Cost for two']

Longitude               0.071854
Latitude                0.070816
Average Cost for two    1.000000
Has Table booking       0.643845
Has Online delivery     0.072198
Price range             0.848425
Aggregate rating        0.329785
Votes                   0.297741
Name: Average Cost for two, dtype: float64

**Remove Longitutde, latitude and has online delivery --> Coz' no role in output**

In [331]:
data = data.drop(columns=['Longitude', 'Latitude','Has Online delivery', 'Is delivering now', 'Price range']) 

### Remaining Analysis on 'Cuisines' columns

In [332]:
cuisine = data.groupby('Cuisines')['Average Cost for two'].mean().reset_index()

In [333]:
cuisine

Unnamed: 0,Cuisines,Average Cost for two
0,Afghani,512.500000
1,"Afghani, Mughlai, Chinese",500.000000
2,"Afghani, North Indian",900.000000
3,"Afghani, North Indian, Pakistani, Arabian",500.000000
4,American,666.666667
...,...,...
1138,"Tibetan, Street Food",100.000000
1139,Turkish,600.000000
1140,"Turkish, Arabian, Moroccan, Lebanese",400.000000
1141,"Turkish, Mediterranean, Middle Eastern",2000.000000


In [334]:
data = data.merge(cuisine, on='Cuisines')

In [335]:
data.drop(columns=['Cuisines'], inplace=True)

In [336]:
data

Unnamed: 0,City,Average Cost for two_x,Has Table booking,Aggregate rating,Votes,Average Cost for two_y
0,Gurgaon,350,0,3.4,16,416.666667
1,Gurgaon,800,1,2.7,80,755.511551
2,Gurgaon,2000,1,4.3,1887,755.511551
3,Gurgaon,1400,1,4.0,802,1400.000000
4,Gurgaon,250,0,3.0,4,337.000000
...,...,...,...,...,...,...
7666,Noida,700,0,2.6,34,599.888143
7667,Noida,400,0,0.0,1,279.843750
7668,Noida,600,0,0.0,3,279.843750
7669,Noida,500,0,0.0,0,599.888143


In [337]:
data = data.rename(columns={'Average Cost for two_x': 'Cost', 'Average Cost for two_y':'Cuisine price'})

# Our data is ready for model train.......

## Shuffle col, so that can be split into X and y

In [338]:
data = data[['City', 'Has Table booking', 'Aggregate rating', 'Votes', 'Cuisine price','Cost']]

In [339]:
data.head()

Unnamed: 0,City,Has Table booking,Aggregate rating,Votes,Cuisine price,Cost
0,Gurgaon,0,3.4,16,416.666667,350
1,Gurgaon,1,2.7,80,755.511551,800
2,Gurgaon,1,4.3,1887,755.511551,2000
3,Gurgaon,1,4.0,802,1400.0,1400
4,Gurgaon,0,3.0,4,337.0,250


In [340]:
X = data.iloc[:, :-1]  
y = data.iloc[:, [-1]]

In [341]:
y

Unnamed: 0,Cost
0,350
1,800
2,2000
3,1400
4,250
...,...
7666,700
7667,400
7668,600
7669,500


In [342]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['New Delhi', 'Gurgaon', 'Noida']])

In [343]:
X.iloc[:,  [0]] = oe.fit_transform(X.iloc[:, [0]]) # iloc not mentioned for numpy arrray

In [344]:
y

Unnamed: 0,Cost
0,350
1,800
2,2000
3,1400
4,250
...,...
7666,700
7667,400
7668,600
7669,500


In [345]:
X

Unnamed: 0,City,Has Table booking,Aggregate rating,Votes,Cuisine price
0,1.0,0,3.4,16,416.666667
1,1.0,1,2.7,80,755.511551
2,1.0,1,4.3,1887,755.511551
3,1.0,1,4.0,802,1400.000000
4,1.0,0,3.0,4,337.000000
...,...,...,...,...,...
7666,2.0,0,2.6,34,599.888143
7667,2.0,0,0.0,1,279.843750
7668,2.0,0,0.0,3,279.843750
7669,2.0,0,0.0,0,599.888143


In [346]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [347]:
X_train.shape

(6136, 5)

In [348]:
X_test.shape

(1535, 5)

In [349]:
from sklearn.linear_model import LinearRegression
model0 = LinearRegression()
model0.fit(X_train, y_train)
y_pred=lr.predict(X_test)

In [350]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.720085708766282

# Let's try RandomForest

In [351]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=200, max_depth=9)

In [352]:
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

  return fit_method(estimator, *args, **kwargs)


In [353]:
r2_score(y_test, y_pred)

0.718983947069612

# Let's try DecisionTreeRegressor

In [354]:
from sklearn.tree import DecisionTreeRegressor
model2 = DecisionTreeRegressor()

In [355]:
model2.fit(X_train, y_train)
y_pred=model2.predict(X_test)

In [356]:
r2_score(y_test, y_pred)

0.48697522359894996

# let's try SVM

In [358]:
from sklearn.svm import SVR
model3 = SVR()
model3.fit(X_train, y_train)
y_pred=model3.predict(X_test)
r2_score(y_test, y_pred)

  y = column_or_1d(y, warn=True)


0.29304825243975463

# Conclusion
  ``` Linear Regression ``` worked best for us

In [None]:
pip freeze > requirements.txt
