## zomato cost price prediction

### Importing Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import OneHotEncoder,StandardScaler


from joblib import dump, load


### Read Data

In [11]:
df=pd.read_csv('data/zomato.csv')

In [12]:
df.sample(5)   # Random Data will be generated each time 

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
122,2022-01-18,134.5,134.600006,132.800003,133.050003,133.050003,9428096
497,2023-07-25,81.300003,83.199997,79.300003,82.650002,82.650002,72459586
19,2021-08-20,134.949997,141.449997,133.0,139.300003,139.300003,53789580
316,2022-11-01,63.5,63.75,62.700001,63.5,63.5,50425928
429,2023-04-18,54.049999,54.200001,53.049999,53.299999,53.299999,27617603


In [13]:
df.describe(include='all')

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,631,631.0,631.0,631.0,631.0,631.0,631.0
unique,631,,,,,,
top,2021-07-23,,,,,,
freq,1,,,,,,
mean,,90.011252,91.835737,87.868304,89.707686,89.707686,67317370.0
std,,32.757639,33.319545,31.900849,32.6219,32.6219,74610300.0
min,,40.849998,44.400002,40.599998,41.650002,41.650002,0.0
25%,,62.549999,63.450001,61.125,62.074998,62.074998,28007880.0
50%,,80.0,81.0,78.099998,79.699997,79.699997,47597100.0
75%,,124.474998,126.75,121.525002,124.599998,124.599998,75254390.0


In [14]:
df.shape

(631, 7)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       631 non-null    object 
 1   Open       631 non-null    float64
 2   High       631 non-null    float64
 3   Low        631 non-null    float64
 4   Close      631 non-null    float64
 5   Adj Close  631 non-null    float64
 6   Volume     631 non-null    int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 34.6+ KB


#### checking the duplicated values if there

In [16]:
df.duplicated().value_counts()   # No duplicate value seen

False    631
Name: count, dtype: int64

### Splitting the Data

In [17]:
X=df.drop(['Close'],axis=1)
y=df['Close']

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=123)

X_train.to_csv("data/X_train.csv")
y_train.to_csv("data/y_train.csv")
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

In [19]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(441, 6) (190, 6) (441,) (190,)


## X Train

In [20]:
X_train

Unnamed: 0,Date,Open,High,Low,Adj Close,Volume
295,2022-09-29,59.200001,62.000000,58.900002,61.200001,76969699
536,2023-09-20,101.300003,104.900002,99.300003,100.000000,104115507
406,2023-03-10,53.799999,54.500000,53.299999,53.849998,39226931
463,2023-06-06,71.550003,73.150002,71.349998,72.349998,64762959
308,2022-10-19,63.400002,63.700001,61.500000,62.049999,52638798
...,...,...,...,...,...,...
98,2021-12-15,140.500000,142.000000,140.000000,140.600006,12791660
322,2022-11-10,65.500000,65.949997,63.150002,63.950001,44332611
382,2023-02-03,48.150002,49.250000,47.700001,48.750000,42497841
365,2023-01-10,56.000000,56.150002,53.799999,54.700001,45966335


## Y TRAIN

In [21]:
y_train

295     61.200001
536    100.000000
406     53.849998
463     72.349998
308     62.049999
          ...    
98     140.600006
322     63.950001
382     48.750000
365     54.700001
510     93.449997
Name: Close, Length: 441, dtype: float64

### Separating Numerical and Categorical columns 

In [22]:
numerical_cols=X_train.select_dtypes(exclude='object')

In [23]:
numerical_cols

Unnamed: 0,Open,High,Low,Adj Close,Volume
295,59.200001,62.000000,58.900002,61.200001,76969699
536,101.300003,104.900002,99.300003,100.000000,104115507
406,53.799999,54.500000,53.299999,53.849998,39226931
463,71.550003,73.150002,71.349998,72.349998,64762959
308,63.400002,63.700001,61.500000,62.049999,52638798
...,...,...,...,...,...
98,140.500000,142.000000,140.000000,140.600006,12791660
322,65.500000,65.949997,63.150002,63.950001,44332611
382,48.150002,49.250000,47.700001,48.750000,42497841
365,56.000000,56.150002,53.799999,54.700001,45966335


In [24]:
categorical_cols=X_train.select_dtypes(include='object')

In [25]:
categorical_cols

Unnamed: 0,Date
295,2022-09-29
536,2023-09-20
406,2023-03-10
463,2023-06-06
308,2022-10-19
...,...
98,2021-12-15
322,2022-11-10
382,2023-02-03
365,2023-01-10


### Encoding categorical data

In [49]:
oh=OneHotEncoder()

In [50]:
encoder=oh.fit(categorical_cols)

#getting coloumns name for dataframe
col=encoder.get_feature_names_out()

In [51]:
#saving encoder
dump(encoder,'data/models/encoder/one_hot_encoder.pkl')

['data/models/encoder/one_hot_encoder.pkl']

In [54]:
#Loading encoder
#encoder_model=load('data/models/encoder/one_hot_encoder.pkl')
transform=encoder_model.transform(categorical_cols).toarray()

In [93]:
categorical_encode_data=pd.DataFrame(transform,columns=col)
categorical_encode_data

Unnamed: 0,Date_2021-07-26,Date_2021-07-27,Date_2021-07-28,Date_2021-07-29,Date_2021-08-02,Date_2021-08-03,Date_2021-08-04,Date_2021-08-06,Date_2021-08-10,Date_2021-08-12,...,Date_2024-01-17,Date_2024-01-19,Date_2024-01-22,Date_2024-01-24,Date_2024-01-25,Date_2024-01-29,Date_2024-01-30,Date_2024-01-31,Date_2024-02-02,Date_2024-02-06
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Scaling Numerical Data

In [58]:
scaler=StandardScaler()

In [59]:
numerical_scaler=scaler.fit(numerical_cols)
numerical_scaler

In [None]:
StandardScaler()

In [60]:
dump(numerical_scaler,'data/models/scaling/standard_scaler.pkl')

['data/models/scaling/standard_scaler.pkl']

In [61]:
model_scaling=load('data/models/scaling/standard_scaler.pkl')

In [66]:
scaled_data=model_scaling.transform(numerical_cols)

In [69]:
numerical_scaled_data=pd.DataFrame(scaled_data,columns=numerical_cols.columns)
numerical_scaled_data

Unnamed: 0,Open,High,Low,Adj Close,Volume
0,-0.973256,-0.924806,-0.938879,-0.902241,0.186271
1,0.314117,0.363013,0.327680,0.284371,0.588096
2,-1.138383,-1.149949,-1.114441,-1.127025,-0.372414
3,-0.595606,-0.590093,-0.548566,-0.561243,0.005582
4,-0.844825,-0.873773,-0.857368,-0.876246,-0.173885
...,...,...,...,...,...
436,1.512812,1.476721,1.603644,1.526033,-0.763720
437,-0.780609,-0.806230,-0.805639,-0.818138,-0.296837
438,-1.311154,-1.307549,-1.290004,-1.282997,-0.323996
439,-1.071109,-1.100417,-1.098766,-1.101029,-0.272654


### Concat Numerical column and Categorical column

In [73]:
Features=pd.concat([numerical_scaled_data,categorical_encode_data],axis=1)

In [74]:
Features

Unnamed: 0,Open,High,Low,Adj Close,Volume,Open.1,High.1,Low.1,Adj Close.1,Volume.1
0,-0.973256,-0.924806,-0.938879,-0.902241,0.186271,-0.973256,-0.924806,-0.938879,-0.902241,0.186271
1,0.314117,0.363013,0.327680,0.284371,0.588096,0.314117,0.363013,0.327680,0.284371,0.588096
2,-1.138383,-1.149949,-1.114441,-1.127025,-0.372414,-1.138383,-1.149949,-1.114441,-1.127025,-0.372414
3,-0.595606,-0.590093,-0.548566,-0.561243,0.005582,-0.595606,-0.590093,-0.548566,-0.561243,0.005582
4,-0.844825,-0.873773,-0.857368,-0.876246,-0.173885,-0.844825,-0.873773,-0.857368,-0.876246,-0.173885
...,...,...,...,...,...,...,...,...,...,...
436,1.512812,1.476721,1.603644,1.526033,-0.763720,1.512812,1.476721,1.603644,1.526033,-0.763720
437,-0.780609,-0.806230,-0.805639,-0.818138,-0.296837,-0.780609,-0.806230,-0.805639,-0.818138,-0.296837
438,-1.311154,-1.307549,-1.290004,-1.282997,-0.323996,-1.311154,-1.307549,-1.290004,-1.282997,-0.323996
439,-1.071109,-1.100417,-1.098766,-1.101029,-0.272654,-1.071109,-1.100417,-1.098766,-1.101029,-0.272654


## linear regression

In [76]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [88]:
#linear regression
model=LinearRegression()
model.fit(Features,y_train)

In [80]:
dump(model,'data/models/lr.pkl')

['data/models/lr.pkl']

In [91]:
lr=load('data/models/lr.pkl')

In [92]:
print('train score: ',model.score(Features,y_train)*100)

train score:  100.0


In [82]:
model_DT=DecisionTreeRegressor()
model_DT.fit(Features,y_train)
dump(model_DT,'data/models/DT.pkl')
print('train score: ',model_DT.score(Features,y_train)*100)

train score:  100.0


In [83]:
model_RF=RandomForestRegressor()
model_RF.fit(Features,y_train)
dump(model_RF,'data/models/RF.pkl')
print('train score: ',model_RF.score(Features,y_train)*100)

train score:  99.99547299435136
