In [1]:
# importing libraries.
import pandas as pd
import numpy as np 

## Loading Datasets
test = pd.read_csv('/home/aakash/Genpact/test_main.csv')
train = pd.read_csv('/home/aakash/Genpact/train/train.csv') ## This is the demand data for raw materials.
train_meal = pd.read_csv('/home/aakash/Genpact/train/meal_info.csv')
train_centers = pd.read_csv('/home/aakash/Genpact/train/fulfilment_center_info.csv')

In [2]:
## EDA
print('train dataset shape',train.shape) # So we have a huge dataset
print('train dataset columns', train.columns)     

train dataset shape (456548, 9)
train dataset columns Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders'],
      dtype='object')


In [3]:
test_id = test.id
print(test.shape)
print(test.columns) ## So num_orders is our target variable.

(32573, 8)
Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured'],
      dtype='object')


In [4]:
train.dtypes ## so clearly our target varibale is Numerical
## So this is a regression analysis problem.

id                         int64
week                       int64
center_id                  int64
meal_id                    int64
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
dtype: object

In [5]:
train.head()
## A himepage featured meal is likely to have higher num_order.

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [6]:
## Now what I want to do is convert all the 3 train data's into a single dataset.
# Note that center ID and meal ID are going to be the most imprtant features for us.
# Lets check for the null values. if any. So no null values.
print(len(train.meal_id.unique()))  
# So we have 77 unique meal centres and 51 unique meals.
print(train_meal.category.value_counts()) # So most meals actually coreespont to beverages.
print("  ")
print(train_meal.cuisine.value_counts()) # and most people prefer Thai cuisine.

51
Beverages       12
Pizza            3
Fish             3
Rice Bowl        3
Pasta            3
Extras           3
Seafood          3
Other Snacks     3
Biryani          3
Desert           3
Starters         3
Sandwich         3
Salad            3
Soup             3
Name: category, dtype: int64
  
Thai           15
Indian         12
Italian        12
Continental    12
Name: cuisine, dtype: int64


In [7]:
print(train_centers.shape)
train_centers.city_code.value_counts() ## So most meal centres are in city with city ID 590
train_centers.region_code.value_counts() # and most meal centres have a region code 30.
train_centers.center_type.value_counts() # So Type_A might be some centres of high importance.
train_centers.head()

(77, 5)


Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [8]:
## Now lwts compute the mean number of orders..
x = np.mean(train.num_orders)
print(x)

# lets make a baseline submission on test data using this mean.
pd.DataFrame({'id': test.id, 'num_orders' : x}).to_csv('genpact.csv', index = False)

261.8727603669275


In [9]:
# Now lets create new datasets. 
## Creating columns from centre data.
### For Train data ####
for col in train_centers.columns[1:]:
    train[col] = train.center_id.replace(train_centers.set_index('center_id')[col])
    test[col] = test.center_id.replace(train_centers.set_index('center_id')[col])

for col in train_meal.columns[1:]:
    train[col] = train.meal_id.replace(train_meal.set_index('meal_id')[col])
    test[col] = test.meal_id.replace(train_meal.set_index('meal_id')[col])    
    
train['center_type'] = pd.get_dummies(train['center_type'])
test['center_type'] = pd.get_dummies(test['center_type'])

train['cuisine'] = pd.get_dummies(train['cuisine'])
test['cuisine'] = pd.get_dummies(test['cuisine'])

train['category'] = pd.get_dummies(train['category'])
test['category'] = pd.get_dummies(test['category'])

In [10]:
from scipy.stats import skew
print(skew(train.num_orders))

train.num_orders = np.log1p(train.num_orders)
print(skew(train.num_orders))

6.929943296742635
-0.021917552017371056


In [11]:
train_y = train['num_orders']
train_x = train.drop('num_orders', axis = 1)
train_x = train_x.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [17]:
### Modelling ###
## Adding an Xgboost model
import matplotlib.pyplot as plt
% matplotlib inline

## Splitting the training data into train and validation sets.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val  = train_test_split(train_x, train_y, test_size = 0.25, random_state = 1)

import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_val, y_val, reference = lgb_train)

params = {'learning_rate' : 0.05, 
          'boosting_type' : 'gbdt', 
          "metric" : 'l2_root', 
          "sub_feature" : 0.5, 
          'num_leaves' : 100,
          'bagging_freq': 1,
          'bagging_fraction': 0.8}

clf = lgb.train(params, lgb_train, num_boost_round = 1000, valid_sets = lgb_eval, early_stopping_rounds = 100)

[1]	valid_0's rmse: 1.19411
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.18467
[3]	valid_0's rmse: 1.15733
[4]	valid_0's rmse: 1.12252
[5]	valid_0's rmse: 1.08955
[6]	valid_0's rmse: 1.06927
[7]	valid_0's rmse: 1.0477
[8]	valid_0's rmse: 1.01935
[9]	valid_0's rmse: 0.99294
[10]	valid_0's rmse: 0.987013
[11]	valid_0's rmse: 0.962589
[12]	valid_0's rmse: 0.939918
[13]	valid_0's rmse: 0.926107
[14]	valid_0's rmse: 0.9121
[15]	valid_0's rmse: 0.900532
[16]	valid_0's rmse: 0.881563
[17]	valid_0's rmse: 0.871376
[18]	valid_0's rmse: 0.853553
[19]	valid_0's rmse: 0.844149
[20]	valid_0's rmse: 0.835005
[21]	valid_0's rmse: 0.819329
[22]	valid_0's rmse: 0.81009
[23]	valid_0's rmse: 0.796504
[24]	valid_0's rmse: 0.784026
[25]	valid_0's rmse: 0.777165
[26]	valid_0's rmse: 0.770794
[27]	valid_0's rmse: 0.765006
[28]	valid_0's rmse: 0.753971
[29]	valid_0's rmse: 0.743715
[30]	valid_0's rmse: 0.737856
[31]	valid_0's rmse: 0.732532
[32]	valid_0's rmse: 0.727691

[272]	valid_0's rmse: 0.50283
[273]	valid_0's rmse: 0.502751
[274]	valid_0's rmse: 0.502656
[275]	valid_0's rmse: 0.502508
[276]	valid_0's rmse: 0.502308
[277]	valid_0's rmse: 0.502063
[278]	valid_0's rmse: 0.501973
[279]	valid_0's rmse: 0.501891
[280]	valid_0's rmse: 0.501562
[281]	valid_0's rmse: 0.501486
[282]	valid_0's rmse: 0.501307
[283]	valid_0's rmse: 0.501202
[284]	valid_0's rmse: 0.500961
[285]	valid_0's rmse: 0.500831
[286]	valid_0's rmse: 0.500587
[287]	valid_0's rmse: 0.500377
[288]	valid_0's rmse: 0.50018
[289]	valid_0's rmse: 0.500034
[290]	valid_0's rmse: 0.499825
[291]	valid_0's rmse: 0.499691
[292]	valid_0's rmse: 0.499545
[293]	valid_0's rmse: 0.499366
[294]	valid_0's rmse: 0.499254
[295]	valid_0's rmse: 0.49918
[296]	valid_0's rmse: 0.499133
[297]	valid_0's rmse: 0.499009
[298]	valid_0's rmse: 0.498885
[299]	valid_0's rmse: 0.498796
[300]	valid_0's rmse: 0.498712
[301]	valid_0's rmse: 0.498663
[302]	valid_0's rmse: 0.498532
[303]	valid_0's rmse: 0.498416
[304]	valid

[541]	valid_0's rmse: 0.478519
[542]	valid_0's rmse: 0.478487
[543]	valid_0's rmse: 0.478399
[544]	valid_0's rmse: 0.478357
[545]	valid_0's rmse: 0.478324
[546]	valid_0's rmse: 0.478232
[547]	valid_0's rmse: 0.478196
[548]	valid_0's rmse: 0.478158
[549]	valid_0's rmse: 0.478109
[550]	valid_0's rmse: 0.477984
[551]	valid_0's rmse: 0.477893
[552]	valid_0's rmse: 0.477832
[553]	valid_0's rmse: 0.477756
[554]	valid_0's rmse: 0.477713
[555]	valid_0's rmse: 0.477584
[556]	valid_0's rmse: 0.477515
[557]	valid_0's rmse: 0.477465
[558]	valid_0's rmse: 0.47742
[559]	valid_0's rmse: 0.477395
[560]	valid_0's rmse: 0.477372
[561]	valid_0's rmse: 0.47735
[562]	valid_0's rmse: 0.477329
[563]	valid_0's rmse: 0.477311
[564]	valid_0's rmse: 0.477252
[565]	valid_0's rmse: 0.4772
[566]	valid_0's rmse: 0.477104
[567]	valid_0's rmse: 0.477032
[568]	valid_0's rmse: 0.47699
[569]	valid_0's rmse: 0.476891
[570]	valid_0's rmse: 0.476838
[571]	valid_0's rmse: 0.476776
[572]	valid_0's rmse: 0.476732
[573]	valid_0

[813]	valid_0's rmse: 0.46676
[814]	valid_0's rmse: 0.466692
[815]	valid_0's rmse: 0.466663
[816]	valid_0's rmse: 0.466609
[817]	valid_0's rmse: 0.466598
[818]	valid_0's rmse: 0.466565
[819]	valid_0's rmse: 0.466538
[820]	valid_0's rmse: 0.466529
[821]	valid_0's rmse: 0.466509
[822]	valid_0's rmse: 0.466483
[823]	valid_0's rmse: 0.466464
[824]	valid_0's rmse: 0.466442
[825]	valid_0's rmse: 0.466411
[826]	valid_0's rmse: 0.466349
[827]	valid_0's rmse: 0.466299
[828]	valid_0's rmse: 0.466279
[829]	valid_0's rmse: 0.466256
[830]	valid_0's rmse: 0.466231
[831]	valid_0's rmse: 0.466193
[832]	valid_0's rmse: 0.466176
[833]	valid_0's rmse: 0.466138
[834]	valid_0's rmse: 0.466131
[835]	valid_0's rmse: 0.466091
[836]	valid_0's rmse: 0.466077
[837]	valid_0's rmse: 0.466037
[838]	valid_0's rmse: 0.466003
[839]	valid_0's rmse: 0.465948
[840]	valid_0's rmse: 0.465881
[841]	valid_0's rmse: 0.465796
[842]	valid_0's rmse: 0.465777
[843]	valid_0's rmse: 0.4657
[844]	valid_0's rmse: 0.465692
[845]	valid

In [18]:
from sklearn.metrics import mean_squared_log_error

y_pred_val = np.expm1(clf.predict(x_val))
true_val = np.expm1(y_val)

mean_squared_log_error(y_pred_val, true_val)

0.21329169089278446

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_train_r = scaler.transform(x_train)
x_val_r = scaler.transform(x_val)
x_test_r = scaler.transform(test)

In [21]:
lgb_train = lgb.Dataset(x_train_r, y_train)
lgb_eval = lgb.Dataset(x_val_r, y_val, reference = lgb_train)
clf = lgb.train(params, lgb_train, num_boost_round = 1000, valid_sets = lgb_eval, early_stopping_rounds = 100)

[1]	valid_0's rmse: 1.19411
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.18467
[3]	valid_0's rmse: 1.15735
[4]	valid_0's rmse: 1.1225
[5]	valid_0's rmse: 1.08954
[6]	valid_0's rmse: 1.0693
[7]	valid_0's rmse: 1.04754
[8]	valid_0's rmse: 1.01919
[9]	valid_0's rmse: 0.992961
[10]	valid_0's rmse: 0.987025
[11]	valid_0's rmse: 0.962581
[12]	valid_0's rmse: 0.93972
[13]	valid_0's rmse: 0.925983
[14]	valid_0's rmse: 0.911898
[15]	valid_0's rmse: 0.900321
[16]	valid_0's rmse: 0.881271
[17]	valid_0's rmse: 0.871218
[18]	valid_0's rmse: 0.85326
[19]	valid_0's rmse: 0.844011
[20]	valid_0's rmse: 0.83481
[21]	valid_0's rmse: 0.819159
[22]	valid_0's rmse: 0.809854
[23]	valid_0's rmse: 0.79629
[24]	valid_0's rmse: 0.78395
[25]	valid_0's rmse: 0.777113
[26]	valid_0's rmse: 0.770665
[27]	valid_0's rmse: 0.764879
[28]	valid_0's rmse: 0.753914
[29]	valid_0's rmse: 0.743624
[30]	valid_0's rmse: 0.737808
[31]	valid_0's rmse: 0.732269
[32]	valid_0's rmse: 0.727453
[

[272]	valid_0's rmse: 0.503483
[273]	valid_0's rmse: 0.503381
[274]	valid_0's rmse: 0.503249
[275]	valid_0's rmse: 0.503093
[276]	valid_0's rmse: 0.502887
[277]	valid_0's rmse: 0.502683
[278]	valid_0's rmse: 0.502605
[279]	valid_0's rmse: 0.502522
[280]	valid_0's rmse: 0.502245
[281]	valid_0's rmse: 0.502191
[282]	valid_0's rmse: 0.502022
[283]	valid_0's rmse: 0.501926
[284]	valid_0's rmse: 0.501678
[285]	valid_0's rmse: 0.501563
[286]	valid_0's rmse: 0.501401
[287]	valid_0's rmse: 0.501198
[288]	valid_0's rmse: 0.501004
[289]	valid_0's rmse: 0.500863
[290]	valid_0's rmse: 0.500681
[291]	valid_0's rmse: 0.500499
[292]	valid_0's rmse: 0.500339
[293]	valid_0's rmse: 0.500141
[294]	valid_0's rmse: 0.500027
[295]	valid_0's rmse: 0.499929
[296]	valid_0's rmse: 0.499887
[297]	valid_0's rmse: 0.499739
[298]	valid_0's rmse: 0.499652
[299]	valid_0's rmse: 0.499563
[300]	valid_0's rmse: 0.499472
[301]	valid_0's rmse: 0.499404
[302]	valid_0's rmse: 0.499276
[303]	valid_0's rmse: 0.499212
[304]	va

[548]	valid_0's rmse: 0.478108
[549]	valid_0's rmse: 0.478075
[550]	valid_0's rmse: 0.477995
[551]	valid_0's rmse: 0.477915
[552]	valid_0's rmse: 0.477845
[553]	valid_0's rmse: 0.477765
[554]	valid_0's rmse: 0.477709
[555]	valid_0's rmse: 0.47765
[556]	valid_0's rmse: 0.477606
[557]	valid_0's rmse: 0.477556
[558]	valid_0's rmse: 0.477516
[559]	valid_0's rmse: 0.47747
[560]	valid_0's rmse: 0.477446
[561]	valid_0's rmse: 0.477437
[562]	valid_0's rmse: 0.477419
[563]	valid_0's rmse: 0.477407
[564]	valid_0's rmse: 0.477347
[565]	valid_0's rmse: 0.477303
[566]	valid_0's rmse: 0.47719
[567]	valid_0's rmse: 0.477143
[568]	valid_0's rmse: 0.477112
[569]	valid_0's rmse: 0.477033
[570]	valid_0's rmse: 0.476991
[571]	valid_0's rmse: 0.476923
[572]	valid_0's rmse: 0.476861
[573]	valid_0's rmse: 0.476814
[574]	valid_0's rmse: 0.476794
[575]	valid_0's rmse: 0.476705
[576]	valid_0's rmse: 0.47668
[577]	valid_0's rmse: 0.476585
[578]	valid_0's rmse: 0.476565
[579]	valid_0's rmse: 0.476507
[580]	valid_

[824]	valid_0's rmse: 0.466909
[825]	valid_0's rmse: 0.466895
[826]	valid_0's rmse: 0.466837
[827]	valid_0's rmse: 0.466786
[828]	valid_0's rmse: 0.466778
[829]	valid_0's rmse: 0.466753
[830]	valid_0's rmse: 0.466743
[831]	valid_0's rmse: 0.46672
[832]	valid_0's rmse: 0.46671
[833]	valid_0's rmse: 0.46667
[834]	valid_0's rmse: 0.466667
[835]	valid_0's rmse: 0.466624
[836]	valid_0's rmse: 0.466604
[837]	valid_0's rmse: 0.466591
[838]	valid_0's rmse: 0.466567
[839]	valid_0's rmse: 0.46653
[840]	valid_0's rmse: 0.466456
[841]	valid_0's rmse: 0.466394
[842]	valid_0's rmse: 0.466376
[843]	valid_0's rmse: 0.466268
[844]	valid_0's rmse: 0.466254
[845]	valid_0's rmse: 0.466236
[846]	valid_0's rmse: 0.466214
[847]	valid_0's rmse: 0.466181
[848]	valid_0's rmse: 0.466085
[849]	valid_0's rmse: 0.466076
[850]	valid_0's rmse: 0.466038
[851]	valid_0's rmse: 0.466032
[852]	valid_0's rmse: 0.466019
[853]	valid_0's rmse: 0.465899
[854]	valid_0's rmse: 0.465861
[855]	valid_0's rmse: 0.465857
[856]	valid_

In [25]:
y_pred_val_r = np.expm1(clf.predict(x_val_r))
true_val = np.expm1(y_val)

np.sqrt(mean_squared_log_error(y_pred_val_r, true_val))

0.4622247217267719

In [24]:
## Final prediction on test dataset.

x = np.expm1(clf.predict(x_test_r))
pd.DataFrame({'id': test_id, 'num_orders' : x}).to_csv('genpact.csv', index = False)