In [1]:
# importing libraries.
import pandas as pd
import numpy as np 

## Loading Datasets
test = pd.read_csv('/home/aakash/Genpact/test_main.csv')
train = pd.read_csv('/home/aakash/Genpact/train/train.csv') ## This is the demand data for raw materials.
train_meal = pd.read_csv('/home/aakash/Genpact/train/meal_info.csv')
train_centers = pd.read_csv('/home/aakash/Genpact/train/fulfilment_center_info.csv')

In [2]:
## EDA
print('train dataset shape',train.shape) # So we have a huge dataset
print('train dataset columns', train.columns)     

train dataset shape (456548, 9)
train dataset columns Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders'],
      dtype='object')


In [3]:
test_id = test.id
print(test.shape)
print(test.columns) ## So num_orders is our target variable.

(32573, 8)
Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured'],
      dtype='object')


In [4]:
train.dtypes ## so clearly our target varibale is Numerical
## So this is a regression analysis problem.

id                         int64
week                       int64
center_id                  int64
meal_id                    int64
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
dtype: object

In [5]:
train.head()
## A himepage featured meal is likely to have higher num_order.

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [6]:
train.price = train.base_price - train.checkout_price
train.price[train.price >= 0] = 1
train.price[train.price < 0] = 0

test.price = test.base_price - test.checkout_price
test.price[test.price >= 0] = 1
test.price[test.price < 0] = 0

  """Entry point for launching an IPython kernel.
  """


In [7]:
## Now what I want to do is convert all the 3 train data's into a single dataset.
# Note that center ID and meal ID are going to be the most imprtant features for us.
# Lets check for the null values. if any. So no null values.
print(len(train.meal_id.unique()))  
# So we have 77 unique meal centres and 51 unique meals.
print(train_meal.category.value_counts()) # So most meals actually coreespont to beverages.
print("  ")
print(train_meal.cuisine.value_counts()) # and most people prefer Thai cuisine.

51
Beverages       12
Soup             3
Desert           3
Seafood          3
Sandwich         3
Fish             3
Starters         3
Salad            3
Rice Bowl        3
Biryani          3
Extras           3
Other Snacks     3
Pasta            3
Pizza            3
Name: category, dtype: int64
  
Thai           15
Indian         12
Continental    12
Italian        12
Name: cuisine, dtype: int64


In [8]:
print(train_centers.shape)
train_centers.city_code.value_counts() ## So most meal centres are in city with city ID 590
train_centers.region_code.value_counts() # and most meal centres have a region code 30.
train_centers.center_type.value_counts() # So Type_A might be some centres of high importance.
train_centers.head()

(77, 5)


Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [9]:
## Now lwts compute the mean number of orders..
x = np.mean(train.num_orders)
print(x)

# lets make a baseline submission on test data using this mean.
pd.DataFrame({'id': test.id, 'num_orders' : x}).to_csv('genpact.csv', index = False)

261.8727603669275


In [10]:
# Now lets create new datasets. 
## Creating columns from centre data.
# ### For Train data ####
for col in train_centers.columns[1:]:
    train[col] = train.center_id.replace(train_centers.set_index('center_id')[col])
    test[col] = test.center_id.replace(train_centers.set_index('center_id')[col])

train['center_type'] = pd.get_dummies(train['center_type'])
test['center_type'] = pd.get_dummies(test['center_type'])

for col in train_meal.columns[1:]:
    train[col] = train.meal_id.replace(train_meal.set_index('meal_id')[col])
    test[col] = test.meal_id.replace(train_meal.set_index('meal_id')[col])

train['category'] = pd.get_dummies(train['category'])
test['category'] = pd.get_dummies(test['category'])

train['cuisine'] = pd.get_dummies(train['cuisine'])
test['cuisine'] = pd.get_dummies(test['cuisine'])

In [11]:
from scipy.stats import skew
print(skew(train.num_orders))

train.num_orders = np.log1p(train.num_orders)
print(skew(train.num_orders))

6.929943296742635
-0.021917552017371056


In [12]:
### Modelling ###
## Adding an Xgboost model
import matplotlib.pyplot as plt
% matplotlib inline

train_y = train['num_orders']
train_x = train.drop(['num_orders', 'base_price', 'checkout_price', 'id'], axis = 1)
test = test.drop(['id', 'base_price', 'checkout_price'], axis = 1)

import xgboost as xgb
dtrain = xgb.DMatrix(train_x, label = train_y)
dtest = xgb.DMatrix(test)

params = {'max_depth' : 9 , 'eta' :0.65, "objective" : "reg:linear", "colsample_bytree" : 1}
model = xgb.cv(params, dtrain, num_boost_round = 40, early_stopping_rounds = 50) 
model.loc[5:, ['test-rmse-mean', 'train-rmse-mean']]

Unnamed: 0,test-rmse-mean,train-rmse-mean
5,0.606076,0.598016
6,0.586989,0.577848
7,0.571681,0.560963
8,0.56184,0.54992
9,0.553508,0.540179
10,0.548499,0.533965
11,0.545836,0.530543
12,0.539705,0.523411
13,0.536276,0.518848
14,0.533292,0.514745


In [19]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter('ignore')

iter_no = 5
cv_splits = 10

def treesCV(eta, gamma,max_depth,subsample,colsample_bytree):
#function for cross validation gradient boosted trees
    return(-cross_val_score(xgb.XGBRegressor(objective='reg:linear',
                                            tree_method = 'auto',
                                            learning_rate = eta,
                                            gamma=max(gamma,0),
                                            max_depth = int(max_depth),
                                            colsample_bytree = min(colsample_bytree,1),
                                            n_estimators = 20,
                                            subsample = min(subsample,1), 
                                            seed=95), 
                                            X=train_x, 
                                            y=train_y, 
                                            scoring = 'neg_mean_squared_log_error', 
                                            cv=7, n_jobs=-1).mean())

In [24]:
## Bayesian Optimisation

treesBO = BayesianOptimization(treesCV,{'eta' : (0.55,0.65),'gamma':(2,5), 'max_depth': (6,9),
                                            'subsample':(0.95,1),
                                            'colsample_bytree':(0.90,0.95)})
treesBO.maximize(n_iter = 5)
tree_best = treesBO.res['max']

#train tree with best paras
model_xgb = xgb.XGBRegressor(objective='reg:linear',
                                    tree_method = 'auto',
                                    seed=95,
                                    learning_rate=max(tree_best['max_params']['eta'],0),
                                    gamma=max(tree_best['max_params']['gamma'],0),
                                    max_depth=int(tree_best['max_params']['max_depth']),
                                    silent=True,
                                    subsample=min(tree_best['max_params']['subsample'],1),
                                    colsample_bytree=min(tree_best['max_params']['colsample_bytree'],1),
                                    n_estimators=50,nthread=-1)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |       eta |     gamma |   max_depth |   subsample | 
    1 | 00m20s | [35m   0.01334[0m | [32m            0.9256[0m | [32m   0.6038[0m | [32m   4.5946[0m | [32m     6.0900[0m | [32m     0.9913[0m | 
    2 | 00m19s | [35m   0.01343[0m | [32m            0.9281[0m | [32m   0.6381[0m | [32m   2.6367[0m | [32m     6.9933[0m | [32m     0.9925[0m | 
    3 | 00m22s |    0.01339 |             0.9149 |    0.6205 |    3.1583 |      6.5728 |      0.9681 | 
    4 | 00m30s |    0.01225 |             0.9417 |    0.5903 |    2.7833 |      8.5748 |      0.9711 | 
    5 | 00m22s | [35m   0.01354[0m | [32m            0.9331[0m | [32m   0.5972[0m | [32m   4.0921[0m | [32m     6.8559[0m | [32m     0.9662[0m | 
[31mBayesian Optimization[0m
[94m------------------------------------------

In [28]:
## Splitting the training data into train and validation sets.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val  = train_test_split(train_x, train_y, test_size = 0.25, random_state = 1)

from sklearn.metrics import mean_squared_log_error
model_xgb.fit(x_train, y_train)
y_pred_val = np.expm1(model_xgb.predict(x_val))
true_val = np.expm1(y_val)

np.sqrt(mean_squared_log_error(y_pred_val, true_val))

0.5464524615810603

In [27]:
x = np.expm1(model_xgb.predict(test))
pd.DataFrame({'id': test_id, 'num_orders' : x}).to_csv('genpact.csv', index = False)