# Food Demand Forecasting Challenge
https://datahack.analyticsvidhya.com/contest/genpact-machine-learning-hackathon-1/  
**Aleksey Shipitsyn**    
**2019-07-22**

# Read and Explore Train Data

In [209]:
import numpy as np
import pandas as pd


In [210]:
# weekly demand data
df_demand = pd.read_csv('./Documents/Competitions/Food Demand/train_GzS76OK/train.csv',
                       dtype={'id': str, 'week': int, 'center_id': str, 'meal_id': str, 
                              'checkout_price': float, 'base_price': float, 
                              'emailer_for_promotion': int, 'homepage_featured': int, 'num_orders': int})

print(df_demand.isnull().sum(),'\n')

print('shape:', df_demand.shape, '\n')

print('dtype:', df_demand.dtypes, '\n')

print(df_demand.center_id.unique())


df_demand.head()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64 

shape: (456548, 9) 

dtype: id                        object
week                       int64
center_id                 object
meal_id                   object
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
dtype: object 

['55' '24' '11' '83' '32' '13' '109' '52' '93' '186' '146' '57' '149' '89'
 '124' '152' '97' '74' '108' '99' '66' '94' '91' '20' '34' '137' '92'
 '126' '36' '162' '75' '177' '27' '157' '106' '64' '129' '14' '17' '153'
 '139' '161' '81' '26' '73' '50' '104' '42' '113' '145' '53' '72' '67'
 '174' '29' '77' '41' '30' '76' '59' '88' '143' '58' '10' '101' '80' '43'
 '65' '39' '102' '110' '132' '23' '86'

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [211]:
# number of unique values in columns
pd.DataFrame({'column': df_demand.columns, 
              'unique': [len(df_demand[col].unique()) for col in df_demand.columns]})   

Unnamed: 0,column,unique
0,id,456548
1,week,145
2,center_id,77
3,meal_id,51
4,checkout_price,1992
5,base_price,1907
6,emailer_for_promotion,2
7,homepage_featured,2
8,num_orders,1250


In [212]:
# fulfillment centers data
df_fulfill = pd.read_csv('./Documents/Competitions/Food Demand/train_GzS76OK/fulfilment_center_info.csv',
                        dtype={'center_id': str, 'city_code': str, 'region_code': str, 'center_type': str, 
                               'op_area': float})                         

print(df_fulfill.isnull().sum(), '\n')

print('shape:', df_fulfill.shape, '\n')

print('dtype:', df_fulfill.dtypes)

df_fulfill.head()

center_id      0
city_code      0
region_code    0
center_type    0
op_area        0
dtype: int64 

shape: (77, 5) 

dtype: center_id       object
city_code       object
region_code     object
center_type     object
op_area        float64
dtype: object


Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [213]:
# number of unique values in columns
pd.DataFrame({'column': df_fulfill.columns, 
              'unique': [len(df_fulfill[col].unique()) for col in df_fulfill.columns]})   

Unnamed: 0,column,unique
0,center_id,77
1,city_code,51
2,region_code,8
3,center_type,3
4,op_area,30


In [214]:
# meal being served
df_meal = pd.read_csv('./Documents/Competitions/Food Demand/train_GzS76OK/meal_info.csv',
                     dtype={'meal_id': str, 'category': str, 'cuisine': str})

print(df_meal.isnull().sum(), '\n')

print('shape:', df_meal.shape, '\n')

print('dtype:', df_meal.dtypes)

df_meal.head()

meal_id     0
category    0
cuisine     0
dtype: int64 

shape: (51, 3) 

dtype: meal_id     object
category    object
cuisine     object
dtype: object


Unnamed: 0,meal_id,category,cuisine
0,1885,Beverages,Thai
1,1993,Beverages,Thai
2,2539,Beverages,Thai
3,1248,Beverages,Indian
4,2631,Beverages,Indian


In [215]:
# number of unique values in columns
pd.DataFrame({'column': df_meal.columns, 
              'unique': [len(df_meal[col].unique()) for col in df_meal.columns]})   

Unnamed: 0,column,unique
0,meal_id,51
1,category,14
2,cuisine,4


## Read and Explore Test Data

In [216]:
# test data 
# extention of Excel file, but it is actually csv-file 
df_test = pd.read_csv('./Documents/Competitions/Food Demand/test_QoiMO9B.xls',
                      dtype={'id': str, 'week': int, 'center_id': str, 'meal_id': str, 
                             'checkout_price': float, 'base_price': float, 
                             'emailer_for_promotion': int, 'homepage_featured': int})

print(df_test.isnull().sum(), '\n')

print('shape:', df_test.shape, '\n')

print('dtype:', df_test.dtypes)

df_test.head()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
dtype: int64 

shape: (32573, 8) 

dtype: id                        object
week                       int64
center_id                 object
meal_id                   object
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
dtype: object


Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured
0,1028232,146,55,1885,158.11,159.11,0,0
1,1127204,146,55,1993,160.11,159.11,0,0
2,1212707,146,55,2539,157.14,159.14,0,0
3,1082698,146,55,2631,162.02,162.02,0,0
4,1400926,146,55,1248,163.93,163.93,0,0


In [217]:
# number of unique values in columns
pd.DataFrame({'column': df_test.columns, 
              'unique': [len(df_test[col].unique()) for col in df_test.columns]})   

Unnamed: 0,column,unique
0,id,32573
1,week,10
2,center_id,77
3,meal_id,51
4,checkout_price,1397
5,base_price,1179
6,emailer_for_promotion,2
7,homepage_featured,2


In [218]:
# submission file data 
# extention of Excel file, but it is actually csv-file 
df_submission = pd.read_csv('./Documents/Competitions/Food Demand/sample_submission_hSlSoT6.xls',
                           dtype={'id': str, 'num_orders': int})

print(df_submission.isnull().sum(), '\n')

print('shape:', df_submission.shape, '\n')

print('dtype:', df_submission.dtypes)

df_submission.head()


id            0
num_orders    0
dtype: int64 

shape: (32573, 2) 

dtype: id            object
num_orders     int64
dtype: object


Unnamed: 0,id,num_orders
0,1028232,0
1,1127204,0
2,1212707,0
3,1082698,0
4,1400926,0


In [219]:
# check if row id in test set is the same as in submission file
any(df_test.id != df_submission.id)

# they are the same

False

## Naive model  

Naive model does not use Time Series approach and not use previous target values to predict the next value.
This model uses only yearly week number to navigate in time. 
The logic is: the pattern of previous years will repeat the next year.  

**Modelling steps:**

- Merge all avaliable data together, for training and test set
- Represent week number as week of the year in range 1-52
- Randomly split training data to training and validation subsets
- Feed everything into Regression model
- Evaluate the model 


## Merging data and Train Test split

In [220]:
# merging train data
data_train = pd.merge(left=df_demand, right=df_fulfill, how='left', on='center_id')
data_train = pd.merge(left=data_train, right=df_meal, how='left', on='meal_id')
data_train.head()

data_train.center_id.unique()


array(['55', '24', '11', '83', '32', '13', '109', '52', '93', '186',
       '146', '57', '149', '89', '124', '152', '97', '74', '108', '99',
       '66', '94', '91', '20', '34', '137', '92', '126', '36', '162',
       '75', '177', '27', '157', '106', '64', '129', '14', '17', '153',
       '139', '161', '81', '26', '73', '50', '104', '42', '113', '145',
       '53', '72', '67', '174', '29', '77', '41', '30', '76', '59', '88',
       '143', '58', '10', '101', '80', '43', '65', '39', '102', '110',
       '132', '23', '86', '68', '51', '61'], dtype=object)

In [221]:
# merging tets data
data_test = pd.merge(left=df_test, right=df_fulfill, how='left', on='center_id')
data_test = pd.merge(left=data_test, right=df_meal, how='left', on='meal_id')
data_test.head()

data_test.center_id.unique()

array(['55', '24', '11', '83', '32', '13', '109', '52', '93', '186',
       '146', '57', '149', '89', '124', '152', '97', '74', '108', '99',
       '66', '94', '91', '20', '34', '137', '92', '126', '36', '162',
       '75', '177', '27', '157', '106', '64', '129', '14', '17', '153',
       '139', '161', '81', '26', '73', '50', '104', '42', '113', '145',
       '53', '72', '67', '174', '29', '77', '41', '30', '76', '59', '88',
       '143', '58', '10', '101', '80', '43', '65', '39', '102', '110',
       '132', '23', '86', '68', '51', '61'], dtype=object)

In [233]:
# select feature columns
feature_columns = ['week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
                   'emailer_for_promotion', 'homepage_featured', 'city_code',
                   'region_code', 'center_type', 'op_area', 'category', 'cuisine']

X_train = data_train[feature_columns]
X_test = data_test[feature_columns]
y_train = data_train['num_orders']

print('X_train.shape:', X_train.shape)
print('X_test.shape:', X_test.shape)
print('y_train.shape:',y_train.shape)

X_train.shape: (456548, 13)
X_test.shape: (32573, 13)
y_train.shape: (456548,)


In [234]:
# change week number to 1-52 range

if any(data_train['week'] > 52):
    a = data_train['week'] % 52 
    a[a == 0] = 52
    data_train['week'] = a
    
print(data_train['week'].unique(), '\n')    
    
    
if any(data_test['week'] > 52):
    a = data_test['week'] % 52 
    a[a == 0] = 52
    data_test['week'] = a
    
print(data_test['week'].unique())


[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52] 

[42 43 44 45 46 47 48 49 50 51]


In [235]:
def nonlinear_features(x, varname='x'):    
    """Non-linear features for numeric x"""
    
    x05 = np.sqrt(x)
    x2 = x**2
    x3 = x2*x
    x4 = x3*x
    xlog = np.log(x)
    
    features = pd.DataFrame({varname + '_root': x05, varname + '_square': x2, 
                             varname + '_cube': x3,   varname + '_quad': x4, 
                             varname + '_log': xlog})    
    return features
    
    
df = pd.DataFrame({'a': [1,2,3,4,5,6,7,8,9]})
f = nonlinear_features(df.a, varname='a')

pd.concat([df, f], axis=1)

Unnamed: 0,a,a_root,a_square,a_cube,a_quad,a_log
0,1,1.0,1,1,1,0.0
1,2,1.414214,4,8,16,0.693147
2,3,1.732051,9,27,81,1.098612
3,4,2.0,16,64,256,1.386294
4,5,2.236068,25,125,625,1.609438
5,6,2.44949,36,216,1296,1.791759
6,7,2.645751,49,343,2401,1.94591
7,8,2.828427,64,512,4096,2.079442
8,9,3.0,81,729,6561,2.197225


In [236]:
numeric_vars = ['checkout_price', 'base_price', 'op_area']

for var in numeric_vars:
    
    f = nonlinear_features(X_train[var], varname=var)    
    X_train = pd.concat([X_train, f], axis=1) 
    
    f = nonlinear_features(X_test[var], varname=var)
    X_test = pd.concat([X_test, f], axis=1) 




In [237]:
print('X_train.shape:', X_train.shape)
print('X_test.shape:', X_test.shape)
print('y_train.shape:',y_train.shape, '\n')
print(X_train.columns, '\n')
print(X_test.columns)


X_train.shape: (456548, 28)
X_test.shape: (32573, 28)
y_train.shape: (456548,) 

Index(['week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'city_code',
       'region_code', 'center_type', 'op_area', 'category', 'cuisine',
       'checkout_price_root', 'checkout_price_square', 'checkout_price_cube',
       'checkout_price_quad', 'checkout_price_log', 'base_price_root',
       'base_price_square', 'base_price_cube', 'base_price_quad',
       'base_price_log', 'op_area_root', 'op_area_square', 'op_area_cube',
       'op_area_quad', 'op_area_log'],
      dtype='object') 

Index(['week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'city_code',
       'region_code', 'center_type', 'op_area', 'category', 'cuisine',
       'checkout_price_root', 'checkout_price_square', 'checkout_price_cube',
       'checkout_price_quad', 'checkout_price_log', 'base_price_root',
 

In [239]:
# ensure labels of categorical vars
categorical_vars = X_train.columns[X_train.dtypes == 'object']

for var in categorical_vars:
    categories = X_train[var].append(X_test[var]).unique()
    X_train[var] = pd.Categorical(X_train[var], categories=categories, ordered=False)
    X_test[var] = pd.Categorical(X_test[var], categories=categories, ordered=False)
    

In [244]:
# dummy encoding
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

print('X_train.shape:', X_train.shape)
print('X_test.shape:', X_test.shape)


X_train.shape: (456548, 222)
X_test.shape: (32573, 222)


In [None]:
# Scale data

#from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()
#scaler.fit(X_train)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)


In [261]:
# training and validation sets

from sklearn.model_selection import train_test_split

X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.5, random_state=42)


## Evaluation Metric

In [246]:
# Evaluation Metric = 100 * RMSLE 
# RMSLE is Root of Mean Squared Logarithmic Error across all entries in the test set

from sklearn.metrics import mean_squared_log_error

def predict_evaluate(model, X, y):
    # predict with a model
    y_predicted = model.predict(X).round()

    # trunkate negative velues to zeros
    y_predicted[y_predicted < 0] = 0
        
    return mean_squared_log_error(y_true=y, y_pred=y_predicted)


## Linear models

In [262]:
# Linear models
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# train on training subset
linear_model = LinearRegression(normalize=True).fit(X_train_sub, y_train_sub)
ridge_model = Ridge(normalize=True, alpha=0.1).fit(X_train_sub, y_train_sub)
lasso_model = Lasso(normalize=True, alpha=0.001).fit(X_train_sub, y_train_sub)


  positive)


In [263]:
# evaluation on validation set
print('RMSLE Linear model on validation set:', predict_evaluate(linear_model, X_valid, y_valid))
print('RMSLE Ridge model on validation set:', predict_evaluate(ridge_model, X_valid, y_valid))
print('RMSLE Lasso model on validation set:', predict_evaluate(lasso_model, X_valid, y_valid))


RMSLE Linear model on validation set: 3.0726880435216586
RMSLE Ridge model on validation set: 2.6320932222467723
RMSLE Lasso model on validation set: 2.911348798264864


In [250]:
# retrain models on the whole training data
linear_model = LinearRegression(normalize=True).fit(X_train, y_train)
ridge_model = Ridge(normalize=True, alpha=0.1).fit(X_train, y_train)
lasso_model = Lasso(normalize=True, alpha=0.001).fit(X_train, y_train)


  positive)


In [251]:
# evaluation on the whole training data
print('RMSLE Linear model on training data:', predict_evaluate(linear_model, X_train, y_train))
print('RMSLE Ridge model on training data:', predict_evaluate(ridge_model, X_train, y_train))
print('RMSLE Lasso model on training data:', predict_evaluate(lasso_model, X_train, y_train))


RMSLE Linear model on training data: 3.0648370875919095
RMSLE Ridge model on training data: 2.626183637955585
RMSLE Lasso model on training data: 2.886360110730599


In [None]:
# Variable importance from Ridge coefficients 
print('Intercept:', ridge_model.coef_,'\n')
print('Coefficients:\n' ridge_model.coef_)


## Advanced models

In [264]:
# SVM - very slow but good, RMSLE = 0.84
from sklearn.svm import SVR

svm_model = SVR(gamma='scale').fit(X_train_sub, y_train_sub)


In [265]:
# # evaluate - very slow
# print('RMSLE SVM model on validation set:', predict_evaluate(svm_model, X_valid, y_valid))

# no sense to retrain on the whole training set, use as it is

In [275]:
# XGboost 
from sklearn.ensemble import GradientBoostingRegressor

# train on training subset 
params = {'n_estimators': 1000,'learning_rate': 0.01, 'loss': 'ls', 'random_state': 42 }


In [None]:
xgboost_model = GradientBoostingRegressor(**params).fit(X_train_sub, y_train_sub)

In [270]:
print('RMSLE XGboost model on validation set:', predict_evaluate(xgboost_model, X_valid, y_valid))

RMSLE XGboost model on validation set: 0.7595496188999313


In [276]:
# retrain on all training data
xgboost_model = GradientBoostingRegressor(**params).fit(X_train, y_train)


In [277]:
print('RMSLE XGboost model on training data:', predict_evaluate(xgboost_model, X_train, y_train))

RMSLE XGboost model on training data: 0.7238038432297658


## Final model prediction and submission




In [278]:
final_model = xgboost_model

y_test_predicted = final_model.predict(X_test).astype(int)

# trunkate negative values to zeros
y_test_predicted[y_test_predicted < 0] = 0

df_submission['num_orders'] = y_test_predicted

print(df_submission.head())

df_submission.to_csv('./Documents/Competitions/Food Demand/submission_5.csv', index=False)



        id  num_orders
0  1028232         163
1  1127204         201
2  1212707         138
3  1082698          39
4  1400926          24


In [None]:
svm_model

In [290]:
# rank at submission
rank = 618
participants = 6450
print('Acheaved rank is {} from {} participants, or top {} %'.format(
            rank, participants, round(100*rank/participants, 2)))

Acheaved rank is 618 from 6450 participants, or top 9.58 %
