In [1]:
# one lightGBM model for EGM

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from datetime import date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [4]:
# Load the order rows and clean them in a single chain, so every step yields a
# fresh frame (no in-place mutation, no column assignment on a filtered slice).
data = (
    pd.read_csv('df_egm.csv', parse_dates=['order_date'])
    # features with no variance / duplicated features
    .drop(columns=['store_id', 'is_liquidation', 'is_cancelled'])
    # duplicated ids: keep the first row of each
    .drop_duplicates(subset='id')
    # drop rows that have any cancelled quantity
    .loc[lambda df: df['quantity_cancelled'] == 0]
    .assign(
        # calendar date (no time component) of the order
        date=lambda df: df['order_date'].dt.date,
        # total cost = product + shipping + other
        cost=lambda df: df['cost_product'] + df['cost_shipping'] + df['cost_other'],
    )
)

In [5]:
# Working frame with just the columns needed to compute EGM.
ds_egm = data[['id', 'date', 'revenue_net', 'cost']].copy()
egm_raw = 1 - ds_egm['cost'] / ds_egm['revenue_net']
# Excluding revenue_net == 0: EGM can't be calculated there, so store NaN instead.
ds_egm['egm'] = egm_raw.where(ds_egm['revenue_net'] != 0)

In [6]:
# Row masks over ds_egm: |egm| within 3, |egm| beyond 3, and egm missing.
egm_abs = ds_egm['egm'].abs()
in_mask = egm_abs <= 3
out_mask = egm_abs > 3
nan_mask = ds_egm['egm'].isna()
# Total number of rows before any EGM-based filtering.
ntotal = len(ds_egm)

In [7]:
# Share of rows kept by in_mask: over all rows, and over rows with a defined EGM.
n_used = in_mask.sum()
print(
    'total_used:',
    n_used/ntotal,
    '\ntotal_used (no NaN):',
    n_used/(ntotal-nan_mask.sum()),
    sep='\n',
)

total_used:
0.9495986646016417

total_used (no NaN):
0.9892794970722577


In [8]:
# Keep only rows whose |egm| is within the in_mask bound. The explicit copy makes
# `data` an independent frame, so adding the target column below does not write
# into a slice of the previous frame (no SettingWithCopyWarning).
data = data[in_mask].copy()
# Target: egm = 1 - cost / revenue_net.
data['egm'] = 1 - data['cost']/data['revenue_net']

In [9]:
# var_name to predict
name = 'egm'
# continuous features
conts = ['date', 'sale_price', 'wholesale_price', 'revenue_product', 'revenue_shipping', 'shipping_pred', 'quantity_initial', 'egm']
# categorical features
cats = ['shipping_speed_id', 'supplier_id', 'category_id', 'class_id', 'carrier_id', 'manufacturer_id', 'is_b2b', 'is_giftcard']

In [10]:
def _encode_categorical(values):
    """Code the distinct non-null entries of ``values`` as 0..n-1 in sorted order.

    Null entries become NaN (the result is then a float array); when nothing
    is null the integer codes are returned unchanged.
    """
    codes, _ = pd.factorize(values, sort=True)  # nulls get the sentinel code -1
    if (codes >= 0).all():
        return codes
    return np.where(codes >= 0, codes, np.nan)


def prep_data(data, conts, cats, y_name,
              val_start=date(2018, 6, 17), val_end=date(2018, 7, 16)):
    """Build the feature table and split it chronologically into train/val/test.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in ``conts`` and ``cats`` plus a datetime
        ``order_date`` column. ``conts`` must include ``'date'`` (compared
        against ``val_start`` / ``val_end``) and ``y_name``.
    conts : list of str
        Columns copied through unchanged.
    cats : list of str
        Columns to label-encode. The list is NOT modified; derived time
        feature names already present in it are ignored, so the list returned
        by a previous call can be passed back in.
    y_name : str
        Target column.
    val_start, val_end : datetime.date
        Inclusive bounds of the validation window. Rows dated before
        ``val_start`` go to train, rows dated after ``val_end`` go to test.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test, cats
        Feature frames and target series per split, plus a new list holding
        the encoded columns followed by ``'month'`` and ``'dow'``.
    """
    # Derived time features that are reported back as categorical
    # ('dom' is also added as a feature below but is not listed as categorical).
    time_cats = ['month', 'dow']
    base_cats = [c for c in cats if c not in time_cats]

    # continuous features
    ds = data[conts].copy()
    # categorical features: code to 0 - (n-1), nulls stay NaN
    for cat in base_cats:
        ds[cat] = _encode_categorical(data[cat])

    # time features, zero-based
    order_dt = data['order_date'].dt
    ds['month'] = order_dt.month - 1
    ds['dom'] = order_dt.day - 1
    ds['dow'] = order_dt.dayofweek
    all_cats = base_cats + time_cats

    def _split(mask):
        # Select the rows, drop the split key, then separate features from target.
        part = ds[mask].drop(columns='date')
        return part.drop(columns=y_name), part[y_name]

    # train/val/test split by calendar date
    X_train, y_train = _split(ds['date'] < val_start)
    X_val, y_val = _split((ds['date'] >= val_start) & (ds['date'] <= val_end))
    X_test, y_test = _split(ds['date'] > val_end)

    return X_train, X_val, X_test, y_train, y_val, y_test, all_cats

In [11]:
# Build the train/val/test feature frames and targets for `name`.
# Note: `cats` is rebound here to the categorical-feature list returned by prep_data.
X_train, X_val, X_test, y_train, y_val, y_test, cats = prep_data(data, conts, cats, name)

In [12]:
# set params
feat = X_train.columns.tolist()
params = {
    'num_leaves': 21,
    'objective': 'regression',
    'min_data_in_leaf': 600,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l1',
    'num_threads': 4
}

MAX_ROUNDS = 3000
dtrain = lgb.Dataset(X_train, label = y_train, categorical_feature=cats)
dval = lgb.Dataset(X_val, label = y_val, reference=dtrain, categorical_feature=cats)

In [13]:
# train revenue model
bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=50)



Training until validation scores don't improve for 50 rounds.
[50]	training's l1: 0.133182	valid_1's l1: 0.141888
[100]	training's l1: 0.129143	valid_1's l1: 0.138866
[150]	training's l1: 0.126753	valid_1's l1: 0.136683
[200]	training's l1: 0.125587	valid_1's l1: 0.135562
[250]	training's l1: 0.124873	valid_1's l1: 0.13501
[300]	training's l1: 0.124215	valid_1's l1: 0.134464
[350]	training's l1: 0.123876	valid_1's l1: 0.134116
[400]	training's l1: 0.123418	valid_1's l1: 0.133595
[450]	training's l1: 0.123129	valid_1's l1: 0.13342
[500]	training's l1: 0.122894	valid_1's l1: 0.133203
[550]	training's l1: 0.122628	valid_1's l1: 0.132859
[600]	training's l1: 0.122438	valid_1's l1: 0.132684
[650]	training's l1: 0.122384	valid_1's l1: 0.132622
[700]	training's l1: 0.122264	valid_1's l1: 0.132492
[750]	training's l1: 0.122091	valid_1's l1: 0.132286
[800]	training's l1: 0.121945	valid_1's l1: 0.132161
[850]	training's l1: 0.121822	valid_1's l1: 0.13204
[900]	training's l1: 0.121636	valid_1's l

In [14]:
# Gain-based importance of every feature, largest first.
print('Feature_importance:')
ranked_gain = sorted(
    zip(X_train.columns, bst.feature_importance("gain")),
    key=lambda pair: pair[1],
    reverse=True,
)
importance_lines = ["%s: %.2f" % (feature, gain) for feature, gain in ranked_gain]
print("\n".join(importance_lines))

Feature_importance:
supplier_id: 35365.82
class_id: 29058.08
manufacturer_id: 21221.39
revenue_product: 11370.99
wholesale_price: 10183.87
sale_price: 4653.54
month: 2226.20
revenue_shipping: 1092.34
is_b2b: 785.06
shipping_speed_id: 577.57
dow: 388.13
shipping_pred: 382.12
category_id: 376.22
quantity_initial: 322.58
dom: 121.27
carrier_id: 56.72
is_giftcard: 0.00


In [15]:
# Mean absolute error on the training split.
train_pred = bst.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=train_pred)

0.12154786426500057

In [16]:
# Mean absolute error on the held-out test split.
test_pred = bst.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=test_pred)

0.13487910493701635