In [None]:
# Build 6 LightGBM models for the hierarchical model

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from datetime import date, timedelta
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the order-level data and clean it in one chain, so the derived columns are
# added to a fresh frame instead of being assigned onto a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
data = (
    pd.read_csv('df_egm.csv', parse_dates=['order_date'])
    # features with no variance / duplicated features
    .drop(['store_id', 'is_liquidation', 'is_cancelled'], axis=1)
    # duplicated ids: keep the first row seen for each id
    .loc[lambda df: ~df['id'].duplicated()]
    # drop rows with any cancelled quantity
    .loc[lambda df: df['quantity_cancelled'] == 0]
    # calendar date of the order, used later for the train/val/test split
    .assign(date=lambda df: df['order_date'].dt.date)
    # total cost = product + shipping + other
    .assign(cost=lambda df: df['cost_product'] + df['cost_shipping'] + df['cost_other'])
)

In [None]:
# Row masks for the three order segments. They are mutually exclusive; a row with
# both a returned and a replaced quantity above zero matches none of them.
rtmask = data['quantity_returned'].gt(0) & data['quantity_replaced'].eq(0)  # returned only
rpmask = data['quantity_replaced'].gt(0) & data['quantity_returned'].eq(0)  # replaced only
demask = data['quantity_replaced'].eq(0) & data['quantity_returned'].eq(0)  # neither

In [None]:
def prep_data(data, conts, cats, y_name,
              val_start=date(2018, 6, 17), val_end=date(2018, 7, 16)):
    """Build model-ready train / validation / test splits from the order data.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows. Must contain every column named in ``conts`` and ``cats``
        plus a datetime ``order_date`` column.
    conts : list of str
        Columns copied through unchanged. Must include ``'date'`` (used for the
        split, then dropped) and ``y_name`` (split off as the target).
    cats : list of str
        Columns to label-encode to integer codes; nulls stay NaN.
        The list is NOT modified.
    y_name : str
        Name of the target column.
    val_start, val_end : datetime.date, optional
        Inclusive bounds of the validation window. Rows dated before
        ``val_start`` are training rows, rows dated after ``val_end`` are test rows.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test, cats
        Feature frames, target series, and a NEW list holding the categorical
        feature names: the input ``cats`` followed by ``'month'`` and ``'dow'``.

    Notes
    -----
    The encoder is fitted on the ``data`` passed in, so integer codes are only
    comparable between frames produced from the same ``data``.
    """
    # continuous features
    ds = data[conts].copy()

    # categorical features: code to 0 - (n-1), keeping nulls as NaN
    for cat in cats:
        codes = LabelEncoder().fit_transform(data[cat])
        missing = data[cat].isnull()
        if missing.any():
            # Switch to float explicitly before inserting NaN instead of relying on
            # pandas to upcast an integer column during assignment.
            codes = codes.astype(float)
            codes[missing.values] = np.nan
        ds[cat] = codes

    # time features, all zero-based
    order_ts = data['order_date'].dt
    ds['month'] = order_ts.month - 1
    ds['dom'] = order_ts.day - 1
    ds['dow'] = order_ts.dayofweek

    # Build a new list rather than extending the caller's list in place.
    cats = list(cats) + ['month', 'dow']

    def _split_xy(mask):
        # Select the rows, drop the split key, then separate features from target.
        part = ds[mask].drop('date', axis=1)
        return part.drop(y_name, axis=1), part[y_name]

    # train/val/test split
    X_train, y_train = _split_xy(ds['date'] < val_start)
    X_val, y_val = _split_xy((ds['date'] >= val_start) & (ds['date'] <= val_end))
    X_test, y_test = _split_xy(ds['date'] > val_end)

    return X_train, X_val, X_test, y_train, y_val, y_test, cats

In [None]:
def train_model(data, conts, cats, y_name):
    """Fit a LightGBM regressor on the training split of ``data``.

    The splits come from ``prep_data(data, conts, cats, y_name)``; the model is
    trained on the training split and evaluated on both the training and the
    validation split, stopping early after 50 rounds without improvement.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split and train on.
    conts, cats : list of str
        Continuous and categorical column names, as expected by ``prep_data``.
        ``cats`` is not modified.
    y_name : str
        Target column.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    # Hand prep_data a copy so the caller's list is never extended in place.
    X_train, X_val, _, y_train, y_val, _, cats = prep_data(data, conts, list(cats), y_name)

    # set params
    params = {
        'num_leaves': 31,
        'objective': 'regression',
        'min_data_in_leaf': 600,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 2,
        'metric': 'l1',
        'num_threads': 4
    }

    MAX_ROUNDS = 800
    dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=cats)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain, categorical_feature=cats)

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
    # removed in LightGBM 4.0. Older releases name the logging callback
    # print_evaluation instead of log_evaluation.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [lgb.early_stopping(stopping_rounds=50), log_evaluation(period=50)]

    return lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS,
                     valid_sets=[dtrain, dval], callbacks=callbacks)

In [None]:
# Feature configuration for the two targets.
# Each *_conts list also carries 'date' (the split key) and the target column itself,
# because prep_data / prep_data2 read both from the columns selected via conts.

# revenue models
rev_name = 'revenue_net'
rev_conts = [
    'date',
    'sale_price',
    'revenue_product',
    'revenue_shipping',
    'quantity_initial',
    'revenue_net',
]
rev_cats = [
    'shipping_speed_id',
    'supplier_id',
    'category_id',
    'class_id',
    'carrier_id',
    'manufacturer_id',
    'is_b2b',
    'is_giftcard',
]

# cost models
cos_name = 'cost'
cos_conts = [
    'date',
    'sale_price',
    'wholesale_price',
    'quantity_initial',
    'shipping_pred',
    'cost',
]
# same categorical features as the revenue models, held in a separate list
cos_cats = list(rev_cats)

In [None]:
def prep_data2(data, conts, cats, y_name):
    """Same splits as ``prep_data``, but every X frame also carries the ``id`` column.

    ``id`` is placed right after the ``conts`` columns (or stays where it is if
    ``conts`` already lists it), so dropping ``id`` from a returned frame gives
    the column layout ``prep_data`` produces for the same arguments.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; must contain an ``id`` column in addition to what
        ``prep_data`` requires.
    conts, cats : list of str
        Continuous and categorical column names. Neither list is modified.
    y_name : str
        Target column.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test, cats
        As returned by ``prep_data``.
    """
    conts_with_id = list(conts)
    if 'id' not in conts_with_id:
        conts_with_id.append('id')
    # Copies are passed so the caller's lists can never be changed by the call.
    return prep_data(data, conts_with_id, list(cats), y_name)

In [None]:
# Only the test-period features and targets are kept; the remaining split outputs are
# discarded. Fresh copies of the category lists are passed so the module-level lists
# stay untouched by the call.
(_, _, X_rev_test,
 _, _, y_rev_test, _) = prep_data2(data, rev_conts, list(rev_cats), rev_name)
(_, _, X_cos_test,
 _, _, y_cos_test, _) = prep_data2(data, cos_conts, list(cos_cats), cos_name)

In [None]:
# Train the six models (3 order segments x 2 targets) and score every test-period
# order with each of them. Result columns: '<segment>_<target>_pred'.
segments = [('rt', rtmask), ('rp', rpmask), ('de', demask)]
targets = [('rev', rev_conts, rev_cats, rev_name),
           ('cos', cos_conts, cos_cats, cos_name)]

# Rows of `data` that fall in the test period (the rows prep_data2 returned as X_rev_test).
# Only the index is used, so this cell can be re-run without rebuilding X_rev_test.
is_test = data.index.isin(X_rev_test.index)

res = pd.DataFrame({'id': data.loc[is_test, 'id']})

for seg, mask in segments:
    # prep_data / prep_data2 fit a new LabelEncoder on whatever frame they receive, so a
    # model and the rows it scores must be encoded from the SAME frame; otherwise an
    # integer code can stand for different categories at training and prediction time.
    # The frame is therefore the segment's rows plus all test rows. The test rows are
    # dated after the validation window, so the train and validation rows are exactly
    # the segment's own rows.
    seg_data = data[mask | is_test]
    for tgt, conts, cats, y_name in targets:
        bst = train_model(seg_data, conts, list(cats), y_name)
        X_test = prep_data2(seg_data, conts, list(cats), y_name)[2].drop('id', axis=1)
        # predictions are assigned positionally, so the row order must match `res`
        assert X_test.index.equals(res.index)
        res['{}_{}_pred'.format(seg, tgt)] = bst.predict(X_test)

In [None]:
# Write the six prediction columns keyed by order id; the original row index is discarded.
res.set_index('id').to_csv('six_models.csv')