In [1]:
import math
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 70)
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from bayes_opt import BayesianOptimization #From https://github.com/fmfn/BayesianOptimization

%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Peek at the first 100 rows only, to learn which dtype pandas infers per column.
sales = pd.read_csv('sales.csv', nrows=100)

# float64 columns are re-read as float16, except those whose name contains
# 'revenue' or 'ratio'.
float_cols = [
    name for name, dtype in sales.dtypes.items()
    if dtype == "float64" and 'revenue' not in name and 'ratio' not in name
]
float16_cols = dict.fromkeys(float_cols, np.float16)

# int64 columns are re-read as int8, except 'itemid'.
# NOTE(review): int8 only holds -128..127 and the dtypes are inferred from a
# 100-row sample - confirm every down-cast column fits over the whole file.
int_cols = [
    name for name, dtype in sales.dtypes.items()
    if dtype == "int64" and name != 'itemid'
]
int8_cols = dict.fromkeys(int_cols, np.int8)

# float16_cols ends up holding the dtype override for every down-cast column.
float16_cols.update(int8_cols)
sales = pd.read_csv('sales.csv', engine='c', dtype=float16_cols)

In [3]:
#All features were selected through a (very time-consuming) feature selection process combining intuition,
#manual recursive feature elimination, and LightGBM feature importances

In [8]:
# Columns of `sales` fed to LightGBM (selected as `traindf[features]` when the
# Dataset is built). The order below is the order in which the model sees them.
# NOTE(review): the 'ts<keys><n>' names look like lagged aggregates over the
# named key columns with lag n - confirm against the feature-engineering code.
features = [
    'month',
    'shopid',
    'itemcategoryid',
    'meanlagitemshop',
    'meanlagitem',
    'cumulativemeanitemshop',
    'shopcityid',
    'shoptypeid',
    'itemcategorytypeid',
    'itemcategorysubtypeid',
    'nbdays',
    'year',
    'monthofyear',
    'nbweekends',
    'ratiolastanteprice',
    'ratiolastavgprice',
    'ratiolastminprice',
    'ratiolastmaxprice',
    'ratiorevenue12',
    'newitem',
    'tsshopid1',
    'tsshopid2',
    'tsshopid3',
    'tsshopid6',
    'tsshopid12',
    'tsshopcityid1',
    'tsshoptypeid1',
    'tsitemcategoryid1',
    'tsitemcategorysubtypeid1',
    'tsitemcategoryiditemcategorytypeid1',
    'tsitemcategoryiditemcategorysubtypeid1',
    'tsitemcategoryidshopcityid1',
    'tsitemcategoryidshoptypeid1',
    'tsitemcategoryidshopid1',
    'tsitemcategorytypeiditemcategorysubtypeid1',
    'tsitemcategorytypeidshopcityid1',
    'tsitemcategorytypeidshoptypeid1',
    'tsitemcategorytypeidshopid1',
    'tsitemcategorysubtypeidshopcityid1',
    'tsitemcategorysubtypeidshoptypeid1',
    'tsitemcategorysubtypeidshopid1',
    'tsshopcityidshoptypeid1',
    'tsshopcityidshopid1',
    'tsshoptypeidshopid1',
    'tsitemid1',
    'tsitemid2',
    'tsitemid3',
    'tsitemid6',
    'tsitemid12',
    'tsshopiditemid1',
    'tsshopiditemid2',
    'tsshopiditemid3',
    'tsshopiditemid6',
    'tsshopiditemid12'
]

# Subset of `features` passed to lgb.Dataset as `categorical_feature`.
# Every name listed here also appears in `features` above.
categorical_features = [
    'shopid',
    'itemcategoryid',
    'shopcityid',
    'shoptypeid',
    'itemcategorytypeid',
    'itemcategorysubtypeid',
]

In [11]:
#Train until month 32, predict month 33
def run_lgb(features, categorical_features, params, minmonth=12, monthpred=33, rounds=10000, verbose_eval=1):
    """Train LightGBM on months [minmonth, monthpred) and score it on month `monthpred`.

    Reads the notebook-level `sales` frame. With the default arguments this is
    exactly the original behaviour: train on months 12..32, validate on month 33,
    up to 10000 boosting rounds, one log line per round.

    Parameters
    ----------
    features : list of str
        Columns of `sales` used as model inputs.
    categorical_features : list of str
        Columns passed to LightGBM as `categorical_feature`.
    params : dict
        LightGBM parameters, forwarded unchanged to `lgb.train`.
    minmonth : int, default 12
        First month (inclusive) kept in the training window.
    monthpred : int, default 33
        Held-out month used as the validation set.
    rounds : int, default 10000
        Maximum number of boosting rounds handed to `lgb.train`.
    verbose_eval : default 1
        Forwarded to `lgb.train` to control how often metrics are logged.

    Returns
    -------
    float
        The negated RMSE on month `monthpred` (negated so that a maximiser
        such as BayesianOptimization can use it directly as a score).

    Raises
    ------
    ValueError
        If the training or validation split selected from `sales` is empty.
    """
    in_window = sales.month >= minmonth
    traindf = sales[in_window & (sales.month < monthpred)]
    testdf = sales[in_window & (sales.month == monthpred)]
    if traindf.empty or testdf.empty:
        # Fail with a readable message instead of an obscure error deep inside LightGBM.
        raise ValueError(
            'empty split for minmonth={}, monthpred={}: {} training rows, {} validation rows'.format(
                minmonth, monthpred, len(traindf), len(testdf)
            )
        )

    traingbm = lgb.Dataset(traindf[features], label=traindf['target'], categorical_feature=categorical_features, free_raw_data=False)
    testgbm = lgb.Dataset(testdf[features], label=testdf['target'], categorical_feature=categorical_features, free_raw_data=False)

    # Both sets are evaluated each round; the held-out month is the second entry
    # and shows up as "valid_1" in the training log.
    m = lgb.train(params, traingbm, rounds, valid_sets=[traingbm, testgbm], verbose_eval=verbose_eval)
    predsgbm = m.predict(testdf[features])
    mse = mean_squared_error(testdf['target'], predsgbm)
    rmse = math.sqrt(mse)
    print('total rmse : {}'.format(rmse))
    return -rmse #(score for Bayesian optimization)

In [12]:
#HyperParameters found using Bayesian optimization (see below section)
# LightGBM parameters forwarded to lgb.train by run_lgb (and reused for the
# final fit further down).
# NOTE(review): the LightGBM parameter docs list 'min_child_samples' as an alias
# of 'min_data_in_leaf', so the two values below (107 vs 184) may be competing
# for a single setting - confirm which one LightGBM actually applies.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 116,
    'max_depth': 7,
    'feature_fraction': 0.9674089710795843,
    'bagging_fraction': 0.9064002765231052,
    'bagging_freq': 3,
    'learning_rate': 0.05096799128425091,
    'early_stopping_round': 60,
    'lambda': 0.9941443549841082,
    'min_split_gain': 7.324275428951591,
    'min_child_samples': 107,
    'min_data_per_group': 20,
    'min_child_weight': 9.558708559566462,
    'cat_smooth': 2.876702721398722,
    'min_data_in_leaf': 184,
    'max_bin': 341,
    'verbosity' : -1
}
# Displays run_lgb's return value: the negated RMSE on the held-out month.
run_lgb(features,categorical_features,params)

[1]	training's rmse: 1.16135	valid_1's rmse: 1.13081
Training until validation scores don't improve for 60 rounds.
[2]	training's rmse: 1.13523	valid_1's rmse: 1.10946
[3]	training's rmse: 1.111	valid_1's rmse: 1.08872
[4]	training's rmse: 1.08863	valid_1's rmse: 1.07032
[5]	training's rmse: 1.06794	valid_1's rmse: 1.05301
[6]	training's rmse: 1.04889	valid_1's rmse: 1.0365
[7]	training's rmse: 1.03134	valid_1's rmse: 1.02208
[8]	training's rmse: 1.01499	valid_1's rmse: 1.00796
[9]	training's rmse: 0.999741	valid_1's rmse: 0.994976
[10]	training's rmse: 0.985839	valid_1's rmse: 0.983602
[11]	training's rmse: 0.972997	valid_1's rmse: 0.973404
[12]	training's rmse: 0.961021	valid_1's rmse: 0.963455
[13]	training's rmse: 0.950062	valid_1's rmse: 0.954546
[14]	training's rmse: 0.939822	valid_1's rmse: 0.946114
[15]	training's rmse: 0.930542	valid_1's rmse: 0.939198
[16]	training's rmse: 0.921945	valid_1's rmse: 0.931859
[17]	training's rmse: 0.913756	valid_1's rmse: 0.924777
[18]	training'

[147]	training's rmse: 0.780524	valid_1's rmse: 0.843935
[148]	training's rmse: 0.780396	valid_1's rmse: 0.843956
[149]	training's rmse: 0.7803	valid_1's rmse: 0.844023
[150]	training's rmse: 0.780145	valid_1's rmse: 0.844089
[151]	training's rmse: 0.779812	valid_1's rmse: 0.84404
[152]	training's rmse: 0.7795	valid_1's rmse: 0.844075
[153]	training's rmse: 0.779254	valid_1's rmse: 0.844035
[154]	training's rmse: 0.779033	valid_1's rmse: 0.844015
[155]	training's rmse: 0.778835	valid_1's rmse: 0.843959
[156]	training's rmse: 0.778698	valid_1's rmse: 0.843959
[157]	training's rmse: 0.778591	valid_1's rmse: 0.843945
[158]	training's rmse: 0.778459	valid_1's rmse: 0.843899
[159]	training's rmse: 0.778241	valid_1's rmse: 0.843818
[160]	training's rmse: 0.777964	valid_1's rmse: 0.843802
[161]	training's rmse: 0.77785	valid_1's rmse: 0.843789
[162]	training's rmse: 0.777613	valid_1's rmse: 0.84375
[163]	training's rmse: 0.77751	valid_1's rmse: 0.843768
[164]	training's rmse: 0.777428	valid_1

[292]	training's rmse: 0.761505	valid_1's rmse: 0.843151
[293]	training's rmse: 0.761439	valid_1's rmse: 0.843167
[294]	training's rmse: 0.761374	valid_1's rmse: 0.843185
[295]	training's rmse: 0.7613	valid_1's rmse: 0.843205
[296]	training's rmse: 0.76122	valid_1's rmse: 0.843216
[297]	training's rmse: 0.761086	valid_1's rmse: 0.843219
[298]	training's rmse: 0.76097	valid_1's rmse: 0.843281
[299]	training's rmse: 0.760878	valid_1's rmse: 0.843335
[300]	training's rmse: 0.760806	valid_1's rmse: 0.843299
[301]	training's rmse: 0.760708	valid_1's rmse: 0.843304
[302]	training's rmse: 0.760619	valid_1's rmse: 0.843289
[303]	training's rmse: 0.760582	valid_1's rmse: 0.843141
[304]	training's rmse: 0.760493	valid_1's rmse: 0.843161
[305]	training's rmse: 0.760339	valid_1's rmse: 0.843121
[306]	training's rmse: 0.760202	valid_1's rmse: 0.843196
[307]	training's rmse: 0.760132	valid_1's rmse: 0.843254
[308]	training's rmse: 0.760057	valid_1's rmse: 0.843256
[309]	training's rmse: 0.760002	val

-0.8425161900015496

In [None]:
#Train one last time on the whole dataset before submission

In [14]:
ROUNDS = 200  # Found by averaging the optimal nb of rounds for predictions of month 32 and 33
# Every labelled month from 12 to 33 is used for the final fit.
traindf = sales[(sales.month <= 33) & (sales.month >= 12)]
traingbm = lgb.Dataset(traindf[features], label=traindf['target'], categorical_feature=categorical_features, free_raw_data=False)
# The only evaluation set here is the training data itself, so the
# 'early_stopping_round' entry of `params` would be watching training RMSE.
# Drop it for this fit and train for exactly ROUNDS iterations; `params` itself
# is left untouched for the validation runs.
final_params = {key: value for key, value in params.items() if key != 'early_stopping_round'}
m = lgb.train(final_params, traingbm, ROUNDS, valid_sets=[traingbm], verbose_eval=10)

Training until validation scores don't improve for 60 rounds.
[10]	training's rmse: 0.98564
[20]	training's rmse: 0.895203
[30]	training's rmse: 0.8535
[40]	training's rmse: 0.832808
[50]	training's rmse: 0.820881
[60]	training's rmse: 0.814011
[70]	training's rmse: 0.808419
[80]	training's rmse: 0.804365
[90]	training's rmse: 0.80107
[100]	training's rmse: 0.798154
[110]	training's rmse: 0.795268
[120]	training's rmse: 0.792629
[130]	training's rmse: 0.790126
[140]	training's rmse: 0.787491
[150]	training's rmse: 0.785591
[160]	training's rmse: 0.783866
[170]	training's rmse: 0.782401
[180]	training's rmse: 0.780607
[190]	training's rmse: 0.778943
[200]	training's rmse: 0.777715
Did not meet early stopping. Best iteration is:
[200]	training's rmse: 0.777715


In [13]:
#Check feature importance

In [14]:
def get_features_importance(model):
    """Return a per-feature importance table for `model`, largest gain first.

    `model` must expose `feature_name()` and `feature_importance(kind)`.
    The returned DataFrame has the columns 'feature', 'split' (as reported by
    the model) and 'gain' (each feature's gain as a percentage of the total).
    """
    raw_gain = model.feature_importance('gain')
    # Express gain as a share of the total so the column sums to 100.
    gain_pct = 100 * raw_gain / raw_gain.sum()
    table = pd.DataFrame({
        'feature': model.feature_name(),
        'split': model.feature_importance('split'),
        'gain': gain_pct,
    })
    return table.sort_values('gain', ascending=False)

In [46]:
# Importance table of the booster `m`, sorted by descending share of total gain.
get_features_importance(m)

Unnamed: 0,feature,split,gain
48,tsshopiditemid1,670,45.212525
3,meanlagitemshop,505,15.694011
2,itemcategoryid,2759,9.858314
4,meanlagitem,1498,4.963407
18,newitem,224,4.374682
1,shopid,2147,3.280105
12,monthofyear,925,1.967077
39,tsitemcategorysubtypeidshopid1,290,1.811962
44,tsitemid2,792,1.712440
5,cumulativemeanitemshop,376,1.423727


In [16]:
#Save model

In [17]:
# Write the trained booster to 'lgb.txt' in the working directory.
m.save_model('lgb.txt')

<lightgbm.basic.Booster at 0x10a93b750>

In [None]:
#Load model if needed

In [None]:
# Rebuild the booster from the 'lgb.txt' file written by m.save_model, replacing `m`.
m = lgb.Booster(model_file='lgb.txt')

In [None]:
#Create submission file

In [17]:
# Load the test set and rename its three columns so that 'shopid' / 'itemid'
# match the key names used when merging with `sales`.
test = pd.read_csv('./data/test.csv')
test.columns = ['ID','shopid','itemid']
# Re-label shop ids 0, 1 and 10 as 57, 58 and 11 respectively.
# NOTE(review): presumably these are duplicate shops merged the same way when
# `sales` was built - confirm against the preprocessing code.
test['shopid'] = test['shopid'].replace({0: 57, 1: 58, 10: 11})
test.head(1)

Unnamed: 0,ID,shopid,itemid
0,0,5,5037


In [16]:
# Month 34 rows of `sales` are the rows to predict.
testdf = sales[sales.month == 34]
preds = m.predict(testdf[features])
# assign() returns a new frame, so the prediction column is never written into
# a slice of `sales` (the original in-place assignment relied on the
# chained-assignment warning being silenced).
testdf = testdf.assign(item_cnt_month=preds)

# Left merge keeps one row per test ID, in the order of `test`.
submission = pd.merge(test,testdf[['shopid','itemid','item_cnt_month']],on=['shopid','itemid'],how='left')
submission = submission[['ID','item_cnt_month']].copy()
print(submission.shape)
print(submission['item_cnt_month'].min())
print(submission['item_cnt_month'].max())

# A test (shopid, itemid) pair with no month-34 row in `sales` comes out of the
# left merge as NaN, and clip() would keep it as NaN in the CSV. Report and
# fill those with 0 explicitly; when nothing is missing the output is unchanged.
n_missing = int(submission['item_cnt_month'].isna().sum())
if n_missing:
    print('missing predictions filled with 0 : {}'.format(n_missing))
submission['item_cnt_month'] = submission['item_cnt_month'].fillna(0).clip(0,20)
print(submission['item_cnt_month'].min())
print(submission['item_cnt_month'].max())

submission.to_csv('submission.csv',index=False)

(214200, 2)
-0.7244908667897672
20.128192412081304
0.0
20.0


<img src="Kaggle Ranking 2.png">

<img src="Kaggle ranking 1.png">

In [21]:
#Find good hyperparameters using Bayesian optimization

In [22]:
def bayesian_lgb(
    num_leaves,
    max_depth,
    min_child_weight,
    feature_fraction,
    bagging_fraction,
    bagging_freq,
    learning_rate,
    lambda_param,
    min_split_gain,
    min_child_samples,
    min_data_per_group,
    cat_smooth,
    min_data_in_leaf,
    max_bin,
):
    """Objective for the Bayesian search: score one hyper-parameter candidate.

    Each argument is one tuned value. Those that are wrapped in int() below are
    truncated to integers before being handed to LightGBM; the others are
    passed through as received. Returns whatever `run_lgb` returns for the
    resulting parameter dict (the negated validation RMSE).
    """
    # NOTE(review): 'min_child_samples' and 'min_data_in_leaf' appear to be
    # aliases of the same LightGBM setting - confirm that tuning both is intended.
    trial_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'min_child_weight': min_child_weight,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': int(bagging_freq),
        'learning_rate': learning_rate,
        'early_stopping_round': 60,
        'lambda': lambda_param,
        'min_split_gain': min_split_gain,
        'min_child_samples': int(min_child_samples),
        'min_data_per_group': int(min_data_per_group),
        'cat_smooth': cat_smooth,
        'min_data_in_leaf': int(min_data_in_leaf),
        'max_bin': int(max_bin),
        'verbosity' : -1
    }
    return run_lgb(features, categorical_features, trial_params)

In [25]:
# (lower, upper) search bounds, one entry per argument of bayesian_lgb.
# The optimizer proposes real numbers; bayesian_lgb truncates the
# integer-valued parameters with int() before training.
pbounds = {
    'num_leaves': (20, 120),
    'max_depth': (2, 20),
    'min_child_weight': (0.5,10.0),
    'feature_fraction': (0.9, 1.0),
    'bagging_fraction': (0.9, 1.0),
    'bagging_freq': (1.0, 20.0),
    'learning_rate': (0.001, 0.8),
    'lambda_param': (0.0, 2.0),
    'min_split_gain': (0.0, 10.0),
    'min_child_samples': (20, 150),
    'min_data_per_group': (20, 400),
    'cat_smooth': (0.0, 5.0),
    'min_data_in_leaf': (10, 200),
    'max_bin': (32, 512)
}

# bayesian_lgb returns the negated RMSE, so maximising it minimises the RMSE.
# random_state is fixed so the sequence of proposals is repeatable.
optimizer = BayesianOptimization(
    f=bayesian_lgb,
    pbounds=pbounds,
    random_state=42,
)
# NOTE(review): init_points=1 / n_iter=2 is a very short search; the tuned
# `params` used earlier in the notebook presumably came from a longer run - confirm.
optimizer.maximize(
    init_points=1,
    n_iter=2,
)

|   iter    |  target   | baggin... | baggin... | cat_sm... | featur... | lambda... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | min_da... | min_da... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[1]	training's rmse: 1.12509	valid_1's rmse: 1.08905
Training until validation scores don't improve for 60 rounds.
[2]	training's rmse: 1.07154	valid_1's rmse: 1.04815
[3]	training's rmse: 1.02854	valid_1's rmse: 1.01589
[4]	training's rmse: 0.992647	valid_1's rmse: 0.991493
[5]	training's rmse: 0.963905	valid_1's rmse: 0.971132
[6]	training's rmse: 0.940176	valid_1's rmse: 0.954991
[7]	training's rmse: 0.920797	valid_1's rmse: 0.942607
[8]	training's rmse: 0.904682	valid_1's rmse: 0.931317
[9]	training's rmse: 0.891917	valid_1's rmse: 0.923861
[10]	training's rmse: 0.881004	valid_1's rmse: 0.91683
[11]	t

[43]	training's rmse: 0.854204	valid_1's rmse: 0.907115
[44]	training's rmse: 0.85396	valid_1's rmse: 0.90641
[45]	training's rmse: 0.853455	valid_1's rmse: 0.905891
[46]	training's rmse: 0.853152	valid_1's rmse: 0.905756
[47]	training's rmse: 0.85293	valid_1's rmse: 0.906083
[48]	training's rmse: 0.852543	valid_1's rmse: 0.906161
[49]	training's rmse: 0.852205	valid_1's rmse: 0.906055
[50]	training's rmse: 0.851957	valid_1's rmse: 0.90606
[51]	training's rmse: 0.851753	valid_1's rmse: 0.906109
[52]	training's rmse: 0.851564	valid_1's rmse: 0.906089
[53]	training's rmse: 0.851177	valid_1's rmse: 0.906281
[54]	training's rmse: 0.850846	valid_1's rmse: 0.905601
[55]	training's rmse: 0.850526	valid_1's rmse: 0.906074
[56]	training's rmse: 0.850317	valid_1's rmse: 0.906415
[57]	training's rmse: 0.849918	valid_1's rmse: 0.905743
[58]	training's rmse: 0.849581	valid_1's rmse: 0.905025
[59]	training's rmse: 0.849439	valid_1's rmse: 0.904963
[60]	training's rmse: 0.849273	valid_1's rmse: 0.904

[189]	training's rmse: 0.828712	valid_1's rmse: 0.895137
[190]	training's rmse: 0.828625	valid_1's rmse: 0.895143
[191]	training's rmse: 0.828551	valid_1's rmse: 0.895134
[192]	training's rmse: 0.828378	valid_1's rmse: 0.895434
[193]	training's rmse: 0.82832	valid_1's rmse: 0.89539
[194]	training's rmse: 0.828247	valid_1's rmse: 0.895288
[195]	training's rmse: 0.828201	valid_1's rmse: 0.895128
[196]	training's rmse: 0.828094	valid_1's rmse: 0.894886
[197]	training's rmse: 0.828059	valid_1's rmse: 0.89489
[198]	training's rmse: 0.827993	valid_1's rmse: 0.894858
[199]	training's rmse: 0.827912	valid_1's rmse: 0.897846
Early stopping, best iteration is:
[139]	training's rmse: 0.833256	valid_1's rmse: 0.892898
total rmse : 0.8863172649851351
| [0m 2       [0m | [0m-0.8863  [0m | [0m 0.9376  [0m | [0m 19.38   [0m | [0m 2.076   [0m | [0m 0.9334  [0m | [0m 1.981   [0m | [0m 0.6971  [0m | [0m 510.4   [0m | [0m 2.236   [0m | [0m 24.0    [0m | [0m 4.935   [0m | [0m 108.7

In [26]:
# Best result recorded by the optimizer: its 'target' (the negated RMSE returned
# by bayesian_lgb) and the raw, un-truncated 'params' that produced it.
print(optimizer.max)

{'target': -0.8763972148816184, 'params': {'bagging_fraction': 0.9374540118847363, 'bagging_freq': 19.063571821788408, 'cat_smooth': 3.6599697090570253, 'feature_fraction': 0.9598658484197037, 'lambda_param': 0.31203728088487304, 'learning_rate': 0.12563962174862592, 'max_bin': 59.88013384073574, 'max_depth': 17.591170623948834, 'min_child_samples': 98.14495152661715, 'min_child_weight': 7.226689489062432, 'min_data_in_leaf': 13.911053916202466, 'min_data_per_group': 388.5657438215578, 'min_split_gain': 8.324426408004218, 'num_leaves': 41.23391106782762}}


### Did not work :
- Preprocess features with PCA and n_components=len(features)
- XGBoost alone, with the hyperparameters below:  
model = XGBRegressor(  
    max_depth=10,  
    n_estimators=1000,  
    min_child_weight=0.5,   
    colsample_bytree=0.8,   
    subsample=0.8,   
    eta=0.1,  
    seed=42)  
- Stacking: linear regression (scikit-learn `LinearRegression()`) on top of the lgb and xgb predictions
- Deep Learning : See Deep Learning Models notebook
- Building a specific model only for new items with no historical data
- Rounding predictions before submission
- Forcing predicted values below a small threshold (e.g. 0.2) to be exactly 0.0 before submission