# Modeling

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import shap
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import cross_val_score

# Notebook-wide pandas display limits: show up to 160 rows/columns and
# truncate cell text at 30 characters.
for option_name, option_value in (
    ('display.max_rows', 160),
    ('display.max_columns', 160),
    ('display.max_colwidth', 30),
):
    pd.set_option(option_name, option_value)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/LightGBM.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the preprocessed data.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source (e.g. this project's own preprocessing).
df = pd.read_pickle('data/df_complete.pkl')

# Training rows: every date block except 0, 1, 33 and 34.
# Block 33 is the validation split and block 34 the prediction split below;
# blocks 0 and 1 are presumably excluded because their lag features are
# incomplete -- TODO confirm against the preprocessing notebook.
X_train = df[~df.date_block_num.isin([0, 1, 33, 34])]
# pop() returns the target column and removes it from the frame in place,
# so no second copy of the (large) training frame is created.
y_train = X_train.pop('item_cnt')

# Validation rows: date block 33.
X_val = df[df['date_block_num'] == 33]
y_val = X_val.pop('item_cnt')

# Prediction rows: date block 34, without the target column.
# drop=True discards the old index directly instead of materialising it as
# a column (whose name depends on the index name) and deleting it afterwards.
X_test = (
    df[df['date_block_num'] == 34]
    .drop(columns='item_cnt')
    .reset_index(drop=True)
)

# free memory
del df

#### LightGBM

In [3]:
def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    """Train a LightGBM model, evaluating on the training and validation data.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``lgb.train`` as ``params``.
    X_train, y_train
        Features and target used to build the training ``lgb.Dataset``.
    X_val, y_val
        Features and target used to build the validation ``lgb.Dataset``.
    cat_features : list
        Forwarded to ``lgb.train`` as ``categorical_feature``.

    Returns
    -------
    Whatever ``lgb.train`` returns (the trained model).
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    train_kwargs = {
        'params': params,
        'train_set': lgb_train,
        # Training set first, validation set second.
        'valid_sets': [lgb_train, lgb_val],
        'categorical_feature': cat_features,
    }

    # Log the evaluation metrics every 50 iterations.
    # The ``verbose_eval`` argument was removed from ``lgb.train`` in
    # LightGBM 4.0; the ``log_evaluation`` callback is its replacement.
    # Fall back to ``verbose_eval`` only on releases without the callback.
    if hasattr(lgb, 'log_evaluation'):
        train_kwargs['callbacks'] = [lgb.log_evaluation(period=50)]
    else:
        train_kwargs['verbose_eval'] = 50

    model = lgb.train(**train_kwargs)
    return model
# Skip this cell if you are loading a previously saved model instead of training.
params = {
    # Optimise and report root-mean-squared error.
    'objective': 'rmse',
    'metric': 'rmse',

    # Tree / sampling settings.
    'num_leaves': 1023,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.7,

    # Boosting schedule: up to 1000 rounds, stopping once the validation
    # score has not improved for 30 rounds.
    'learning_rate': 0.01,
    'num_rounds': 1000,
    'early_stopping_rounds': 30,

    # Fixed seed so runs are repeatable.
    'seed': 1,
}

# Columns handed to LightGBM as categorical features.
cat_features = ['item_category_id', 'month', 'shop_id', 'shop_city']

lgb_model = build_lgb_model(
    params=params,
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    cat_features=cat_features,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14918
[LightGBM] [Info] Number of data points in the train set: 7785068, number of used features: 152
[LightGBM] [Info] Start training from score 0.313209
Training until validation scores don't improve for 30 rounds
[50]	training's rmse: 0.978905	valid_1's rmse: 0.893572
[100]	training's rmse: 0.840032	valid_1's rmse: 0.80698
[150]	training's rmse: 0.7648	valid_1's rmse: 0.77043
[200]	training's rmse: 0.722624	valid_1's rmse: 0.754407
[250]	training's rmse: 0.694978	valid_1's rmse: 0.746354
[300]	training's rmse: 0.675147	valid_1's rmse: 0.742143
[350]	training's rmse: 0.659851	valid_1's rmse: 0.740023
[400]	training's rmse: 0.647107	valid_1's rmse: 0.73918
[450]	training's rmse: 0.636354	valid_1's rmse: 0.738924
[500]	training's rmse: 0.62659	valid_1's rmse: 0.738778
Early stopping, best iteration is:
[508]	training's rmse: 0.625269	val

In [4]:
# Start from the sample submission file.
# NOTE(review): assumes the rows of X_test are in the same order as the
# sample submission's rows -- confirm against the preprocessing step.
submission = pd.read_csv('data/sample_submission.csv')

# Predict on the test rows and clamp the predictions to the range [0, 20].
raw_predictions = lgb_model.predict(X_test)
submission['item_cnt_month'] = raw_predictions.clip(0, 20)

# Write only the ID and prediction columns, without the DataFrame index.
output_columns = ['ID', 'item_cnt_month']
submission[output_columns].to_csv('second_lgb_submission.csv', index=False)