# Stacking LGBM

## 사용할 package 불러오기

In [1]:
pip install lightgbm



In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV

## 학습에 필요한 파일 불러오기

In [107]:
df=pd.read_excel('./bc/base_line_model.xlsx',header=0)

In [78]:
len(df.columns)

21

In [108]:
df['brand_name']=pd.Categorical(df['brand_name']).codes

In [109]:
del df['fill_exp_min']

In [7]:
print(df.columns,len(df.columns))

Index(['m_code', 'p_code', 'unit_price', 'MONTH', 'DAY', 'HOUR', 'MINUTE',
       'DAY_NUM', 'p_group_code', 'day_of_week_and_hour', 'rain', '52_week',
       'holiday', 'brand_name', 'brand_power', 'mask_ratio', 'parcel_ratio',
       'timeslot', 'season', 'div_exp_min', 'total_price'],
      dtype='object') 21


## 데이터셋 나누기

In [110]:
# 6월 데이터 뺀 나머지 데이터로 모델을 학습시킴
train_set=df[df['MONTH']!=6]
eval_set=df[df['MONTH']==6]

x=train_set.iloc[:,:-1]
y=train_set.iloc[:,-1]

In [111]:
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.1,random_state=42)

## Gridsearch

In [112]:
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'regression',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.01,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'lambda_l1': 5,
          'lambda_l2': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'mape'}

In [113]:
# Initiate classifier to use
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt', 
                        objective = 'regression', 
                        n_jobs = -1, 
                        silent = False,
                        metric='mape',
                        num_iterations = 1000,
                        n_estimators=500,
                        learning_rate=params['learning_rate'],
                        num_leaves=params['num_leaves'],        
                        max_depth=params['max_depth'],        
                        max_bin=params['max_bin'],
                        lambda_l1=params['lambda_l1'],
                        lambda_l2=params['lambda_l2'],
                        subsample_for_bin=params['subsample_for_bin'],
                        subsample=params['subsample'],
                        colsample_bytree=params['colsample_bytree'],
                        min_split_gain=params['min_split_gain'],
                        min_child_weight=params['min_child_weight'],
                        min_child_samples=params['min_child_samples']
                       )

# To view the default model parameters:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'metric', 'num_iterations', 'max_bin', 'lambda_l1', 'lambda_l2'])

In [114]:
gridParams = {
    # 'n_estimators': [500],
    'learning_rate': [0.1],
    'num_leaves': [200,100],
    'max_depth': [500,300], # <0 means no limit
    'max_bin': [500,300], # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': [10], # L1 regularization
    'lambda_l2': [5], # L2 regularization
    'subsample_for_bin': [500], # number of samples for constructing bins
    'subsample': [1], # subsample ratio of the training instance
    'colsample_bytree': [0.5], # subsample ratio of columns when constructing the tree
    'min_split_gain': [0.5], # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': [1], # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': [5]
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(train_x,train_y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  2.8min finished


{'colsample_bytree': 0.5, 'lambda_l1': 10, 'lambda_l2': 5, 'learning_rate': 0.1, 'max_bin': 300, 'max_depth': 500, 'min_child_samples': 5, 'min_child_weight': 1, 'min_split_gain': 0.5, 'num_leaves': 100, 'subsample': 1, 'subsample_for_bin': 500}
0.7504670869049217


## lgbm modeling

In [115]:
d_train = lgb.Dataset(train_x, label=train_y)
d_test = lgb.Dataset(test_x, label=test_y)

model = lgb.train(params=grid.best_params_, 
                  train_set=d_train,
                  num_boost_round=5000)

In [116]:
pred_y=model.predict(test_x)
pred_eval_y=model.predict(eval_set.iloc[:,:-1])
train_pred_y=model.predict(train_x)

def MAPE(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print('test',MAPE((test_y),(pred_y)),
      'train',MAPE(train_y,train_pred_y),
      'evaluation',MAPE((eval_set.iloc[:,-1]),(pred_eval_y)))

test 50.671551883647005 train 12.137000902981413 evaluation 57.66116149182675


In [117]:
params = grid.best_params_

## stacking

In [118]:
from sklearn.linear_model import Ridge,Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import KFold

In [119]:
# regression models for stacking
ridge=Ridge(alpha=0.3)
lasso=Lasso(alpha=0.3)

In [120]:
kfold = KFold(n_splits=5, shuffle=False)
final_model = []
for idx, (train_idx, val_idx) in enumerate(kfold.split(x)):
    train_x, train_y = x.iloc[train_idx,:], y.values[train_idx]
    val_x, val_y = x.iloc[val_idx,:],y.values[val_idx]

    stacking_model = StackingRegressor(regressors=(mdl,ridge,lasso),
                                meta_regressor=mdl,
                                use_features_in_secondary=True)
    mdl.fit(np.array(train_x),np.array(train_y))
    final_model.append(mdl)
    
    pred_y = mdl.predict(np.array(val_x))
    



In [121]:
pred_y=mdl.predict(test_x)
pred_eval_y=mdl.predict(eval_set.iloc[:,:-1])
train_pred_y=mdl.predict(train_x)

def MAPE(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print('test',MAPE((test_y),(pred_y)),
      'train',MAPE(train_y,train_pred_y),
      'evaluation',MAPE((eval_set.iloc[:,-1]),(pred_eval_y)))

test 67.89715893547749 train 58.06914873355863 evaluation 60.798804013780995


## 평가데이터 취급액 예측 (20년 6월 데이터)

### 평가데이터 불러오기

In [122]:
final_test_df=pd.read_excel('./bc/base_line_eval_model.xlsx')

In [123]:
final_test_df

Unnamed: 0,m_code,p_code,unit_price,MONTH,DAY,HOUR,MINUTE,DAY_NUM,p_group_code,day_of_week_and_hour,rain,52_week,holiday,brand_name_code,brand_power,mask_ratio,parcel_ratio,timeslot,season,fill_exp_min,div_exp_min,total_price
0,100650,201971,59800,6,1,6,20,1,0,6,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,20.000000,
1,100650,201971,59800,6,1,6,40,1,0,6,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,20.000000,
2,100650,201971,59800,6,1,7,0,1,0,7,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,20.000000,
3,100445,202278,69900,6,1,7,20,1,1,7,0,23,0,353.0,26.304128,1.85848,1.15168,1,2,20.000000,20.000000,
4,100445,202278,69900,6,1,7,40,1,1,7,0,23,0,353.0,26.304128,1.85848,1.15168,1,2,20.000000,20.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886,100660,201989,0,7,1,0,20,3,5,48,0,27,0,,,1.11449,1.81406,7,2,20.000000,20.000000,
2887,100660,201989,0,7,1,0,40,3,5,48,0,27,0,,,1.11449,1.81406,7,2,20.000000,20.000000,
2888,100660,201989,0,7,1,1,0,3,5,49,0,27,0,,,1.11449,1.81406,7,2,20.000000,20.000000,
2889,100261,200875,69900,7,1,1,20,3,0,49,0,27,0,166.0,24.387286,1.11449,1.81406,7,2,20.000000,20.000000,


In [124]:
final_test_df.columns

Index(['m_code', 'p_code', 'unit_price', 'MONTH', 'DAY', 'HOUR', 'MINUTE',
       'DAY_NUM', 'p_group_code', 'day_of_week_and_hour', 'rain', '52_week',
       'holiday', 'brand_name_code', 'brand_power', 'mask_ratio',
       'parcel_ratio', 'timeslot', 'season', 'fill_exp_min', 'div_exp_min',
       'total_price'],
      dtype='object')

In [125]:
df.columns

Index(['m_code', 'p_code', 'unit_price', 'MONTH', 'DAY', 'HOUR', 'MINUTE',
       'DAY_NUM', 'p_group_code', 'day_of_week_and_hour', 'rain', '52_week',
       'holiday', 'brand_name', 'brand_power', 'mask_ratio', 'parcel_ratio',
       'timeslot', 'season', 'div_exp_min', 'total_price'],
      dtype='object')

In [126]:
del final_test_df['fill_exp_min']

In [127]:
final_test_df.rename(columns = {'brand_name_code' : 'brand_name'}, inplace = True)

In [128]:
final_test_df=final_test_df[final_test_df['p_group_code']!=5] #상품군 무형 제거

In [129]:
f_test_x=final_test_df.iloc[:,:-1]
f_test_y=final_test_df.iloc[:,-1]

In [130]:
final_test_df['total_price']=mdl.predict(f_test_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [131]:
final_test_df

Unnamed: 0,m_code,p_code,unit_price,MONTH,DAY,HOUR,MINUTE,DAY_NUM,p_group_code,day_of_week_and_hour,rain,52_week,holiday,brand_name,brand_power,mask_ratio,parcel_ratio,timeslot,season,div_exp_min,total_price
0,100650,201971,59800,6,1,6,20,1,0,6,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,1.257134e+07
1,100650,201971,59800,6,1,6,40,1,0,6,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,1.813555e+07
2,100650,201971,59800,6,1,7,0,1,0,7,0,23,0,306.0,20.923161,1.85848,1.15168,1,2,20.000000,1.315290e+07
3,100445,202278,69900,6,1,7,20,1,1,7,0,23,0,353.0,26.304128,1.85848,1.15168,1,2,20.000000,2.658949e+07
4,100445,202278,69900,6,1,7,40,1,1,7,0,23,0,353.0,26.304128,1.85848,1.15168,1,2,20.000000,3.470132e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2883,100099,200273,99000,7,1,0,10,3,1,48,0,27,0,94.0,19.105643,1.11449,1.81406,7,2,3.333333,1.605370e+07
2884,100099,200272,119000,7,1,0,10,3,1,48,0,27,0,94.0,19.105643,1.11449,1.81406,7,2,3.333333,1.655083e+07
2885,100099,200274,119000,7,1,0,10,3,1,48,0,27,0,94.0,19.105643,1.11449,1.81406,7,2,3.333333,1.655083e+07
2889,100261,200875,69900,7,1,1,20,3,0,49,0,27,0,166.0,24.387286,1.11449,1.81406,7,2,20.000000,1.422581e+07


In [132]:
ev_df = pd.read_excel('./bc/bigcontest_record_eval.xlsx',header=1)
ev_df = ev_df[ev_df['상품군']!='무형']
ev_df = ev_df[['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가']]

In [133]:
final_test_df = final_test_df['total_price']

In [134]:
ev_df = pd.concat([ev_df, final_test_df], axis = 1)

In [135]:
ev_df.rename(columns = {'total_price' : '취급액'}, inplace = True)

In [136]:
ev_df.to_excel('./bc/20년6월예측.xlsx', index=False)