In [26]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', None)

from sklearn import *
from lightgbm import LGBMRegressor, LGBMClassifier

In [7]:
train = pd.read_pickle('train.pkl')
test = pd.read_pickle('test.pkl')
train.visit_date = pd.to_datetime(train.visit_date)
test.visit_date = pd.to_datetime(test.visit_date)

In [9]:
np.log1p(1)

0.6931471805599453

In [12]:
(train['visitors'] == np.log1p(1)).sum()

4811

In [18]:
for i in range(1, 11):
    train['le{}'.format(i)] = (train['visitors']<=np.log1p(i)).astype(int)

In [79]:
val_id = ((train.year == 2017) & (train.month >= 3))
trn_id = ~val_id
col = [c for c in train if c not in ['id', 'air_store_id', 'air_area_name', 'air_genre_name', 'visit_date', 'visitors'] + ['le{}'.format(i) for i in range(1, 11)]]
X_trn = train.loc[trn_id, col]
y_trn = train.loc[trn_id, 'le1'].values
X_val = train.loc[val_id, col]
y_val = train.loc[val_id, 'le1'].values
X_all = train[col]
y_all = train['le1'].values
X_tst = test[col]

In [80]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

from sklearn.metrics import accuracy_score

In [81]:
from copy import deepcopy

In [83]:
pretrain_models = {}

for i in range(1, 11):
    params = {
        'learning_rate': 0.1,
        'min_child_samples': 20, 
        'n_estimators': 200, 
        'num_leaves': 50, 
        'random_state': 77, 
    }
    lgb = LGBMClassifier(**params)
    
    X = train[col]
    y = train[f'le{i}'].values
    
    print(f'train {i}')
    lgb.fit(X, y)
    
    train[f'le{i}_p'] = lgb.predict_proba(X_all)[:, 1]
    test[f'le{i}_p'] = lgb.predict_proba(X_tst)[:, 1]
    
    pretrain_models[i] = deepcopy(lgb)

train 1
train 2
train 3
train 4
train 5
train 6
train 7
train 8
train 9
train 10


## 本チャン

In [93]:
val_id = ((train.year == 2017) & (train.month >= 3))
trn_id = ~val_id
col = [c for c in train if c not in ['id', 'air_store_id', 'air_area_name', 'air_genre_name', 'visit_date', 'visitors'] + ['le{}'.format(i) for i in range(1, 11)] + ['le{}_p'.format(i) for i in range(3, 11)]]
X_trn = train.loc[trn_id, col]
y_trn = train.loc[trn_id, 'visitors'].values
X_val = train.loc[val_id, col]
y_val = train.loc[val_id, 'visitors'].values
X_all = train[col]
y_all = train['visitors'].values
X_tst = test[col]

In [94]:
# params = {
#     'boosting_type': 'gbdt',
#     'num_leaves': 70,
#     'n_estimators': 500,
#     'learning_rate': 0.05,
#     'random_state': 77,
#     'max_depth': 6,
#     'min_child_samples': 50,
#     'colsample_bytree': 0.8,
#     'subsample': 0.8
# }
params = {'colsample_bytree': 0.8, 'learning_rate': 0.05, 
 'min_child_samples': 50, 'n_estimators': 600, 'num_leaves': 70, 'random_state': 77, 'subsample': 1.0}
lgb = LGBMRegressor(**params)

lgb.fit(X_trn, y_trn)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
       learning_rate=0.05, max_depth=-1, min_child_samples=50,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=600,
       n_jobs=-1, num_leaves=70, objective=None, random_state=77,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [95]:
print(f'train: {RMSLE(y_trn, lgb.predict(X_trn)):.4f}, val: {RMSLE(y_val, lgb.predict(X_val)):.4f}')
print('params = ', params)
print('cols = ', list(X_trn.columns))

train: 0.4347, val: 0.4521
params =  {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'min_child_samples': 50, 'n_estimators': 600, 'num_leaves': 70, 'random_state': 77, 'subsample': 1.0}
cols =  ['dow', 'year', 'month', 'holiday_flg', 'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors', 'count_observations', 'latitude', 'longitude', 'air_genre_name0', 'air_area_name0', 'air_genre_name1', 'air_area_name1', 'air_genre_name2', 'air_area_name2', 'air_genre_name3', 'air_area_name3', 'air_genre_name4', 'air_area_name4', 'air_genre_name5', 'air_area_name5', 'air_genre_name6', 'air_area_name6', 'air_genre_name7', 'air_area_name7', 'air_genre_name8', 'air_area_name8', 'air_genre_name9', 'air_area_name9', 'rs1_x', 'rv1_x', 'rs2_x', 'rv2_x', 'rs1_y', 'rv1_y', 'rs2_y', 'rv2_y', 'total_reserv_sum', 'total_reserv_mean', 'total_reserv_dt_diff_mean', 'date_int', 'var_max_lat', 'var_max_long', 'lon_plus_lat', 'air_store_id2', 'air_store_id_target', 'air_genre_name_target', 'air_area_nam

In [96]:
test['visitors'] = np.expm1(lgb.predict(X_tst))
filename = '../output/180201_lgb_prelabel2'
test[['id', 'visitors']].to_csv(filename+'.csv.gz', index=False, compression='gzip')
test[['id', 'visitors']].to_csv(filename+'.csv', index=False)