In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler


In [2]:
air_reserve = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/air_reserve.csv')
air_store_info = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/air_store_info.csv')
air_visit_data = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/air_visit_data.csv')
hpg_reserve = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/hpg_reserve.csv')
hpg_store_info = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/hpg_store_info.csv')
store_id_relation = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/store_id_relation.csv')
date_info = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/date_info.csv')
sample_submission = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/sample_submission.csv')

In [3]:
# preprocessing AIR visitors data
air_data = pd.merge(air_visit_data,air_store_info,how='left', on=['air_store_id']) # merging dataframes
date_info.rename(columns={'calendar_date':'visit_date'},inplace=True)  # renaming columns
air_data = pd.merge(air_data,date_info,how='left', on=['visit_date'])
air_data.sort_values(by='visit_date',ignore_index=True,inplace=True)
air_data['visit_date'] = pd.to_datetime(air_data['visit_date'])
air_data['day'] = air_data['visit_date'].dt.day
air_data['dow'] = air_data['visit_date'].dt.weekday
air_data['year'] = air_data['visit_date'].dt.year
air_data['month'] = air_data['visit_date'].dt.month
air_data['week'] = air_data['visit_date'].dt.week
air_data['quarter'] = air_data['visit_date'].dt.quarter
air_data['visit_date'] = air_data['visit_date'].dt.date

  # This is added back by InteractiveShellApp.init_path()


In [4]:
# preprocessing AIR reservation data
air_reserve_data = pd.merge(air_reserve,air_store_info,how='left', on=['air_store_id'])
air_reserve_data.head()
air_reserve_data['visit_datetime'] = pd.to_datetime(air_reserve_data['visit_datetime'])
air_reserve_data['visit_hour'] = air_reserve_data['visit_datetime'].dt.hour
air_reserve_data['visit_date'] = air_reserve_data['visit_datetime'].dt.date
air_reserve_data['reserve_datetime'] = pd.to_datetime(air_reserve_data['reserve_datetime'])
air_reserve_data['reserve_hour'] = air_reserve_data['reserve_datetime'].dt.hour
air_reserve_data['reserve_date'] = air_reserve_data['reserve_datetime'].dt.date
#calculate reservation time difference 
air_reserve_data['res_vis_diff'] = air_reserve_data.apply(
        lambda d: (d['visit_date'] - d['reserve_date']).days, axis=1)
air_reserve_data.rename(columns={'reserve_visitors':'air_reserve_visitors'},inplace=True)

In [5]:
# preprocessing HPG reservation data
hpg_reserve_data = pd.merge(hpg_reserve,store_id_relation,on=['hpg_store_id'],how='inner')
hpg_reserve_data = pd.merge(hpg_reserve_data,hpg_store_info,on=['hpg_store_id'],how='left')
hpg_reserve_data['visit_datetime'] = pd.to_datetime(hpg_reserve_data['visit_datetime'])
hpg_reserve_data['visit_hour'] = hpg_reserve_data['visit_datetime'].dt.hour
hpg_reserve_data['visit_date'] = hpg_reserve_data['visit_datetime'].dt.date
hpg_reserve_data['reserve_datetime'] = pd.to_datetime(hpg_reserve_data['reserve_datetime'])
hpg_reserve_data['reserve_hour'] = hpg_reserve_data['reserve_datetime'].dt.hour
hpg_reserve_data['reserve_date'] = hpg_reserve_data['reserve_datetime'].dt.date
#calculate reserve time difference 
hpg_reserve_data['res_vis_diff'] = hpg_reserve_data.apply(
        lambda r: (r['visit_date'] - r['reserve_date']).days, axis=1)
hpg_reserve_data.rename(columns={'reserve_visitors':'hpg_reserve_visitors'},inplace=True)

In [6]:
air_data

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,day_of_week,holiday_flg,day,dow,year,month,week,quarter
0,air_fab092c35776a9b1,2016-01-01,19,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Momochi,33.581941,130.348436,Friday,1,1,4,2016,1,53,1
1,air_f26f36ec4dc5adb0,2016-01-01,64,Izakaya,Tōkyō-to Shinjuku-ku Kabukichō,35.693840,139.703549,Friday,1,1,4,2016,1,53,1
2,air_d97dabf7aae60da5,2016-01-01,102,Cafe/Sweets,Tōkyō-to Shibuya-ku Jingūmae,35.669290,139.707056,Friday,1,1,4,2016,1,53,1
3,air_39dccf7df20b1c6a,2016-01-01,55,Izakaya,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,Friday,1,1,4,2016,1,53,1
4,air_79f528087f49df06,2016-01-01,42,Western food,Tōkyō-to Suginami-ku Asagayaminami,35.699566,139.636438,Friday,1,1,4,2016,1,53,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252103,air_6a15e4eae523189d,2017-04-22,12,Bar/Cocktail,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,Saturday,0,22,5,2017,4,16,2
252104,air_bf13014b6e3e60ca,2017-04-22,49,Cafe/Sweets,Tōkyō-to Setagaya-ku Setagaya,35.646572,139.653247,Saturday,0,22,5,2017,4,16,2
252105,air_c52c63c781fe48f6,2017-04-22,41,Cafe/Sweets,Fukuoka-ken Itoshima-shi Maebarunishi,33.557320,130.195555,Saturday,0,22,5,2017,4,16,2
252106,air_59cc9b2b209c6331,2017-04-22,12,Cafe/Sweets,Tōkyō-to Setagaya-ku Setagaya,35.646572,139.653247,Saturday,0,22,5,2017,4,16,2


In [7]:
# 요일 별 방문자 수 컬럼 생성
dow_avg_visitor = air_data.groupby(['dow'])['visitors'].mean()
dow_avg_visitor = pd.DataFrame(dow_avg_visitor)
dow_avg_visitor.rename(columns={'visitors':'dow_avg_visitor'},inplace=True)

air_data = pd.merge(air_data,dow_avg_visitor,how='left', on=['dow']) 
print(air_data.shape)

(252108, 16)


In [8]:
# 월 별 방문자 수 컬럼 생성
month_avg_visitor = air_data.groupby(['month'])['visitors'].mean()
month_avg_visitor = pd.DataFrame(month_avg_visitor)
month_avg_visitor.rename(columns={'visitors':'month_avg_visitor'},inplace=True)

air_data = pd.merge(air_data,month_avg_visitor,how='left', on=['month']) 
print(air_data.shape)

(252108, 17)


In [9]:
# 날짜 별 방문자 수 컬럼 생성
day_avg_visitor = air_data.groupby(['day'])['visitors'].mean()
day_avg_visitor = pd.DataFrame(day_avg_visitor)
day_avg_visitor.rename(columns={'visitors':'day_avg_visitor'},inplace=True)

air_data = pd.merge(air_data,day_avg_visitor,how='left', on=['day']) 
print(air_data.shape)

(252108, 18)


In [10]:
# 식당 별 평균 방문자 수 컬럼 생성
store_avg_visitors = air_data.groupby(['air_store_id'])['visitors'].mean()
store_avg_visitors = pd.DataFrame(store_avg_visitors)
store_avg_visitors.rename(columns={'visitors':'store_avg_visitors'},inplace=True)

air_data = pd.merge(air_data,store_avg_visitors,how='left', on=['air_store_id']) 
print(air_data.shape)

(252108, 19)


In [11]:
# 지역을 따로 구분하는 area 컬럼 생성
air_data['area'] = air_data['air_area_name'].map(lambda x: x.split()[0])
print(air_data.shape)
air_data['area'].value_counts()

(252108, 20)


Tōkyō-to         133063
Fukuoka-ken       39645
Ōsaka-fu          22821
Hyōgo-ken         17846
Hokkaidō          13055
Hiroshima-ken      9858
Miyagi-ken         5959
Shizuoka-ken       5798
Niigata-ken        4063
Name: area, dtype: int64

In [12]:
air_data

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,day_of_week,holiday_flg,day,dow,year,month,week,quarter,dow_avg_visitor,month_avg_visitor,day_avg_visitor,store_avg_visitors,area
0,air_fab092c35776a9b1,2016-01-01,19,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Momochi,33.581941,130.348436,Friday,1,1,4,2016,1,53,1,23.072737,19.976758,19.981864,11.429825,Fukuoka-ken
1,air_f26f36ec4dc5adb0,2016-01-01,64,Izakaya,Tōkyō-to Shinjuku-ku Kabukichō,35.693840,139.703549,Friday,1,1,4,2016,1,53,1,23.072737,19.976758,19.981864,39.950538,Tōkyō-to
2,air_d97dabf7aae60da5,2016-01-01,102,Cafe/Sweets,Tōkyō-to Shibuya-ku Jingūmae,35.669290,139.707056,Friday,1,1,4,2016,1,53,1,23.072737,19.976758,19.981864,37.754202,Tōkyō-to
3,air_39dccf7df20b1c6a,2016-01-01,55,Izakaya,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,Friday,1,1,4,2016,1,53,1,23.072737,19.976758,19.981864,24.645435,Hyōgo-ken
4,air_79f528087f49df06,2016-01-01,42,Western food,Tōkyō-to Suginami-ku Asagayaminami,35.699566,139.636438,Friday,1,1,4,2016,1,53,1,23.072737,19.976758,19.981864,30.053269,Tōkyō-to
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252103,air_6a15e4eae523189d,2017-04-22,12,Bar/Cocktail,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,Saturday,0,22,5,2017,4,16,2,26.313688,21.816002,22.125014,19.320513,Tōkyō-to
252104,air_bf13014b6e3e60ca,2017-04-22,49,Cafe/Sweets,Tōkyō-to Setagaya-ku Setagaya,35.646572,139.653247,Saturday,0,22,5,2017,4,16,2,26.313688,21.816002,22.125014,33.034188,Tōkyō-to
252105,air_c52c63c781fe48f6,2017-04-22,41,Cafe/Sweets,Fukuoka-ken Itoshima-shi Maebarunishi,33.557320,130.195555,Saturday,0,22,5,2017,4,16,2,26.313688,21.816002,22.125014,29.221774,Fukuoka-ken
252106,air_59cc9b2b209c6331,2017-04-22,12,Cafe/Sweets,Tōkyō-to Setagaya-ku Setagaya,35.646572,139.653247,Saturday,0,22,5,2017,4,16,2,26.313688,21.816002,22.125014,14.867089,Tōkyō-to


## 필요한 columns 으로 Train dataset 생성

In [13]:
train = air_data.copy()
train = train[['air_store_id','visit_date','visitors','air_genre_name','holiday_flg','dow_avg_visitor','month_avg_visitor','day_avg_visitor','store_avg_visitors','area']]
train

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,holiday_flg,dow_avg_visitor,month_avg_visitor,day_avg_visitor,store_avg_visitors,area
0,air_fab092c35776a9b1,2016-01-01,19,Cafe/Sweets,1,23.072737,19.976758,19.981864,11.429825,Fukuoka-ken
1,air_f26f36ec4dc5adb0,2016-01-01,64,Izakaya,1,23.072737,19.976758,19.981864,39.950538,Tōkyō-to
2,air_d97dabf7aae60da5,2016-01-01,102,Cafe/Sweets,1,23.072737,19.976758,19.981864,37.754202,Tōkyō-to
3,air_39dccf7df20b1c6a,2016-01-01,55,Izakaya,1,23.072737,19.976758,19.981864,24.645435,Hyōgo-ken
4,air_79f528087f49df06,2016-01-01,42,Western food,1,23.072737,19.976758,19.981864,30.053269,Tōkyō-to
...,...,...,...,...,...,...,...,...,...,...
252103,air_6a15e4eae523189d,2017-04-22,12,Bar/Cocktail,0,26.313688,21.816002,22.125014,19.320513,Tōkyō-to
252104,air_bf13014b6e3e60ca,2017-04-22,49,Cafe/Sweets,0,26.313688,21.816002,22.125014,33.034188,Tōkyō-to
252105,air_c52c63c781fe48f6,2017-04-22,41,Cafe/Sweets,0,26.313688,21.816002,22.125014,29.221774,Fukuoka-ken
252106,air_59cc9b2b209c6331,2017-04-22,12,Cafe/Sweets,0,26.313688,21.816002,22.125014,14.867089,Tōkyō-to


## 이상치 처리

In [14]:
outlier = air_data[air_data['store_avg_visitors'] > 80]
outlier

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,day_of_week,holiday_flg,day,dow,year,month,week,quarter,dow_avg_visitor,month_avg_visitor,day_avg_visitor,store_avg_visitors,area
214942,air_900d755ebd2f7bbd,2017-03-02,4,Italian/French,Tōkyō-to Chūō-ku Ginza,35.672114,139.770825,Thursday,0,2,3,2017,3,9,1,18.922702,22.400360,21.107756,82.200000,Tōkyō-to
215211,air_1c0b150f9e696a5f,2017-03-02,87,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Thursday,0,2,3,2017,3,9,1,18.922702,22.400360,21.107756,115.470588,Tōkyō-to
216036,air_1c0b150f9e696a5f,2017-03-03,96,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Friday,0,3,4,2017,3,9,1,23.072737,22.400360,21.788706,115.470588,Tōkyō-to
216556,air_1c0b150f9e696a5f,2017-03-04,114,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Saturday,0,4,5,2017,3,9,1,26.313688,22.400360,21.057714,115.470588,Tōkyō-to
217308,air_1c0b150f9e696a5f,2017-03-05,134,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Sunday,0,5,6,2017,3,9,1,23.873362,22.400360,20.076961,115.470588,Tōkyō-to
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250508,air_1c0b150f9e696a5f,2017-04-20,86,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Thursday,0,20,3,2017,4,16,2,18.922702,21.816002,19.978030,115.470588,Tōkyō-to
250681,air_900d755ebd2f7bbd,2017-04-21,114,Italian/French,Tōkyō-to Chūō-ku Ginza,35.672114,139.770825,Friday,0,21,4,2017,4,16,2,23.072737,21.816002,21.332212,82.200000,Tōkyō-to
251071,air_1c0b150f9e696a5f,2017-04-21,89,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Friday,0,21,4,2017,4,16,2,23.072737,21.816002,21.332212,115.470588,Tōkyō-to
251459,air_900d755ebd2f7bbd,2017-04-22,83,Italian/French,Tōkyō-to Chūō-ku Ginza,35.672114,139.770825,Saturday,0,22,5,2017,4,16,2,26.313688,21.816002,22.125014,82.200000,Tōkyō-to


In [15]:
air_data[air_data['air_store_id'] == 'air_1c0b150f9e696a5f']

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,day_of_week,holiday_flg,day,dow,year,month,week,quarter,dow_avg_visitor,month_avg_visitor,day_avg_visitor,store_avg_visitors,area
215211,air_1c0b150f9e696a5f,2017-03-02,87,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Thursday,0,2,3,2017,3,9,1,18.922702,22.40036,21.107756,115.470588,Tōkyō-to
216036,air_1c0b150f9e696a5f,2017-03-03,96,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Friday,0,3,4,2017,3,9,1,23.072737,22.40036,21.788706,115.470588,Tōkyō-to
216556,air_1c0b150f9e696a5f,2017-03-04,114,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Saturday,0,4,5,2017,3,9,1,26.313688,22.40036,21.057714,115.470588,Tōkyō-to
217308,air_1c0b150f9e696a5f,2017-03-05,134,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Sunday,0,5,6,2017,3,9,1,23.873362,22.40036,20.076961,115.470588,Tōkyō-to
218110,air_1c0b150f9e696a5f,2017-03-06,101,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Monday,0,6,0,2017,3,10,1,17.177009,22.40036,18.749206,115.470588,Tōkyō-to
218559,air_1c0b150f9e696a5f,2017-03-07,86,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Tuesday,0,7,1,2017,3,10,1,17.672137,22.40036,19.483672,115.470588,Tōkyō-to
219604,air_1c0b150f9e696a5f,2017-03-08,102,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Wednesday,0,8,2,2017,3,10,1,19.230121,22.40036,20.283643,115.470588,Tōkyō-to
220244,air_1c0b150f9e696a5f,2017-03-09,118,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Thursday,0,9,3,2017,3,10,1,18.922702,22.40036,21.046287,115.470588,Tōkyō-to
220717,air_1c0b150f9e696a5f,2017-03-10,105,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Friday,0,10,4,2017,3,10,1,23.072737,22.40036,22.211093,115.470588,Tōkyō-to
222026,air_1c0b150f9e696a5f,2017-03-11,139,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,Saturday,0,11,5,2017,3,10,1,26.313688,22.40036,21.406384,115.470588,Tōkyō-to


## Sample_submission 파일 이용하여 Test dataset 생성

In [16]:
sample_submission = pd.read_csv('../input/recruit-restaurant-visitor-forecasting-data/sample_submission.csv')

test = sample_submission.copy()
test['air_store_id'] = test['id'].apply(lambda x: str(x)[:-11])
test['visit_date'] = test['id'].apply(lambda x: str(x)[-10:])
test = test.drop(['id', 'visitors'], axis = 1)

test = pd.merge(test,air_store_info,how='left', on=['air_store_id'])
#date_info.rename(columns={'calendar_date':'visit_date'},inplace=True)
test = pd.merge(test,date_info,how='left', on=['visit_date'])
test = test.drop(['day_of_week'], axis = 1)     # 'daw' column 과 겹치므로 삭제 


test['visit_date'] = pd.to_datetime(test['visit_date'])
test['day'] = test['visit_date'].dt.day
test['dow'] = test['visit_date'].dt.weekday
test['year'] = test['visit_date'].dt.year
test['month'] = test['visit_date'].dt.month
test['week'] = test['visit_date'].dt.week
test['quarter'] = test['visit_date'].dt.quarter
test['visit_date'] = test['visit_date'].dt.date



In [17]:
test = pd.merge(test,dow_avg_visitor,how='left', on=['dow'])     # 요일 별 방문자 수 컬럼 생성
test = pd.merge(test,month_avg_visitor,how='left', on=['month'])   # 월 별 방문자 수 컬럼 생성
test = pd.merge(test,day_avg_visitor,how='left', on=['day'])    # 날짜 별 방문자 수 컬럼 생성
test = pd.merge(test,store_avg_visitors,how='left', on=['air_store_id'])     # 식당 별 평균 방문자 수 컬럼 생성
test['area'] = test['air_area_name'].map(lambda x: x.split()[0])    # 지역을 따로 구분하는 area 컬럼 생성
print(test.shape)
test['area'].value_counts()

(32019, 18)


Tōkyō-to         17160
Fukuoka-ken       4875
Ōsaka-fu          2886
Hyōgo-ken         2223
Hokkaidō          1716
Hiroshima-ken     1248
Shizuoka-ken       702
Miyagi-ken         663
Niigata-ken        546
Name: area, dtype: int64

In [18]:
# 필요한 컬럼만 사용
test = test[['air_store_id','visit_date','air_genre_name','holiday_flg','dow_avg_visitor','month_avg_visitor','day_avg_visitor','store_avg_visitors','area']]
test

Unnamed: 0,air_store_id,visit_date,air_genre_name,holiday_flg,dow_avg_visitor,month_avg_visitor,day_avg_visitor,store_avg_visitors,area
0,air_00a91d42b08b08d9,2017-04-23,Italian/French,0,23.873362,21.816002,21.007387,26.081897,Tōkyō-to
1,air_00a91d42b08b08d9,2017-04-24,Italian/French,0,17.177009,21.816002,21.214331,26.081897,Tōkyō-to
2,air_00a91d42b08b08d9,2017-04-25,Italian/French,0,17.672137,21.816002,21.996978,26.081897,Tōkyō-to
3,air_00a91d42b08b08d9,2017-04-26,Italian/French,0,19.230121,21.816002,21.770895,26.081897,Tōkyō-to
4,air_00a91d42b08b08d9,2017-04-27,Italian/French,0,18.922702,21.816002,20.820884,26.081897,Tōkyō-to
...,...,...,...,...,...,...,...,...,...
32014,air_fff68b929994bfbd,2017-05-27,Bar/Cocktail,0,26.313688,22.009313,20.820884,5.089219,Tōkyō-to
32015,air_fff68b929994bfbd,2017-05-28,Bar/Cocktail,0,23.873362,22.009313,21.753372,5.089219,Tōkyō-to
32016,air_fff68b929994bfbd,2017-05-29,Bar/Cocktail,0,17.177009,22.009313,21.995716,5.089219,Tōkyō-to
32017,air_fff68b929994bfbd,2017-05-30,Bar/Cocktail,0,17.672137,22.009313,21.864183,5.089219,Tōkyō-to


## One-Hot Encoding

In [19]:
encoded_train = pd.get_dummies(data = train, columns =['air_genre_name','area'])
print(encoded_train.shape)

(252108, 31)


In [20]:
encoded_test = pd.get_dummies(data = test, columns =['air_genre_name','area'])
print(encoded_test.shape)

(32019, 30)


In [21]:
target_train = encoded_train['visitors']   # 목표 변수

encoded_train = encoded_train.drop(['visitors', 'air_store_id', 'visit_date'], axis=1)
encoded_test = encoded_test.drop(['air_store_id', 'visit_date'], axis=1)
print(encoded_train.shape)
print(encoded_test.shape)

(252108, 28)
(32019, 28)


## Training Set, Validation Set 분리
주어진 Training set 에서 다시 training set 과 validation set 을 분리한다.

In [22]:
X_train, X_val, y_train, y_val = train_test_split(encoded_train,
                                                  target_train, 
                                                  test_size = 0.2, 
                                                  random_state = 42)
print(X_train.shape)
print(X_val.shape)

(201686, 28)
(50422, 28)


----
# Modeling
## 1. 선형 회귀 LinearRegression

In [23]:
model = LinearRegression()

model.fit(X_train, y_train)

LinearRegression()

In [24]:
def rmsle(y_true,y_pred): 

    return np.sqrt(np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean())

score = make_scorer(rmsle, greater_is_better=False)

In [25]:
y_val_pred = model.predict(X_val)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.6267877719399059


  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
final_visitors  = model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_linearReg.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_linearReg.csv 파일 저장 완료'.format(val_rmsle))

----

## 2. KNeighbors Regression

In [26]:
# hyperparameters
parameters = {'n_neighbors': [5, 10, 15]}

# model
knn = KNeighborsRegressor(n_jobs=-1)

# cross validation
knn_cv = GridSearchCV(estimator=knn,
                      param_grid=parameters,
                      scoring=score,
                      n_jobs=-1,
                      cv=3,
                      verbose=10,
                      return_train_score=True)
knn_cv.fit(X_train,y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  3.4min remaining:   57.7s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  4.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  4.0min finished


GridSearchCV(cv=3, estimator=KNeighborsRegressor(n_jobs=-1), n_jobs=-1,
             param_grid={'n_neighbors': [5, 10, 15]}, return_train_score=True,
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=10)

In [27]:
knn_cv.best_params_

{'n_neighbors': 15}

In [28]:
knn_model = KNeighborsRegressor(n_neighbors=15,n_jobs=-1)
knn_model.fit(X_train, y_train)

KNeighborsRegressor(n_jobs=-1, n_neighbors=15)

In [29]:
y_val_pred = knn_model.predict(X_val)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.577570981599654


In [None]:
final_visitors  = knn_model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_knn.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_knn.csv 파일 저장 완료'.format(val_rmsle))

----

## 3. SGD Regression

In [30]:
std = StandardScaler() 
X_train_std = std.fit_transform(X_train)
X_val_std = std.fit_transform(X_val)

In [31]:
sgd_reg = SGDRegressor()
# parameters
parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.1,1]}

# cross validation
sgd_reg_cv = GridSearchCV(estimator=sgd_reg,
                      param_grid=parameters,
                      scoring=score,
                      n_jobs=-1,
                      cv=5,
                      verbose=10,
                      return_train_score=True)
sgd_reg_cv.fit(X_train_std,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   33.4s finished


GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
             return_train_score=True,
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=10)

In [32]:
sgd_reg_cv.best_params_

{'alpha': 0.1}

In [33]:
sgd_model = SGDRegressor(alpha=0.1)
sgd_model.fit(X_train_std, y_train)

SGDRegressor(alpha=0.1)

In [34]:
y_val_pred = sgd_model.predict(X_val_std)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.6261941684893819


  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
final_visitors  = sgd_model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_sgd.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_sgd.csv 파일 저장 완료'.format(val_rmsle))

----
## 4. DecisionTree

In [35]:
dt = DecisionTreeRegressor()
# hyperparameters
parameters = {'max_depth': [1, 5, 10, 50],
              'min_samples_split' : [10, 100, 500]}
# cross validation
dt_cv = GridSearchCV(estimator=dt,
                      param_grid=parameters,
                      scoring=score,
                      n_jobs=-1,
                      cv=3,
                      verbose=10,
                      return_train_score=True)
dt_cv.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   14.2s finished


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 5, 10, 50],
                         'min_samples_split': [10, 100, 500]},
             return_train_score=True,
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=10)

In [36]:
dt_cv.best_params_

{'max_depth': 50, 'min_samples_split': 100}

In [37]:
dt_model = DecisionTreeRegressor(max_depth = 50, min_samples_split = 100)
dt_model.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=50, min_samples_split=100)

In [38]:
y_val_pred = dt_model.predict(X_val)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.567361109167675


In [None]:
final_visitors  = dt_model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_dt.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_dt.csv 파일 저장 완료'.format(val_rmsle))

----
## 5. RandomForest

In [39]:
rf = RandomForestRegressor()
# hyperparameters
parameters = {'max_depth': [1, 5, 10],
              'n_estimators' : [10, 50, 100]}
# cross validation
rf_cv = GridSearchCV(estimator=rf,
                     param_grid=parameters,
                     cv=3,
                     scoring=score,
                     return_train_score=True,
                     n_jobs=1)
rf_cv.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=1,
             param_grid={'max_depth': [1, 5, 10],
                         'n_estimators': [10, 50, 100]},
             return_train_score=True,
             scoring=make_scorer(rmsle, greater_is_better=False))

In [40]:
rf_cv.best_params_

{'max_depth': 10, 'n_estimators': 50}

In [41]:
rf_model = RandomForestRegressor(max_depth = 10, n_estimators = 100, n_jobs=1)
rf_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, n_jobs=1)

In [42]:
y_val_pred = rf_model.predict(X_val)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.5815534682792002


In [None]:
final_visitors  = rf_model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_rf.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_rf.csv 파일 저장 완료'.format(val_rmsle))

----


## 6. XGBoost

In [43]:
# hyperparameters
parameters = {'learning_rate':[0.1,0.01],
              'min_child_weight':[0.8,0.9,1],
              'max_depth': [2,4,8]}

# XGBoost optimized matrix
train_matrix = xgb.DMatrix(data=X_train,label=y_train)
val_matrix = xgb.DMatrix(data=X_val,label=y_val)

# model
xgb_reg = xgb.XGBRegressor(tree_method='gpu_hist')

# cross validation
xgb_reg_cv = GridSearchCV(estimator=xgb_reg,
                     param_grid=parameters,
                     cv=3,
                     return_train_score=True,
                     n_jobs=-1,
                     scoring=score)
xgb_reg_cv.fit(X=X_train,y=y_train)

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method='gpu_hist',
        

In [44]:
xgb_reg_cv.best_params_

{'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 0.8}

In [45]:
xgb_model = xgb.XGBRegressor(learning_rate= 0.1, max_depth= 8, min_child_weight= 0.8)
xgb_model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=8,
             min_child_weight=0.8, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=2, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [46]:
y_val_pred = xgb_model.predict(X_val)

val_rmsle = rmsle(y_val, y_val_pred)

print(val_rmsle)

0.5711478811796952


In [None]:
final_visitors  = xgb_model.predict(encoded_test)

final = sample_submission.copy()
final['visitors'] = final_visitors

final.to_csv('submission_{:.3f}_xgb.csv'.format(val_rmsle), index=False)
print('submission_{:.3f}_xgb.csv 파일 저장 완료'.format(val_rmsle))