In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import  matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

from random import sample
seed_list = list(range(10000))

import warnings
warnings.simplefilter('ignore')
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data\\raw')

TODAY = str(datetime.now().year)+str(datetime.now().month)+str(datetime.now().day)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv("test.csv")

os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data')
sub = pd.read_csv('submission_sample.csv')

os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\code\\experiment')
experiment_db = pd.read_csv('experiment_DB.csv')

In [3]:
# column 명 재지정
train.columns  = ['id', 'date', 'bus_route_id', 'in_out', 'station_code', 'station_name',
 'latitude', 'longitude', 'ride_6_7', 'ride_7_8', 'ride_8_9',
 'ride_9_10', 'ride_10_11', 'ride_11_12', 'takeoff_6_7', 'takeoff_7_8',
 'takeoff_8_9', 'takeoff_9_10', 'takeoff_10_11', 'takeoff_11_12',
 'ride_18_20']

test.columns = ['id', 'date', 'bus_route_id', 'in_out', 'station_code', 'station_name',
 'latitude', 'longitude', 'ride_6_7', 'ride_7_8', 'ride_8_9',
 'ride_9_10', 'ride_10_11', 'ride_11_12', 'takeoff_6_7', 'takeoff_7_8',
 'takeoff_8_9', 'takeoff_9_10', 'takeoff_10_11', 'takeoff_11_12'] 

In [4]:

train['date'] = pd.to_datetime(train['date'])
train['weekday'] = train['date'].dt.weekday
train = pd.get_dummies(train,columns=['weekday'])
train['weekday_var'] = train['date'].dt.weekday

test['date'] = pd.to_datetime(test['date'])
test['weekday'] = test['date'].dt.weekday
test = pd.get_dummies(test,columns=['weekday'])
test['weekday_var'] = test['date'].dt.weekday

In [5]:
train['in_out'] = train['in_out'].map({'시내':0,'시외':1})
test['in_out'] = test['in_out'].map({'시내':0,'시외':1})

In [6]:
import geopy.distance

coords_jejusi = (33.500770, 126.522761) #제주시의 위도 경도
coords_seoquipo = (33.259429, 126.558217) #서귀포시의 위도 경도


train['dis_jejusi'] = [geopy.distance.vincenty((train['latitude'].iloc[i],train['longitude'].iloc[i]), coords_jejusi).km for i in range(len(train))]
train['dis_seoquipo'] = [geopy.distance.vincenty((train['latitude'].iloc[i],train['longitude'].iloc[i]), coords_seoquipo).km for i in range(len(train))]

test['dis_jejusi'] = [geopy.distance.vincenty((test['latitude'].iloc[i],test['longitude'].iloc[i]), coords_jejusi).km for i in range(len(test))]
test['dis_seoquipo'] = [geopy.distance.vincenty((test['latitude'].iloc[i],test['longitude'].iloc[i]), coords_seoquipo).km for i in range(len(test))]

In [7]:
train['ride_6_12'] = train[['ride_6_7','ride_7_8','ride_8_9','ride_9_10','ride_10_11','ride_11_12']].sum(axis=1)
test['ride_6_12'] = test[['ride_6_7','ride_7_8','ride_8_9','ride_9_10','ride_10_11','ride_11_12']].sum(axis=1)

train['takeoff_6_12'] = train[['takeoff_6_7','takeoff_7_8','takeoff_8_9','takeoff_9_10','takeoff_10_11','takeoff_11_12']].sum(axis=1)
test['takeoff_6_12'] = test[['takeoff_6_7','takeoff_7_8','takeoff_8_9','takeoff_9_10','takeoff_10_11','takeoff_11_12']].sum(axis=1)

In [8]:
added = []
input_var=['in_out','latitude','longitude','ride_6_7', 'ride_7_8', 'ride_8_9', 
           'ride_9_10','ride_10_11', 'ride_11_12','ride_6_12',
           'takeoff_6_7', 'takeoff_7_8', 'takeoff_8_9','takeoff_9_10', 
           'takeoff_10_11', 'takeoff_11_12','takeoff_6_12',
           'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
           'weekday_5', 'weekday_6', 
           'dis_jejusi', 'dis_seoquipo']
target=['ride_18_20']

In [9]:
holi = ['2019-09-12','2019-09-13','2019-09-14', '2019-10-03','2019-10-09']
wkend = ['2019-09-01','2019-09-07','2019-09-08','2019-09-14','2019-09-15',
         '2019-09-21','2019-09-22','2019-09-28','2019-09-29',
        '2019-10-05','2019-10-06','2019-10-12','2019-10-13']
workday = sorted(list(set(pd.concat([train.date,test.date],axis=0).astype('str').unique()) - set(holi+wkend)))

train['day_type'] =  np.where(train.date.isin(holi),1, 
                            np.where(train.date.isin(wkend),2,3))
test['day_type'] =  np.where(test.date.isin(holi),1, 
                            np.where(test.date.isin(wkend),2,3))



train = pd.get_dummies(train,columns=['day_type'])
test = pd.get_dummies(test,columns=['day_type'])

added += [a for a in train.columns if 'day_type' in a]

In [10]:
all = pd.concat([train,test],axis=0).reset_index()

for aaa in ['bus_route_id','station_code','station_name']:
    temp = all.groupby(aaa)['id'].count().reset_index().\
    rename(columns = {'id' : str(aaa) + '_freq'})

    train = pd.merge(train,temp,how='left',on=aaa)
    test = pd.merge(test,temp,how='left',on=aaa)

    
del all
added += [a for a in train.columns if 'freq' in a]

In [11]:
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data')

# 2015년 9월, 10월 의 제주도 좌표별 유동인구 공공데이터 활용
move_18_20 = pd.read_pickle('move_18_20.pkl')
all = pd.concat([train,test],axis=0).reset_index(drop=True)

remove_outlier =\
move_18_20[~((move_18_20.x < 126.46) | ((move_18_20['x'] > 126.56) & (move_18_20['y'] > 33.5)))].reset_index(drop=True)

logic1 = ((all.latitude < 33.5) & (all.latitude > 33.47)) & ((all.longitude > 126.47) & (all.longitude < 126.50))
logic2 = ((all.latitude < 33.53) & (all.latitude > 33.48)) & ((all.longitude > 126.51) & (all.longitude < 126.54))
logic3 = ((all.latitude < 33.26) & (all.latitude > 33.24)) & ((all.longitude > 126.55) & (all.longitude < 126.57))

all['high_move'] = np.where(logic1,'1',np.where(logic2,'2',np.where(logic3,'3','0')))
all = pd.get_dummies(all,columns=['high_move'])

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if 'high_move' in a]

In [12]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

logic1 = ((remove_outlier.y < 33.5) & (remove_outlier.y > 33.47)) & ((remove_outlier.x > 126.47) & (remove_outlier.x < 126.50))
logic2 = ((remove_outlier.y < 33.53) & (remove_outlier.y > 33.48)) & ((remove_outlier.x > 126.51) & (remove_outlier.x < 126.54))
logic3 = ((remove_outlier.y < 33.26) & (remove_outlier.y > 33.24)) & ((remove_outlier.x > 126.55) & (remove_outlier.x < 126.57))

which1 = (remove_outlier[logic1].y.mean(), remove_outlier[logic1].x.mean())
which2 = (remove_outlier[logic2].y.mean(), remove_outlier[logic2].x.mean())
which3 = (remove_outlier[logic3].y.mean(), remove_outlier[logic3].x.mean())

import geopy.distance
all['dis_1'] = [geopy.distance.vincenty((all['latitude'].iloc[i],all['longitude'].iloc[i]), which1).km for i in range(len(all))]
all['dis_2'] = [geopy.distance.vincenty((all['latitude'].iloc[i],all['longitude'].iloc[i]), which2).km for i in range(len(all))] 
all['dis_3']  = [geopy.distance.vincenty((all['latitude'].iloc[i],all['longitude'].iloc[i]), which3).km for i in range(len(all))]

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += ['dis_1','dis_2','dis_3']

In [13]:
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data')
sep = pd.read_pickle('sep_move.pkl')
octo = pd.read_pickle('octo_move.pkl')

total = pd.concat([sep,octo],axis=0)
total[['x','y']] = round(total[['x','y']],2)
temp = total.groupby(['x','y'])['move_18_20'].sum().reset_index().\
rename(columns = {'x' : 'longitude','y':'latitude'})

all = pd.concat([train,test],axis=0).reset_index(drop=True)
all[['latitude','longitude']] = round(all[['latitude','longitude']],2)

all = pd.merge(all,temp,how='left',on=['latitude','longitude'])
all['move_18_20'] = all['move_18_20'].fillna(0).astype('int')

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += ['move_18_20']

In [14]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

for col in ['station_code','station_name','bus_route_id']:
    temp = all.groupby([col])['ride_6_12'].agg(['mean','max','min','count']).reset_index().\
        rename(columns = {'mean' : col+'_'+'ride_6_12'+'_'+'mean_morning',
                         'max' : col+'_'+'ride_6_12'+'_'+'max_morning',
                         'min' : col+'_'+'ride_6_12'+'_'+'min_morning',
                         'count' : col+'_'+'ride_6_12'+'_'+'count_morning'})
    all = pd.merge(all,temp,how='left',on=col)

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

In [15]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

for col in ['station_code','station_name','bus_route_id']:
    temp = all.groupby([col])['takeoff_6_12'].agg(['mean','max','min','count']).reset_index().\
        rename(columns = {'mean' : col+'_'+'takeoff_6_12'+'_'+'mean_morning',
                         'max' : col+'_'+'takeoff_6_12'+'_'+'max_morning',
                         'min' : col+'_'+'takeoff_6_12'+'_'+'min_morning',
                         'count' : col+'_'+'takeoff_6_12'+'_'+'count_morning'})
    all = pd.merge(all,temp,how='left',on=col)

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if '_morning' in a]

In [16]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

all['in_out_bus_route_id'] = all['bus_route_id'].astype('str') + all['in_out'].astype('str')

temp = all.groupby('in_out_bus_route_id')['id'].count().to_dict()

all['inout_bus_route_id_freq'] = all['in_out_bus_route_id'].map(temp)

temp = all.groupby('in_out_bus_route_id')['ride_6_12'].agg(['mean','min','max','sum']).rename(
columns = {
    'mean' : 'inout_bus_route_id'+ '_' + 'ride_6_12' +'_'+'mean',
    'min' : 'inout_bus_route_id'+ '_' + 'ride_6_12' +'_'+'min',
    'max' : 'inout_bus_route_id'+ '_' + 'ride_6_12' +'_'+'max',
    'sum' : 'inout_bus_route_id'+ '_' + 'ride_6_12' +'_'+'sum'
})

all = pd.merge(all,temp,how='left',on='in_out_bus_route_id')

temp = all.groupby('in_out_bus_route_id')['takeoff_6_12'].agg(['mean','min','max','sum']).rename(
columns = {
    'mean' : 'inout_bus_route_id'+ '_' + 'takeoff_6_12' +'_'+'mean',
    'min' : 'inout_bus_route_id'+ '_' + 'takeoff_6_12' +'_'+'min',
    'max' : 'inout_bus_route_id'+ '_' + 'takeoff_6_12' +'_'+'max',
    'sum' : 'inout_bus_route_id'+ '_' + 'takeoff_6_12' +'_'+'sum'
})

all = pd.merge(all,temp,how='left',on='in_out_bus_route_id')

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if 'inout_' in a]

In [17]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)
all['diff_ride_takeoff'] = all['ride_6_12'] - all['takeoff_6_12']

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if 'diff_ride_takeoff' in a]

In [18]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

all['bus_route_id_station_code_concat'] = str(all['station_code']) + str(all['bus_route_id'])

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

In [19]:
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data\\raw')
bus = pd.read_csv('bus_bts.csv')

all = pd.concat([train,test],axis=0).reset_index(drop=True)

code_which = all[['station_code','latitude','longitude']].drop_duplicates().reset_index(drop=True).rename(columns = {
    'station_code' : 'geton_station_code',
    'latitude' : 'geton_lat',
    'longitude' : 'geton_long'
})
bus = pd.merge(bus,code_which, how='left',on='geton_station_code')

code_which = all[['station_code','latitude','longitude']].drop_duplicates().reset_index(drop=True).rename(columns = {
    'station_code' : 'getoff_station_code',
    'latitude' : 'getoff_lat',
    'longitude' : 'getoff_long'
})
bus = pd.merge(bus,code_which, how='left',on='getoff_station_code')

all = pd.merge(all,pd.DataFrame({'station_code':bus[bus['geton_station_code'] == bus['getoff_station_code']].geton_station_code.unique(),
             'same_on_off' : 1}),how='left',on='station_code') 

all['same_on_off'] = all['same_on_off'].fillna(0)

In [20]:
geton = []
getoff = []
for aa,bb,cc,dd in zip(bus['geton_lat'],bus['geton_long'],bus['getoff_lat'],bus['getoff_long']):
    a = (aa,bb)
    b = (cc,dd)
    geton += [a]
    getoff += [b]
    
from haversine import haversine

dis = []
for on,off in zip(geton,getoff):
    dis += [haversine(on,off)]
    
bus['moving_dis'] = dis

temp = bus.groupby('bus_route_id')['moving_dis'].mean().fillna(0).to_dict()

all['moving_dis_per_bus'] = all['bus_route_id'].map(temp)

temp = bus.groupby('geton_station_code')['moving_dis'].mean().fillna(0).to_dict()

all['moving_dis_per_geton'] = all['station_code'].map(temp)

temp = bus.groupby('getoff_station_code')['moving_dis'].mean().fillna(0).to_dict()

all['moving_dis_per_getoff'] = all['station_code'].map(temp)

all['moving_dis_per_bus'] =  all['moving_dis_per_bus'].fillna(all['moving_dis_per_bus'].median()) 
all['moving_dis_per_getoff'] = all['moving_dis_per_getoff'].fillna(all['moving_dis_per_getoff'].median()) 
all['moving_dis_per_geton'] = all['moving_dis_per_geton'].fillna(all['moving_dis_per_geton'].median()) 

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if 'moving_dis' in a]

In [21]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

bus['travel'] = bus['user_card_id'].map(bus['user_card_id'].value_counts()[bus['user_card_id'].value_counts()<10].to_dict())
bus['travel'] = np.where(bus['travel'].isnull(),1,0)
temp = bus.groupby('geton_station_code')['travel'].sum().to_dict()

all['travel1'] = all['station_code'].map(temp).fillna(0)


bus['travel'] = bus['user_card_id'].map(bus['user_card_id'].value_counts()[bus['user_card_id'].value_counts()<10].to_dict())
bus['travel'] = np.where(bus['travel'].isnull(),1,0)
temp = bus.groupby('getoff_station_code')['travel'].sum().to_dict()

all['travel2'] = all['station_code'].map(temp).fillna(0)

bus['travel'] = bus['user_card_id'].map(bus['user_card_id'].value_counts()[bus['user_card_id'].value_counts()<10].to_dict())
bus['travel'] = np.where(bus['travel'].isnull(),0,1)
temp = bus.groupby('geton_station_code')['travel'].sum().to_dict()

all['travel3'] = all['station_code'].map(temp).fillna(0)

bus['travel'] = bus['user_card_id'].map(bus['user_card_id'].value_counts()[bus['user_card_id'].value_counts()<10].to_dict())
bus['travel'] = np.where(bus['travel'].isnull(),0,1)
temp = bus.groupby('getoff_station_code')['travel'].sum().to_dict()

all['travel4'] = all['station_code'].map(temp).fillna(0)

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all, bus

added += ['travel1','travel2','travel3','travel4']

In [22]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)

temp = np.power(all[['ride_6_7','ride_7_8','ride_8_9','ride_9_10','ride_10_11','ride_11_12','takeoff_6_7','takeoff_7_8','takeoff_8_9','takeoff_9_10','takeoff_10_11','takeoff_11_12']],2)
temp.columns = [a+'_power' for a in tuple(temp.columns)]

all = pd.concat([all,temp],axis=1)

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += [a for a in train.columns if 'power' in a]

In [23]:
import geopy.distance 

all = pd.concat([train,test],axis=0).reset_index(drop=True)

jeju=(33.51411, 126.52969) # 제주 측정소 근처
gosan=(33.29382, 126.16283) #고산 측정소 근처
seongsan=(33.38677, 126.8802) #성산 측정소 근처
po=(33.24616, 126.5653) #서귀포 측정소 근처

t1 = [geopy.distance.vincenty( (i,j), jeju).km for i,j in list( zip( all['latitude'],all['longitude'] )) ]
t2 = [geopy.distance.vincenty( (i,j), gosan).km for i,j in list( zip( all['latitude'],all['longitude'] )) ]
t3 = [geopy.distance.vincenty( (i,j), seongsan).km for i,j in list( zip( all['latitude'],all['longitude'] )) ]
t4 = [geopy.distance.vincenty( (i,j), po).km for i,j in list( zip( all['latitude'],all['longitude'] )) ]

all['dis_jeju']=t1
all['dis_gosan']=t2
all['dis_seongsan']=t3
all['dis_po']=t4

all['dist_name'] = all[['dis_jeju','dis_gosan','dis_seongsan','dis_po']].apply(lambda x: np.argmin(x),axis=1).str.slice(4,)

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += ['dis_jeju','dis_gosan','dis_seongsan','dis_po']

In [24]:
# 2019년 9월, 10월 06~12 시의 날씨 데이터 기상청에서 가져옴.
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data')
rain = pd.read_csv('rain.csv')[['date','dist_name','sum','std']].rename(columns={'sum':'rain_sum',
                                              'std':'rain_std',
                                              'max' : 'rain_max',
                                              'min': 'rain_min'})
temper = pd.read_csv('temper.csv')[['date','dist_name','mean','std']].rename(columns = {'mean':'temp_mean',
                                                    'std':'temp_std',
                                                    'max' : 'temp_max',
                                                    'min' : 'temp_min'})

train['date'] = train['date'].astype('str')
test['date'] = test['date'].astype('str')

train = pd.merge(train,rain,how='left',on=['dist_name','date'])
train = pd.merge(train,temper,how='left',on=['dist_name','date'])
train['temp_mean'] = train['temp_mean']/(train['temp_mean'].max())
train['temp_std'] = train['temp_std']/(train['temp_std'].max())

test = pd.merge(test,rain,how='left',on=['dist_name','date'])
test = pd.merge(test,temper,how='left',on=['dist_name','date'])
test['temp_mean'] = test['temp_mean']/(test['temp_mean'].max())
test['temp_std'] = test['temp_std']/(test['temp_std'].max())

added += ['rain_sum','rain_std','temp_mean','temp_std']#,'rain_max','rain_min','temp_max','temp_min']

In [25]:
all = pd.concat([train,test],axis=0).reset_index(drop=True)
rmean = all[['date','bus_route_id','station_code','ride_6_12']].sort_values(['bus_route_id','station_code','date']).groupby(['bus_route_id','station_code'])['ride_6_12'].rolling(3).mean().reset_index()[['bus_route_id','station_code','ride_6_12']]
rmean['date'] = all[['date','bus_route_id','station_code']].sort_values(['bus_route_id','station_code','date']).reset_index(drop=True)['date']
all = pd.merge(all,rmean.rename(columns = {'ride_6_12' : 'r3mean_ride_6_12'}),how='left',on=['bus_route_id','station_code','date'])

all['r3mean_ride_6_12'] = all[['bus_route_id','station_code','ride_6_12','r3mean_ride_6_12']].groupby(['bus_route_id','station_code']).\
apply(lambda x: x.fillna(x['ride_6_12'].median()))['r3mean_ride_6_12']

rmean = all[['date','bus_route_id','station_code','ride_6_12']].sort_values(['bus_route_id','station_code','date']).groupby(['bus_route_id','station_code'])['ride_6_12'].rolling(5).mean().reset_index()[['bus_route_id','station_code','ride_6_12']]
rmean['date'] = all[['date','bus_route_id','station_code']].sort_values(['bus_route_id','station_code','date']).reset_index(drop=True)['date']
all = pd.merge(all,rmean.rename(columns = {'ride_6_12' : 'r5mean_ride_6_12'}),how='left',on=['bus_route_id','station_code','date'])

all['r5mean_ride_6_12'] = all[['bus_route_id','station_code','ride_6_12','r5mean_ride_6_12']].groupby(['bus_route_id','station_code']).\
apply(lambda x: x.fillna(x['ride_6_12'].median()))['r5mean_ride_6_12']

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all

added += ['r3mean_ride_6_12','r5mean_ride_6_12']

In [26]:
import geopy.distance
os.chdir('C:\\Users\\yeonjun.in\\Desktop\\연준\\캐글\\제주도\\data\\raw')
all = pd.concat([train,test],axis=0).reset_index(drop=True)

pop_refine = pd.read_csv('pop_refine.csv',engine='python')

temp = all[['latitude','longitude']].drop_duplicates()

temp = pd.DataFrame({
    'latitude' : list(np.repeat(temp['latitude'],len(pop_refine))),
    'longitude' : list(np.repeat(temp['longitude'],len(pop_refine))),
    'index' : list(np.repeat(range(len(temp)),len(pop_refine))),
    'latitude_' : list(np.tile(pop_refine['latitude'],len(temp))),
    'longitude_' :list(np.tile(pop_refine['longitude'],len(temp))),
    'pop' :list(np.tile(pop_refine['pop'],len(temp)))})

temp['dist'] = temp.apply(lambda x: geopy.distance.vincenty((x['latitude'],x['longitude']), (x['latitude_'],x['longitude_'])).km,axis=1)

temp = pd.merge(temp,temp.groupby(['latitude','longitude'])['dist'].min().reset_index(),how='right',on=['latitude','longitude','dist'])[['latitude','longitude','pop']]

temp['pop'] = [int(a.replace(',','')) for a in temp['pop'] ]

all = pd.merge(all,temp,how='left',on=['latitude','longitude'])

all['pop_weekday'] = (all['pop'].astype('str') + all['weekday_var'].astype('str')).astype('int')

train = all.loc[:(train.shape[0]-1),]
test = all.loc[train.shape[0]:,].drop('ride_18_20',axis=1).reset_index(drop=True)

del all,temp

added += ['pop','pop_weekday']

In [27]:
input_var = input_var + added

In [28]:
mean_encoding_col = ['station_code','bus_route_id','station_name','bus_route_id_station_code_concat']

In [29]:
for col in mean_encoding_col:
    input_var += [col+'_'+'mean_target_encoding']
    input_var += [col+'_'+'max_target_encoding']
    input_var += [col+'_'+'min_target_encoding']

In [30]:
y_train=train[target]

feature_imporatnce = pd.DataFrame()

NFOLDS = 6
random_seed = sample(seed_list,1)

stk = StratifiedKFold(n_splits=NFOLDS,random_state = random_seed[0],shuffle=True)

In [31]:
def mean_encoding(col,tr,vl,tst):

    temp = tr.groupby([col])['ride_18_20'].agg(['mean','max','min']).reset_index().\
    rename(columns = {'mean' : col+'_'+'mean_target_encoding',
                     'max' : col+'_'+'max_target_encoding',
                     'min' : col+'_'+'min_target_encoding'
                     })
    tr_ = pd.merge(tr,temp,how='left',on= col)
    vl_ = pd.merge(vl,temp,how='left',on= col)
    tst_ = pd.merge(tst,temp,how='left',on= col)
    
    cols = [a for a in tr_.columns if 'target_encoding' in a]
    
    tr_[cols] = tr_[cols].fillna(0)
    vl_[cols] = vl_[cols].fillna(0)
    tst_[cols] = tst_[cols].fillna(0)

    return tr_, vl_,tst_

In [32]:
start = datetime.now()

time = str(start.hour)+'hr'
minute = str(start.minute)+'min'

cv_train = np.zeros(len(y_train))
cv_pred = np.zeros(test.shape[0])
fold_scores = []


for fold_, (tr_index, vl_index) in enumerate(stk.split(train,train['date'])):
    print('Fold:', fold_+1)
  
    x_tr, x_vl = train.iloc[tr_index], train.iloc[vl_index]
    y_tr, y_vl = train[target].iloc[tr_index], train[target].iloc[vl_index]
    x_tst = test.copy()
    
    for aaaa in mean_encoding_col:
        x_tr,x_vl,x_tst = mean_encoding(aaaa,x_tr,x_vl,x_tst)
    
    tr = x_tr[input_var]
    vl = x_vl[input_var]
    tst = x_tst[input_var]
    
    rf = RandomForestRegressor(random_state=random_seed[0],n_estimators=100,criterion='mse')
    rf.fit(tr,y_tr)
    
    feature_imporatnce = pd.concat([feature_imporatnce, pd.DataFrame({'feature':input_var,'importance':rf.feature_importances_})],axis=0)
    
    pred = rf.predict(vl)
    
    print(np.sqrt(mean_squared_error(y_vl,pred)))
    cv_train[vl_index] += pred
    cv_pred += rf.predict(tst)
    
    print('-'*40+'\n\n')
    
cv_pred /= NFOLDS

vl_error = np.sqrt(mean_squared_error(np.array(y_train).flatten(),cv_train))

print('cv score:')
print(vl_error)

sub['18~20_ride'] = cv_pred

end = datetime.now()

print(end - start)

Fold: 1
2.19871576875772
----------------------------------------


Fold: 2
2.220187327859567
----------------------------------------


Fold: 3
2.331361702075442
----------------------------------------


Fold: 4
2.388437173577692
----------------------------------------


Fold: 5
2.438631318137795
----------------------------------------


Fold: 6
2.3642441683452904
----------------------------------------


cv score:
2.325215824101237
2:01:50.699081


In [42]:
feature_imporatnce.groupby(['feature'])['importance'].mean().reset_index().sort_values('importance',ascending=False).head(20)

Unnamed: 0,feature,importance
51,r5mean_ride_6_12,0.186138
50,r3mean_ride_6_12,0.116973
68,station_code_max_target_encoding,0.107976
58,ride_6_12,0.100534
2,bus_route_id_mean_target_encoding,0.045989
69,station_code_mean_target_encoding,0.043526
81,station_name_mean_target_encoding,0.040656
80,station_name_max_target_encoding,0.040159
49,pop_weekday,0.023517
105,temp_std,0.018739
