In [75]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [76]:
data = {
    'air_visit_data': pd.read_csv('Data/air_visit_data.csv' ,parse_dates=['visit_date']),
    'air_store_info': pd.read_csv('Data/air_store_info.csv'),
    'hpg_store_info': pd.read_csv('Data/hpg_store_info.csv'),
    'air_reserve': pd.read_csv('Data/air_reserve.csv' ,parse_dates=['visit_datetime', 'reserve_datetime']),
    'hpg_reserve': pd.read_csv('Data/hpg_reserve.csv',parse_dates=['visit_datetime', 'reserve_datetime']),
    'store_id_relation': pd.read_csv('Data/store_id_relation.csv'),
    'sample_submission': pd.read_csv('Data/sample_submission.csv'),
    'date_info': pd.read_csv('Data/date_info.csv',parse_dates=['calendar_date']).rename(columns={'calendar_date':'visit_date'})
    }

In [77]:
#merge store_id_relation to hpg_reserve on hpg_store_id, related with air_store_id
data['hpg_reserve'] = pd.merge(data['hpg_reserve'], data['store_id_relation'], how='inner', on=['hpg_store_id'])

# **Adding date and time features **

In [78]:
data['air_visit_data']['DayInMonth'] = data['air_visit_data']['visit_date'].dt.day
data['air_visit_data']['Month']  = data['air_visit_data']['visit_date'].dt.month
data['air_visit_data']['DayOfWeek'] = data['air_visit_data']['visit_date'].dt.weekday
data['air_visit_data']['Year'] = data['air_visit_data']['visit_date'].dt.year
data['air_visit_data']['Quarter'] = data['air_visit_data']['visit_date'].dt.quarter
data['air_visit_data']['visit_date'] = data['air_visit_data']['visit_date'].dt.date

In [79]:
data['air_reserve']['Visit_DayInMonth'] = data['air_reserve']['visit_datetime'].dt.day
data['air_reserve']['Visit_Month']  = data['air_reserve']['visit_datetime'].dt.month
data['air_reserve']['Visit_DayOfWeek'] = data['air_reserve']['visit_datetime'].dt.weekday
data['air_reserve']['Visit_Year'] = data['air_reserve']['visit_datetime'].dt.year
data['air_reserve']['Visit_Quarter'] = data['air_reserve']['visit_datetime'].dt.quarter

In [80]:
data['hpg_reserve']['Visit_DayInMonth'] = data['hpg_reserve']['visit_datetime'].dt.day
data['hpg_reserve']['Visit_Month']  = data['hpg_reserve']['visit_datetime'].dt.month
data['hpg_reserve']['Visit_DayOfWeek'] = data['hpg_reserve']['visit_datetime'].dt.weekday
data['hpg_reserve']['Visit_Year'] = data['hpg_reserve']['visit_datetime'].dt.year
data['hpg_reserve']['Visit_Quarter'] = data['hpg_reserve']['visit_datetime'].dt.quarter

In [81]:
for df in ['air_reserve','hpg_reserve']:
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    tmp1 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1_date', 'reserve_visitors':'rv1_date'})
    tmp2 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2_date', 'reserve_visitors':'rv2_date'})
    data[df+'_date'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])

In [82]:
for df in ['air_reserve','hpg_reserve']:
    tmp1 = data[df].groupby(['air_store_id','Visit_DayInMonth'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'Visit_DayInMonth':'DayInMonth', 'reserve_datetime_diff': 'rs1_day', 'reserve_visitors':'rv1_day'})
    tmp2 = data[df].groupby(['air_store_id','Visit_DayInMonth'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'Visit_DayInMonth':'DayInMonth', 'reserve_datetime_diff': 'rs2_day', 'reserve_visitors':'rv2_day'})
    data[df+'_day'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','DayInMonth'])

In [83]:
for df in ['air_reserve','hpg_reserve']:
    tmp1 = data[df].groupby(['air_store_id','Visit_Month'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'Visit_Month':'Month', 'reserve_datetime_diff': 'rs1_month', 'reserve_visitors':'rv1_month'})
    tmp2 = data[df].groupby(['air_store_id','Visit_Month'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'Visit_Month':'Month', 'reserve_datetime_diff': 'rs2_month', 'reserve_visitors':'rv2_month'})
    data[df+'_month'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','Month'])

In [84]:
for df in ['air_reserve','hpg_reserve']:
    tmp1 = data[df].groupby(['air_store_id','Visit_DayOfWeek'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'Visit_DayOfWeek':'DayOfWeek', 'reserve_datetime_diff': 'rs1_week', 'reserve_visitors':'rv1_week'})
    tmp2 = data[df].groupby(['air_store_id','Visit_DayOfWeek'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'Visit_DayOfWeek':'DayOfWeek', 'reserve_datetime_diff': 'rs2_week', 'reserve_visitors':'rv2_week'})    
    #tmp1 = data[df].groupby(['air_store_id','Visit_DayOfWeek'], as_index=False)['reserve_datetime_diff'].agg([np.sum, np.mean]).rename(columns={'Visit_DayOfWeek':'DayOfWeek', 'sum': 'rs_sum_day', 'mean':'rs2_mean_day'})
    #tmp2 = data[df].groupby(['air_store_id','Visit_DayOfWeek'], as_index=False)['reserve_visitors'].agg([np.sum, np.mean, np.max, np.min]).rename(columns={'Visit_DayOfWeek':'DayOfWeek', 'sum': 'rv_sum_day', 'mean':'rv_mean_day', 'max':'rv_max_day', 'min':'rv_min_day'})
    data[df+'_week'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','DayOfWeek'])

In [85]:
for df in ['air_reserve','hpg_reserve']:
    tmp1 = data[df].groupby(['air_store_id','Visit_Year'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'Visit_Year':'Year', 'reserve_datetime_diff': 'rs1_year', 'reserve_visitors':'rv1_year'})
    tmp2 = data[df].groupby(['air_store_id','Visit_Year'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'Visit_Year':'Year', 'reserve_datetime_diff': 'rs2_year', 'reserve_visitors':'rv2_year'})
    data[df+'_year'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','Year'])

In [86]:
for df in ['air_reserve','hpg_reserve']:
    tmp1 = data[df].groupby(['air_store_id','Visit_Quarter'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'Visit_Quarter':'Quarter', 'reserve_datetime_diff': 'rs1_quarter', 'reserve_visitors':'rv1_quarter'})
    tmp2 = data[df].groupby(['air_store_id','Visit_Quarter'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'Visit_Quarter':'Quarter', 'reserve_datetime_diff': 'rs2_quarter', 'reserve_visitors':'rv2_quarter'})
    data[df+'_quarter'] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','Quarter'])

In [87]:
def city_name(area):
    return (area.split(' ')[0])

def state_name(area):
    return (area.split(' ')[0] + ' ' + area.split(' ')[1])

data['air_store_info']['city_name'] = data['air_store_info']['air_area_name'].map(city_name)
data['air_store_info']['region_name'] = data['air_store_info']['air_area_name'].map(state_name)

### **Calculate distance in kilometers from latitude and longitude**

In [88]:
data['sample_submission']['air_store_id'] = data['sample_submission']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['sample_submission']['visit_date'] = data['sample_submission']['id'].map(lambda x: str(x).split('_')[2])
data['sample_submission']['visit_date'] = pd.to_datetime(data['sample_submission']['visit_date'])
data['sample_submission']['DayInMonth'] = data['sample_submission']['visit_date'].dt.day
data['sample_submission']['Month'] = data['sample_submission']['visit_date'].dt.month
data['sample_submission']['DayOfWeek'] = data['sample_submission']['visit_date'].dt.dayofweek
data['sample_submission']['Year'] = data['sample_submission']['visit_date'].dt.year
data['sample_submission']['Quarter'] = data['sample_submission']['visit_date'].dt.quarter
data['sample_submission']['visit_date'] = data['sample_submission']['visit_date'].dt.date
unique_stores = data['sample_submission']['air_store_id'].unique()

In [89]:
stores_weekly = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'DayOfWeek': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)
stores_monthly = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'Month': [i]*len(unique_stores)}) for i in range(1,13)], axis=0, ignore_index=True).reset_index(drop=True)
stores_daily = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'DayInMonth': [i]*len(unique_stores)}) for i in range(1,31)], axis=0, ignore_index=True).reset_index(drop=True)
stores_Quarterly = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'Quarter': [i]*len(unique_stores)}) for i in range(1,5)], axis=0, ignore_index=True).reset_index(drop=True)
stores_Yearly = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'Year': [i]*len(unique_stores)}) for i in [2016,2017]], axis=0, ignore_index=True).reset_index(drop=True)

In [90]:
tmp = data['air_visit_data'].groupby(['air_store_id','DayOfWeek'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors_weekly'})
stores_weekly = pd.merge(stores_weekly, tmp, how='left', on=['air_store_id','DayOfWeek']) 
tmp = data['air_visit_data'].groupby(['air_store_id','DayOfWeek'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors_weekly'})
stores_weekly = pd.merge(stores_weekly, tmp, how='left', on=['air_store_id','DayOfWeek'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayOfWeek'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors_weekly'})
stores_weekly = pd.merge(stores_weekly, tmp, how='left', on=['air_store_id','DayOfWeek'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayOfWeek'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors_weekly'})
stores_weekly = pd.merge(stores_weekly, tmp, how='left', on=['air_store_id','DayOfWeek'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayOfWeek'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations_weekly'})
stores_weekly = pd.merge(stores_weekly, tmp, how='left', on=['air_store_id','DayOfWeek']) 

In [91]:
tmp = data['air_visit_data'].groupby(['air_store_id','Month'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors_monthly'})
stores_monthly = pd.merge(stores_monthly, tmp, how='left', on=['air_store_id','Month']) 
tmp = data['air_visit_data'].groupby(['air_store_id','Month'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors_monthly'})
stores_monthly = pd.merge(stores_monthly, tmp, how='left', on=['air_store_id','Month'])
tmp = data['air_visit_data'].groupby(['air_store_id','Month'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors_monthly'})
stores_monthly = pd.merge(stores_monthly, tmp, how='left', on=['air_store_id','Month'])
tmp = data['air_visit_data'].groupby(['air_store_id','Month'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors_monthly'})
stores_monthly = pd.merge(stores_monthly, tmp, how='left', on=['air_store_id','Month'])
tmp = data['air_visit_data'].groupby(['air_store_id','Month'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations_monthly'})
stores_monthly = pd.merge(stores_monthly, tmp, how='left', on=['air_store_id','Month']) 

In [92]:
tmp = data['air_visit_data'].groupby(['air_store_id','DayInMonth'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors_daily'})
stores_daily = pd.merge(stores_daily, tmp, how='left', on=['air_store_id','DayInMonth']) 
tmp = data['air_visit_data'].groupby(['air_store_id','DayInMonth'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors_daily'})
stores_daily = pd.merge(stores_daily, tmp, how='left', on=['air_store_id','DayInMonth'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayInMonth'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors_daily'})
stores_daily = pd.merge(stores_daily, tmp, how='left', on=['air_store_id','DayInMonth'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayInMonth'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors_daily'})
stores_daily = pd.merge(stores_daily, tmp, how='left', on=['air_store_id','DayInMonth'])
tmp = data['air_visit_data'].groupby(['air_store_id','DayInMonth'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations_daily'})
stores_daily = pd.merge(stores_daily, tmp, how='left', on=['air_store_id','DayInMonth']) 

In [93]:
tmp = data['air_visit_data'].groupby(['air_store_id','Quarter'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors_quarterly'})
stores_Quarterly = pd.merge(stores_Quarterly, tmp, how='left', on=['air_store_id','Quarter']) 
tmp = data['air_visit_data'].groupby(['air_store_id','Quarter'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors_quarterly'})
stores_Quarterly = pd.merge(stores_Quarterly, tmp, how='left', on=['air_store_id','Quarter'])
tmp = data['air_visit_data'].groupby(['air_store_id','Quarter'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors_quarterly'})
stores_Quarterly = pd.merge(stores_Quarterly, tmp, how='left', on=['air_store_id','Quarter'])
tmp = data['air_visit_data'].groupby(['air_store_id','Quarter'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors_quarterly'})
stores_Quarterly = pd.merge(stores_Quarterly, tmp, how='left', on=['air_store_id','Quarter'])
tmp = data['air_visit_data'].groupby(['air_store_id','Quarter'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations_quarterly'})
stores_Quarterly = pd.merge(stores_Quarterly, tmp, how='left', on=['air_store_id','Quarter']) 

In [94]:
tmp = data['air_visit_data'].groupby(['air_store_id','Year'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors_yearly'})
stores_Yearly = pd.merge(stores_Yearly, tmp, how='left', on=['air_store_id','Year']) 
tmp = data['air_visit_data'].groupby(['air_store_id','Year'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors_yearly'})
stores_Yearly = pd.merge(stores_Yearly, tmp, how='left', on=['air_store_id','Year'])
tmp = data['air_visit_data'].groupby(['air_store_id','Year'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors_yearly'})
stores_Yearly = pd.merge(stores_Yearly, tmp, how='left', on=['air_store_id','Year'])
tmp = data['air_visit_data'].groupby(['air_store_id','Year'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors_yearly'})
stores_Yearly = pd.merge(stores_Yearly, tmp, how='left', on=['air_store_id','Year'])
tmp = data['air_visit_data'].groupby(['air_store_id','Year'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations_yearly'})
stores_Yearly = pd.merge(stores_Yearly, tmp, how='left', on=['air_store_id','Year']) 

In [95]:
stores_weekly = pd.merge(stores_weekly, data['air_store_info'], how='left', on=['air_store_id']) 
stores_monthly = pd.merge(stores_monthly, data['air_store_info'], how='left', on=['air_store_id']) 
stores_daily = pd.merge(stores_daily, data['air_store_info'], how='left', on=['air_store_id']) 
stores_Quarterly = pd.merge(stores_Quarterly, data['air_store_info'], how='left', on=['air_store_id']) 
stores_Yearly = pd.merge(stores_Yearly, data['air_store_info'], how='left', on=['air_store_id']) 

In [96]:
def day_of_week(day):
    if day== 'Monday':
        return 0
    elif day=='Tuesday':
        return 1
    elif day == 'Wednesday':
        return 2
    elif day=='Thursday':
        return 3
    elif day =='Friday':
        return 4
    elif day =='Saturday':
        return 5
    elif day == 'Sunday':
        return 6
data['date_info']['day_of_week'] =data['date_info']['day_of_week'].map(day_of_week)

In [97]:
data['date_info']['visit_date'] = data['date_info']['visit_date'].dt.date

In [98]:
data['date_info'].rename(columns={'day_of_week':'DayOfWeek'}, inplace=True)

In [99]:
traindf = pd.merge(data['air_visit_data'], data['date_info'], how='left', on=['visit_date','DayOfWeek'])
testdf = pd.merge(data['sample_submission'], data['date_info'], how='left', on=['visit_date','DayOfWeek']) 

In [100]:
train = pd.merge(traindf, stores_weekly, how='left', on=['air_store_id','DayOfWeek']) 
test = pd.merge(testdf, stores_weekly, how='left', on=['air_store_id','DayOfWeek'])

In [101]:
train = pd.merge(train, stores_daily, how='left', on=['air_store_id','DayInMonth','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name']) 
test = pd.merge(test, stores_daily, how='left', on=['air_store_id','DayInMonth','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name'])

In [102]:
train = pd.merge(train, stores_monthly, how='left', on=['air_store_id','Month','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name']) 
test = pd.merge(test, stores_monthly, how='left', on=['air_store_id','Month','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name'])

In [103]:
train = pd.merge(train, stores_Quarterly, how='left', on=['air_store_id','Quarter','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name']) 
test = pd.merge(test, stores_Quarterly, how='left', on=['air_store_id','Quarter','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name'])

In [104]:
train = pd.merge(train, stores_Yearly, how='left', on=['air_store_id','Year','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name']) 
test = pd.merge(test, stores_Yearly, how='left', on=['air_store_id','Year','air_genre_name','air_area_name',
                                                     'latitude', 'longitude', 'city_name', 'region_name'])

In [105]:
for df in ['air_reserve_date','hpg_reserve_date']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

In [106]:
for df in ['air_reserve_day','hpg_reserve_day']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','DayInMonth']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','DayInMonth'])

In [107]:
for df in ['air_reserve_month','hpg_reserve_month']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','Month']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','Month'])

In [108]:
for df in ['air_reserve_week','hpg_reserve_week']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','DayOfWeek']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','DayOfWeek'])

In [109]:
for df in ['air_reserve_year','hpg_reserve_year']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','Year']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','Year'])

In [110]:
for df in ['air_reserve_quarter','hpg_reserve_quarter']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','Quarter']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','Quarter'])

In [111]:
#date
train['rsrv_sum_date'] = (train['rv1_date_x'] + train['rv1_date_y'])
train['rsrv_mean_date'] = (train['rv2_date_x'] + train['rv2_date_y'])/2
train['rsv_datetime_mean_date'] = (train['rs2_date_x'] + train['rs2_date_y'])/2
train['rsv_datetime_sum_date'] = (train['rs1_date_x'] + train['rs1_date_y'])



In [112]:

#day
train['rsrv_sum_day'] = (train['rv1_day_x'] + train['rv1_day_y'])
train['rsrv_mean_day'] = (train['rv2_day_x'] + train['rv2_day_y'])/2
train['rsv_datetime_mean_day'] = (train['rs2_day_x'] + train['rs2_day_y'])/2
train['rsv_datetime_sum_day'] = (train['rs1_day_x'] + train['rs1_day_y'])

#month
train['rsrv_sum_month'] = (train['rv1_month_x'] + train['rv1_date_y'])
train['rsrv_mean_month'] = (train['rv2_date_x'] + train['rv2_date_y'])/2
train['rsv_datetime_mean_month'] = (train['rs2_month_x'] + train['rs2_month_y'])/2
train['rsv_datetime_sum_month'] = (train['rs1_month_x'] + train['rs1_month_y'])

#week
train['rsrv_sum_week'] = (train['rv1_week_x'] + train['rv1_week_y'])
train['rsrv_mean_week'] = (train['rv2_week_x'] + train['rv2_week_y'])/2
train['rsv_datetime_mean_week'] = (train['rs2_week_x'] + train['rs2_week_y'])/2
train['rsv_datetime_sum_week'] = (train['rs1_week_x'] + train['rs1_week_y'])

#year
train['rsrv_sum_year'] = (train['rv1_year_x'] + train['rv1_year_y'])
train['rsrv_mean_year'] = (train['rv2_year_x'] + train['rv2_year_y'])/2
train['rsv_datetime_mean_year'] = (train['rs2_year_x'] + train['rs2_year_y'])/2
train['rsv_datetime_sum_year'] = (train['rs1_year_x'] + train['rs1_year_y'])

#Quarter
train['rsrv_sum_quarter'] = (train['rv1_quarter_x'] + train['rv1_quarter_y'])
train['rsrv_mean_quarter'] = (train['rv2_quarter_x'] + train['rv2_quarter_y'])/2
train['rsv_datetime_mean_quarter'] = (train['rs2_quarter_x'] + train['rs2_quarter_y'])/2
train['rsv_datetime_sum_quarter'] = (train['rs1_quarter_x'] + train['rs1_quarter_y'])

In [113]:
#date
test['rsrv_sum_date'] = (test['rv1_date_x'] + test['rv1_date_y'])
test['rsrv_mean_date'] = (test['rv2_date_x'] + test['rv2_date_y'])/2
test['rsv_datetime_mean_date'] = (test['rs2_date_x'] + test['rs2_date_y'])/2
test['rsv_datetime_sum_date'] = (test['rs1_date_x'] + test['rs1_date_y'])



In [114]:

#day
test['rsrv_sum_day'] = (test['rv1_day_x'] + test['rv1_day_y'])
test['rsrv_mean_day'] = (test['rv2_day_x'] + test['rv2_day_y'])/2
test['rsv_datetime_mean_day'] = (test['rs2_day_x'] + test['rs2_day_y'])/2
test['rsv_datetime_sum_day'] = (test['rs1_day_x'] + test['rs1_day_y'])

#month
test['rsrv_sum_month'] = (test['rv1_month_x'] + test['rv1_date_y'])
test['rsrv_mean_month'] = (test['rv2_date_x'] + test['rv2_date_y'])/2
test['rsv_datetime_mean_month'] = (test['rs2_month_x'] + test['rs2_month_y'])/2
test['rsv_datetime_sum_month'] = (test['rs1_month_x'] + test['rs1_month_y'])

#week
test['rsrv_sum_week'] = (test['rv1_week_x'] + test['rv1_week_y'])
test['rsrv_mean_week'] = (test['rv2_week_x'] + test['rv2_week_y'])/2
test['rsv_datetime_mean_week'] = (test['rs2_week_x'] + test['rs2_week_y'])/2
test['rsv_datetime_sum_week'] = (test['rs1_week_x'] + test['rs1_week_y'])

#year
test['rsrv_sum_year'] = (test['rv1_year_x'] + test['rv1_year_y'])
test['rsrv_mean_year'] = (test['rv2_year_x'] + test['rv2_year_y'])/2
test['rsv_datetime_mean_year'] = (test['rs2_year_x'] + test['rs2_year_y'])/2
test['rsv_datetime_sum_year'] = (test['rs1_year_x'] + test['rs1_year_y'])

#Quarter
test['rsrv_sum_quarter'] = (test['rv1_quarter_x'] + test['rv1_quarter_y'])
test['rsrv_mean_quarter'] = (test['rv2_quarter_x'] + test['rv2_quarter_y'])/2
test['rsv_datetime_mean_quarter'] = (test['rs2_quarter_x'] + test['rs2_quarter_y'])/2
test['rsv_datetime_sum_quarter'] = (test['rs1_quarter_x'] + test['rs1_quarter_y'])

In [115]:
train['genre_dow'] = train.apply(lambda r: '_'.join([str(r['air_genre_name']), str(r['DayOfWeek'])]), axis=1)
test['genre_dow'] = test.apply(lambda r: '_'.join([str(r['air_genre_name']), str(r['DayOfWeek'])]), axis=1)

In [116]:
train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1)

In [117]:
def Quarter_visit(quarter):
    if (quarter == 1):
        return 0
    else:
        return 1
train['quarterly_variation'] = train['Quarter'].map(Quarter_visit)
test['quarterly_variation'] = test['Quarter'].map(Quarter_visit)


def month_visit(month):
    if (month>=1) & (month<3):
        return 0
    elif (month>3) & (month<6):
        return 1
    else:
        return 2

train['monthly_variation'] = train['Month'].map(month_visit)
test['monthly_variation'] = test['Month'].map(month_visit)

def week_day(day):
    if (day==4) or (day==5):
        return 1
    else:
        return 0
train['weekend'] = train['DayOfWeek'].map(week_day)
test['weekend'] = test['DayOfWeek'].map(week_day)

In [118]:
from sklearn.preprocessing import LabelEncoder
# NEW FEATURES FROM JMBULL
train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']


# NEW FEATURES FROM Georgii Vyshnia
train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']


lbl =LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2'] = lbl.transform(test['air_store_id'])

In [119]:
train['air_genre_name'] = pd.get_dummies(train['air_genre_name'])
test['air_genre_name'] = pd.get_dummies(test['air_genre_name'])

In [120]:
train['city_name'] = pd.get_dummies(train['city_name'])
test['city_name'] = pd.get_dummies(test['city_name'])

In [121]:
train['region_name'] = pd.get_dummies(train['region_name'])
test['region_name'] = pd.get_dummies(test['region_name'])

In [122]:
train['air_area_name'] = pd.get_dummies(train['air_area_name'])
test['air_area_name'] = pd.get_dummies(test['air_area_name'])

In [123]:
train['genre_dow'] = lbl.fit_transform(train['genre_dow'])
test['genre_dow'] = lbl.transform(test['genre_dow'])

In [124]:
train = train.fillna(-1)
test = test.fillna(-1)

In [134]:
test.shape

(25211, 122)

In [125]:
print('Binding to float32')
for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)

for c, dtype in zip(test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

Binding to float32


## **Validation Strategy**

In [52]:
train_x = train.drop(['id','air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)
print(train_x.shape, train_y.shape)
#test_x = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)
test_x = test.drop(['id','air_store_id', 'visit_date', 'visitors'], axis=1)
test_y = np.log1p(test['visitors'].values)
print(test_x.shape, test_y.shape)

(252108, 118) (252108,)
(32019, 118) (32019,)


In [132]:
train_x.values.shape

(226897, 118)

In [126]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

train, test = train_test_split(train, test_size=0.1, shuffle=False)
train_x = train.drop(['id','air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)
test_x = test.drop(['id','air_store_id', 'visit_date', 'visitors'], axis=1)
test_y = np.log1p(test['visitors'].values)
boost_params = {'eval_metric': 'rmse'}
xgb0 = xgb.XGBRegressor(
    max_depth=8,
    learning_rate=0.01,
    n_estimators=10000,
    objective='reg:linear',
    gamma=0,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=1,
    scale_pos_weight=1,
    seed=27)

xgb0.fit(train_x, train_y)


ValueError: feature_names mismatch: ['DayInMonth', 'Month', 'DayOfWeek', 'Year', 'Quarter', 'holiday_flg', 'min_visitors_weekly', 'mean_visitors_weekly', 'median_visitors_weekly', 'max_visitors_weekly', 'count_observations_weekly', 'air_genre_name', 'air_area_name', 'latitude', 'longitude', 'city_name', 'region_name', 'min_visitors_daily', 'mean_visitors_daily', 'median_visitors_daily', 'max_visitors_daily', 'count_observations_daily', 'min_visitors_monthly', 'mean_visitors_monthly', 'median_visitors_monthly', 'max_visitors_monthly', 'count_observations_monthly', 'min_visitors_quarterly', 'mean_visitors_quarterly', 'median_visitors_quarterly', 'max_visitors_quarterly', 'count_observations_quarterly', 'min_visitors_yearly', 'mean_visitors_yearly', 'median_visitors_yearly', 'max_visitors_yearly', 'count_observations_yearly', 'rs1_date_x', 'rv1_date_x', 'rs2_date_x', 'rv2_date_x', 'rs1_date_y', 'rv1_date_y', 'rs2_date_y', 'rv2_date_y', 'rs1_day_x', 'rv1_day_x', 'rs2_day_x', 'rv2_day_x', 'rs1_day_y', 'rv1_day_y', 'rs2_day_y', 'rv2_day_y', 'rs1_month_x', 'rv1_month_x', 'rs2_month_x', 'rv2_month_x', 'rs1_month_y', 'rv1_month_y', 'rs2_month_y', 'rv2_month_y', 'rs1_week_x', 'rv1_week_x', 'rs2_week_x', 'rv2_week_x', 'rs1_week_y', 'rv1_week_y', 'rs2_week_y', 'rv2_week_y', 'rs1_year_x', 'rv1_year_x', 'rs2_year_x', 'rv2_year_x', 'rs1_year_y', 'rv1_year_y', 'rs2_year_y', 'rv2_year_y', 'rs1_quarter_x', 'rv1_quarter_x', 'rs2_quarter_x', 'rv2_quarter_x', 'rs1_quarter_y', 'rv1_quarter_y', 'rs2_quarter_y', 'rv2_quarter_y', 'rsrv_sum_date', 'rsrv_mean_date', 'rsv_datetime_mean_date', 'rsv_datetime_sum_date', 'rsrv_sum_day', 'rsrv_mean_day', 'rsv_datetime_mean_day', 'rsv_datetime_sum_day', 'rsrv_sum_month', 'rsrv_mean_month', 'rsv_datetime_mean_month', 'rsv_datetime_sum_month', 'rsrv_sum_week', 'rsrv_mean_week', 'rsv_datetime_mean_week', 'rsv_datetime_sum_week', 'rsrv_sum_year', 'rsrv_mean_year', 'rsv_datetime_mean_year', 'rsv_datetime_sum_year', 'rsrv_sum_quarter', 'rsrv_mean_quarter', 'rsv_datetime_mean_quarter', 'rsv_datetime_sum_quarter', 'genre_dow', 'quarterly_variation', 'monthly_variation', 'weekend', 'date_int', 'var_max_lat', 'var_max_long', 'lon_plus_lat', 'air_store_id2'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117']
expected rsrv_sum_year, monthly_variation, rv2_quarter_x, min_visitors_weekly, city_name, count_observations_yearly, rs1_month_y, Quarter, genre_dow, rs2_date_x, rs1_day_y, rsv_datetime_mean_week, air_genre_name, rv2_quarter_y, rv2_year_y, DayInMonth, mean_visitors_yearly, rv1_year_y, rs1_week_x, rs1_quarter_x, air_area_name, rv1_year_x, rv1_month_x, rs2_year_y, rsrv_mean_month, region_name, holiday_flg, rsv_datetime_sum_month, DayOfWeek, rsrv_mean_day, rv2_month_y, rsrv_sum_week, rsrv_sum_date, rv2_week_y, quarterly_variation, rv1_month_y, rs2_week_x, min_visitors_yearly, rv1_day_y, count_observations_monthly, rsv_datetime_mean_date, rsv_datetime_sum_quarter, rsrv_mean_year, median_visitors_daily, rv2_week_x, count_observations_daily, rs1_year_x, rsv_datetime_mean_quarter, max_visitors_yearly, rv1_quarter_x, rs1_day_x, median_visitors_monthly, rsv_datetime_mean_month, count_observations_weekly, mean_visitors_daily, mean_visitors_weekly, air_store_id2, max_visitors_monthly, count_observations_quarterly, min_visitors_monthly, rs1_date_x, longitude, rv1_date_y, mean_visitors_monthly, rv2_date_y, min_visitors_daily, weekend, rs2_day_y, var_max_long, rsv_datetime_sum_week, max_visitors_quarterly, mean_visitors_quarterly, rs2_day_x, rv1_date_x, rsrv_sum_day, rv1_week_y, rsrv_sum_quarter, rsv_datetime_mean_year, rs1_month_x, rs1_week_y, lon_plus_lat, rsrv_mean_quarter, rv2_date_x, Month, rv1_week_x, rv2_month_x, rs2_quarter_x, rs2_year_x, rs2_month_x, rv2_day_x, rs1_quarter_y, rsv_datetime_sum_day, rsv_datetime_sum_date, rs2_quarter_y, rv1_day_x, max_visitors_daily, rs2_week_y, median_visitors_weekly, rv2_year_x, min_visitors_quarterly, latitude, var_max_lat, median_visitors_yearly, rsrv_mean_date, rsrv_sum_month, rsv_datetime_sum_year, median_visitors_quarterly, rsv_datetime_mean_day, rs2_date_y, rs2_month_y, rs1_year_y, rs1_date_y, rv2_day_y, rv1_quarter_y, Year, rsrv_mean_week, max_visitors_weekly, date_int in input data
training data did not have the following fields: f6, f80, f58, f19, f23, f59, f114, f43, f68, f33, f82, f2, f46, f8, f79, f56, f71, f112, f78, f86, f22, f62, f117, f93, f92, f24, f15, f103, f73, f7, f37, f9, f72, f94, f39, f109, f74, f26, f104, f60, f107, f88, f70, f18, f12, f81, f87, f66, f34, f51, f76, f21, f116, f53, f35, f91, f47, f85, f64, f110, f38, f27, f25, f105, f52, f1, f20, f101, f29, f32, f115, f77, f97, f99, f17, f45, f31, f3, f36, f69, f10, f48, f63, f102, f40, f54, f50, f113, f42, f14, f100, f89, f108, f55, f30, f5, f44, f28, f41, f0, f84, f98, f83, f13, f16, f90, f49, f75, f106, f67, f57, f111, f96, f11, f4, f65, f61, f95

In [None]:
predict_y = xgb0.predict(test_x.values)
y_pred = np.expm1(predict_y)
#sub1= test[['id', 'visitors']]

In [None]:
from sklearn.metrics import mean_squared_error

def RMSLE(pred,y):
    return mean_squared_error(pred,y)**0.5
RMSLE(predict_y, test_y)

In [68]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(ip='127.0.0.1', port='54320')

Checking whether there is an H2O instance running at http://127.0.0.1:54320..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
  Starting server from /Users/ahmedaleshinloye/anaconda/envs/py36/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/gw/x1mwxt7d1f5gbj5g6kk82tmm0000gn/T/tmpwyso5lah
  JVM stdout: /var/folders/gw/x1mwxt7d1f5gbj5g6kk82tmm0000gn/T/tmpwyso5lah/h2o_ahmedaleshinloye_started_from_python.out
  JVM stderr: /var/folders/gw/x1mwxt7d1f5gbj5g6kk82tmm0000gn/T/tmpwyso5lah/h2o_ahmedaleshinloye_started_from_python.err
  Server is running at http://127.0.0.1:54320
Connecting to H2O server at http://127.0.0.1:54320... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.16.0.3
H2O cluster version age:,6 days
H2O cluster name:,H2O_from_python_ahmedaleshinloye_vco3ei
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54320


In [69]:
trainframe = train.drop(['id', 'air_store_id', 'visit_date'], axis=1)
testframe = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)
trainframe['visitors'] = np.log1p(trainframe['visitors'].values)
htrain = h2o.H2OFrame(trainframe)
htest = h2o.H2OFrame(testframe)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [74]:
def RMSLE(y_, pred):
    return metrics.mean_squared_error(y_, pred)**0.5

x =htrain.columns
y ='visitors'
x.remove(y)

print('Starting h2o autoML model!')  

aml = H2OAutoML(max_runtime_secs = 3350)
aml.train(x=x, y=y, training_frame=htrain, leaderboard_frame = htest)

print('Generate predictions...')

preds = aml.leader.predict(htrain)
preds = preds.as_data_frame()

print('RMSLE H2O automl leader: ', RMSLE(train['visitors'].values, preds))

preds = aml.leader.predict(htest)
preds = preds.as_data_frame()

test['visitors'] = preds
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
print('Leaderboard : ', aml.leaderboard)

print(' H2O automl leader performace : ', aml.leader)

Starting h2o autoML model!
AutoML progress: |██████████████████████████████████████████████████████ (failed)  98%


OSError: Job with key $03017f00000131d4ffffffff$_b0be511228f8462b10c623ba82e4419c failed with an exception: java.lang.NullPointerException
stacktrace: 
java.lang.NullPointerException
	at ai.h2o.automl.Leaderboard$1.atomic(Leaderboard.java:241)
	at ai.h2o.automl.Leaderboard$1.atomic(Leaderboard.java:206)
	at water.TAtomic.atomic(TAtomic.java:17)
	at water.Atomic.compute2(Atomic.java:56)
	at water.Atomic.fork(Atomic.java:39)
	at water.Atomic.invoke(Atomic.java:31)
	at ai.h2o.automl.Leaderboard.addModels(Leaderboard.java:280)
	at ai.h2o.automl.Leaderboard.addModel(Leaderboard.java:316)
	at ai.h2o.automl.AutoML.addModel(AutoML.java:1391)
	at ai.h2o.automl.AutoML.pollAndUpdateProgress(AutoML.java:547)
	at ai.h2o.automl.AutoML.learn(AutoML.java:1079)
	at ai.h2o.automl.AutoML.run(AutoML.java:439)
	at ai.h2o.automl.H2OJob$1.compute2(H2OJob.java:32)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1263)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:974)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [48]:
train_train, train_val = train_test_split(train, test_size=0.1, shuffle=False )
train_train_path = 'Data/validation_new/train_train.csv'
train_val_path = 'Data/validation_new/train_val.csv'
train_path = 'Data/validation_new/train.csv'
test_path = 'Data/validation_new/test.csv'

train.to_csv(train_path)
test.to_csv(test_path)
train_train.to_csv(train_train_path)
train_val.to_csv(train_val_path)

In [49]:
#train_path = 'Data/validation/train.csv'
#test_path = 'Data/validation/test.csv'
train_path = 'Data/validation_new/train_train.csv'
test_path = 'Data/validation_new/train_val.csv'

train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [134]:
train, test = train_test_split(train, test_size=0.1, shuffle=False)

In [253]:
#tmp = train.groupby('date_int')['visitors'].mean()
#train['mean_enc_1'] = train['date_int'].map(tmp)
#test['mean_enc_1'] = test['date_int'].map(tmp)

In [353]:
col = ['id', 'air_store_id', 'visit_date','visitors']

train_X = train.drop(col, axis=1).values
test_X = test.drop(col, axis=1).values

In [354]:
train_Y = np.log1p(train['visitors'].values)
#test_Y = np.log1p(test['visitors'].values)

In [None]:
from sklearn import neighbors
#KNN

#4 : 0.696305446652
#5 : 0.686078576784
#50 :0.64
knn = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=30)
knn.fit(train_X, train_Y)
knnpred = knn.predict(train_X)
#knn_testpred = knn.predict(test_X)
print('Train RMSE KNeighborsRegressor: ', RMSLE(train_Y, knnpred))
#print('Test RMSE KNeighborsRegressor: ', RMSLE(test_Y, knn_testpred))

In [239]:
# 8 :0.531785345969
#GradientBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(learning_rate=0.3, max_depth=3, n_estimators=200, subsample=1, verbose=2)
#params={'learning_rate':0.1, 'n_estimators':500,'max_depth':16, 'subsample':1,'max_features':1}
#params = {'learning_rate':0.3, 'n_estimators':30, 'max_depth':8, 'verbose':2}
gb.fit(train_X,train_Y)

      Iter       Train Loss   Remaining Time 
         1           0.4693            3.12m
         2           0.3757            3.05m
         3           0.3248            3.29m
         4           0.2979            3.13m
         5           0.2813            3.04m
         6           0.2718            2.93m
         7           0.2646            2.95m
         8           0.2600            2.92m
         9           0.2562            2.97m
        10           0.2536            3.04m
        11           0.2514            2.97m
        12           0.2499            2.93m
        13           0.2486            2.91m
        14           0.2476            2.87m
        15           0.2467            2.82m
        16           0.2459            2.78m
        17           0.2439            2.77m
        18           0.2431            2.74m
        19           0.2421            2.69m
        20           0.2416            2.65m
        21           0.2410            2.60m
        2

       183           0.2077           12.63s
       184           0.2076           11.89s
       185           0.2076           11.15s
       186           0.2075           10.40s
       187           0.2075            9.65s
       188           0.2074            8.91s
       189           0.2072            8.17s
       190           0.2071            7.43s
       191           0.2070            6.69s
       192           0.2069            5.94s
       193           0.2068            5.20s
       194           0.2067            4.46s
       195           0.2066            3.72s
       196           0.2066            2.97s
       197           0.2065            2.23s
       198           0.2065            1.48s
       199           0.2065            0.74s
       200           0.2064            0.00s


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.3, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, presort='auto', random_state=None,
             subsample=1, verbose=2, warm_start=False)

In [241]:
gbpred = gb.predict(train_X)
gb_testpred = gb.predict(test_X)
print('Train RMSE GradientBoostingRegressor: ', RMSLE(train_Y, gbpred))
#print('Test RMSE GradientBoostingRegressor: ', RMSLE(test_Y, gb_testpred))
# 0.537532760577
# 0.552073889385 0.506610734754

Train RMSE GradientBoostingRegressor:  0.454309034713


In [250]:
visitors_pred = np.expm1(gb_testpred)
y_clipped = np.clip(visitors_pred,a_min = 0, a_max=None)
sub_gb= pd.DataFrame({'visitors':y_clipped}, index=data['sample_submission']['id'])
sub_gb.to_csv('gb_submission.csv')

In [None]:
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor


grid_params= {'num_leaves': [32,64,128],
              'max_depth':  [32,64,128]}

def my_grid_RMSLE(ground_truth, predictions):
    return metrics.mean_squared_error(ground_truth, predictions)**0.5

loss  = make_scorer(my_grid_RMSLE, greater_is_better=False)

kf = KFold(n_splits=5, shuffle=False, random_state=None)

lgb = LGBMRegressor(learning_rate=0.1, min_child_samples=20, subsample_freq=1, colsample_bytree=1, metric='l2_root')

gsearch = GridSearchCV(lgb, grid_params, scoring=loss, cv = kf, n_jobs=2, verbose=2)
gsearch.fit(train_X,train_Y)
y_train_pred = gsearch.predict(train_X)
y_eval = gsearch.predict(test_X)
print('TRAIN RMS {}'.format(my_grid_RMSLE(train_Y, y_train_pred)))
print('TEST RMS {}'.format(my_grid_RMSLE(test_Y, y_eval)))

In [63]:
#Lightgbm
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x.values, label=train_y)
lgb_test = lgb.Dataset(test_x.values)

lgbm_pred = np.zeros_like(test_x.values[:,0])

for i in range(5):
    params= {'max_depth': 128,
             'min_data_in_leaf': 1,
             'feature_fraction':1,
             'feature_fraction_seed': 20+i,
             'bagging_fraction': 1,
             'bagging_fraction_seed': 20+i,
             'early_stopping_rounds': 200,
             'metric': 'l2_root',
             'application': 'regression',
             'learning_rate':0.2,
             'num_leaves': 64
            }


    lbm_boost = lgb.train(params, lgb_train, num_boost_round=150, valid_sets=[lgb_train],
                          valid_names=['train'])
    lgbm_pred+=lbm_boost.predict(test_x.values)/5



[1]	train's rmse: 0.713793
Training until validation scores don't improve for 200 rounds.
[2]	train's rmse: 0.645482
[3]	train's rmse: 0.596135
[4]	train's rmse: 0.561194
[5]	train's rmse: 0.536464
[6]	train's rmse: 0.518991
[7]	train's rmse: 0.50636
[8]	train's rmse: 0.497358
[9]	train's rmse: 0.490489
[10]	train's rmse: 0.485204
[11]	train's rmse: 0.48066
[12]	train's rmse: 0.47714
[13]	train's rmse: 0.474032
[14]	train's rmse: 0.471314
[15]	train's rmse: 0.46887
[16]	train's rmse: 0.46673
[17]	train's rmse: 0.46488
[18]	train's rmse: 0.462049
[19]	train's rmse: 0.460423
[20]	train's rmse: 0.458676
[21]	train's rmse: 0.457163
[22]	train's rmse: 0.454844
[23]	train's rmse: 0.453655
[24]	train's rmse: 0.452525
[25]	train's rmse: 0.451589
[26]	train's rmse: 0.450646
[27]	train's rmse: 0.449212
[28]	train's rmse: 0.447661
[29]	train's rmse: 0.446658
[30]	train's rmse: 0.445809
[31]	train's rmse: 0.444987
[32]	train's rmse: 0.44426
[33]	train's rmse: 0.443477
[34]	train's rmse: 0.442121
[

[141]	train's rmse: 0.405897
[142]	train's rmse: 0.405716
[143]	train's rmse: 0.405584
[144]	train's rmse: 0.405309
[145]	train's rmse: 0.40513
[146]	train's rmse: 0.404911
[147]	train's rmse: 0.404754
[148]	train's rmse: 0.404639
[149]	train's rmse: 0.404427
[150]	train's rmse: 0.404264
[1]	train's rmse: 0.713793
Training until validation scores don't improve for 200 rounds.
[2]	train's rmse: 0.645482
[3]	train's rmse: 0.596135
[4]	train's rmse: 0.561194
[5]	train's rmse: 0.536464
[6]	train's rmse: 0.518991
[7]	train's rmse: 0.50636
[8]	train's rmse: 0.497358
[9]	train's rmse: 0.490489
[10]	train's rmse: 0.485204
[11]	train's rmse: 0.48066
[12]	train's rmse: 0.47714
[13]	train's rmse: 0.474032
[14]	train's rmse: 0.471314
[15]	train's rmse: 0.46887
[16]	train's rmse: 0.46673
[17]	train's rmse: 0.46488
[18]	train's rmse: 0.462049
[19]	train's rmse: 0.460423
[20]	train's rmse: 0.458676
[21]	train's rmse: 0.457163
[22]	train's rmse: 0.454844
[23]	train's rmse: 0.453655
[24]	train's rmse: 

[129]	train's rmse: 0.409321
[130]	train's rmse: 0.409016
[131]	train's rmse: 0.408774
[132]	train's rmse: 0.408617
[133]	train's rmse: 0.408326
[134]	train's rmse: 0.408132
[135]	train's rmse: 0.407931
[136]	train's rmse: 0.407325
[137]	train's rmse: 0.407058
[138]	train's rmse: 0.406593
[139]	train's rmse: 0.406451
[140]	train's rmse: 0.40615
[141]	train's rmse: 0.405897
[142]	train's rmse: 0.405716
[143]	train's rmse: 0.405584
[144]	train's rmse: 0.405309
[145]	train's rmse: 0.40513
[146]	train's rmse: 0.404911
[147]	train's rmse: 0.404754
[148]	train's rmse: 0.404639
[149]	train's rmse: 0.404427
[150]	train's rmse: 0.404264
[1]	train's rmse: 0.713793
Training until validation scores don't improve for 200 rounds.
[2]	train's rmse: 0.645482
[3]	train's rmse: 0.596135
[4]	train's rmse: 0.561194
[5]	train's rmse: 0.536464
[6]	train's rmse: 0.518991
[7]	train's rmse: 0.50636
[8]	train's rmse: 0.497358
[9]	train's rmse: 0.490489
[10]	train's rmse: 0.485204
[11]	train's rmse: 0.48066
[12]

In [249]:
visitors_pred = np.expm1(lgbm_pred)
y_clipped = np.clip(visitors_pred,a_min = 0, a_max=None)
sub_lgb= pd.DataFrame({'visitors':y_clipped}, index=data['sample_submission']['id'])
sub_lgb.to_csv('lgbm_submission.csv')

In [179]:
print('RMS Train {}'.format(RMSLE(train_Y, lbm_boost.predict(train_X))))
#0.5366496682543428
print('RMS Test {}'.format(RMSLE(test_Y, lgbm_pred)))

RMS Train 0.399081721298702
RMS Test 0.49138563647734734


In [244]:
import xgboost as xgb

xgb_train = xgb.DMatrix(train_X, label = train_Y)
xgb_test = xgb.DMatrix(test_X)
xgb_pred = np.zeros_like(test_X[:,0])


for i in range(5):
    params = {'booster':'gbtree',
              'eta': 0.1,
              'max_depth': 8,
              'min_child_weight': 1,
              'subsample': 1,
              'colsample_bytree':1,
              'max_leaves': 128,
              'objective': "reg:linear",
              'eval_metric' : 'rmse',
              'seed': 27+i,
              'lambda':1,
              'n_estimators':300
             }

    xgbtrain = xgb.train(params, xgb_train, num_boost_round= 100, 
              evals=[(xgb_train,'train')])
    #early_stopping_rounds=200
    xgb_pred+=xgbtrain.predict(xgb_test)/5

[0]	train-rmse:2.20717
[1]	train-rmse:1.99832
[2]	train-rmse:1.81141
[3]	train-rmse:1.6444
[4]	train-rmse:1.49539
[5]	train-rmse:1.36258
[6]	train-rmse:1.24449
[7]	train-rmse:1.13965
[8]	train-rmse:1.04695
[9]	train-rmse:0.965087
[10]	train-rmse:0.893192
[11]	train-rmse:0.830089
[12]	train-rmse:0.775049
[13]	train-rmse:0.727287
[14]	train-rmse:0.685854
[15]	train-rmse:0.6502
[16]	train-rmse:0.619474
[17]	train-rmse:0.593358
[18]	train-rmse:0.571058
[19]	train-rmse:0.552186
[20]	train-rmse:0.536152
[21]	train-rmse:0.522521
[22]	train-rmse:0.510976
[23]	train-rmse:0.501401
[24]	train-rmse:0.493358
[25]	train-rmse:0.486513
[26]	train-rmse:0.48052
[27]	train-rmse:0.475392
[28]	train-rmse:0.470977
[29]	train-rmse:0.467324
[30]	train-rmse:0.464057
[31]	train-rmse:0.461248
[32]	train-rmse:0.458955
[33]	train-rmse:0.456783
[34]	train-rmse:0.45493
[35]	train-rmse:0.453223
[36]	train-rmse:0.451483
[37]	train-rmse:0.450155
[38]	train-rmse:0.448829
[39]	train-rmse:0.447566
[40]	train-rmse:0.446539

[32]	train-rmse:0.458955
[33]	train-rmse:0.456783
[34]	train-rmse:0.45493
[35]	train-rmse:0.453223
[36]	train-rmse:0.451483
[37]	train-rmse:0.450155
[38]	train-rmse:0.448829
[39]	train-rmse:0.447566
[40]	train-rmse:0.446539
[41]	train-rmse:0.44568
[42]	train-rmse:0.444607
[43]	train-rmse:0.443881
[44]	train-rmse:0.442904
[45]	train-rmse:0.442154
[46]	train-rmse:0.441253
[47]	train-rmse:0.440537
[48]	train-rmse:0.439966
[49]	train-rmse:0.439369
[50]	train-rmse:0.438837
[51]	train-rmse:0.438209
[52]	train-rmse:0.437253
[53]	train-rmse:0.436789
[54]	train-rmse:0.436388
[55]	train-rmse:0.435712
[56]	train-rmse:0.435342
[57]	train-rmse:0.434785
[58]	train-rmse:0.434409
[59]	train-rmse:0.434011
[60]	train-rmse:0.433179
[61]	train-rmse:0.432354
[62]	train-rmse:0.431805
[63]	train-rmse:0.431485
[64]	train-rmse:0.430981
[65]	train-rmse:0.430567
[66]	train-rmse:0.430165
[67]	train-rmse:0.429861
[68]	train-rmse:0.429211
[69]	train-rmse:0.428851
[70]	train-rmse:0.428349
[71]	train-rmse:0.428033
[7

In [245]:
print('RMS Train {}'.format(RMSLE(train_Y, xgbtrain.predict(xgb_train))))
#print('RMS Test {}'.format(RMSLE(test_Y, xgbtrain.predict(xgb_test))))

RMS Train 0.4188985488909083


In [252]:
visitors_pred = np.expm1(xgb_pred)
y_clipped = np.clip(visitors_pred,a_min = 0, a_max=None)
sub_xgbm = pd.DataFrame({'visitors':y_clipped}, index=data['sample_submission']['id'])
sub_xgbm.to_csv('xgb_submission.csv')

In [253]:
y_pred = (0.5*lgbm_pred)+(0.3*xgb_pred)+ (0.2*gb_testpred)
#y_pred = np.expm1(y_pred).clip(lower=0.)
#Submission
#sub1 = pd.DataFrame({'id':data['sample_submission'].id,'visitors':np.expm1(y_pred)})

In [None]:
print('Test RMS: {}'.format(RMSLE(test_Y, y_pred)))

In [254]:
visitors_pred = np.expm1(y_pred)
y_clipped = np.clip(visitors_pred,a_min = 0, a_max=None)
sub_bag = pd.DataFrame({'visitors':y_clipped}, index=data['sample_submission']['id'])

In [255]:
sub_bag.to_csv('submission_bagging.csv')

In [None]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(generations=5,population_size=50,scoring ='neg_mean_squared_error',
                     cv = 3, n_jobs=1, verbosity=3)
tpot.fit(train_X, train_Y)

In [None]:
y_tpot = tpot.predict(test_X)
#print('Test RMS: {}'.format(RMSLE(y_tpot, test_Y)))

In [297]:
#STACKING CODE


#from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook

'''
catboost_params = {'iteration':500,
                  'learning_rate': 0.03,
                  'depth': 6,
                  'loss_function': 'RMSE'}
                  
'''
#KNN_params = {'n_neighbors':4}
LGBM_params = {'num_leaves': 31,
              'max_depth':8,
              'learning_rate':0.1,
              'n_estimators': 100,
              'min_child_samples':20,
              'subsample_freq': 1,
              'colsample_bytree':1.0,
              'metric':'l2_root'}
XGBM_params = {'max_depth':3,
              'learning_rate': 0.1,
              #'estimators':100,
              'subsample':1,
              'colsample_bytree':1}
GB_params = {'learning_rate':0.1, 
             'n_estimators':100,
             'max_depth':5, 
             'subsample':0.9,
             'max_features':0.9}



#cat = CatBoostRegressor(**catboost_params)
#knn = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=30)
lgb = LGBMRegressor(**LGBM_params)
xgb = XGBRegressor(**XGBM_params)
gb = GradientBoostingRegressor(**GB_params)


kf = KFold(n_splits=3)
models = [lgb, xgb, gb]


second_layers = np.zeros_like(train_X[:,:3])
test_layers = np.zeros_like(test_X[:,:3])

'''
train_X_1, train_X_2, train_Y_1, train_Y_2 = train_test_split(train_X, train_Y, test_size=0.5)
for i,mod in enumerate(tqdm_notebook(models)):    
        mod.fit(train_X_1, train_Y_1)
        second_layers[test_idx,i] = mod.predict(train_X_2)   
        test_layers[:,i] = mod.predict(test_X)
'''

for i,mod in enumerate(tqdm_notebook(models)):    
    for train_idx, test_idx in kf.split(train_X):
        tr_X, tr_Y = train_X[train_idx], train_Y[train_idx]
        te_X, te_Y = train_X[test_idx], train_Y[test_idx]
        mod.fit(tr_X, tr_Y)
        second_layers[test_idx,i] = mod.predict(te_X)   
        test_layers[:,i] += mod.predict(test_X)/3
        
#High base learner
import xgboost
y_pred_stack = np.zeros_like(test_layers[:,0])
xgb_train = xgboost.DMatrix(second_layers, label=train_Y)
xgb_test = xgboost.DMatrix(test_layers)
for i in range(5):
    params = {'booster':'gbtree',
              'eta': 0.1,
              'max_depth': 8,
              'min_child_weight': 1,
              'subsample': 1,
              'colsample_bytree':1,
              'max_leaves': 128,
              'objective': "reg:linear",
              'eval_metric' : 'rmse',
              'seed': 27+i,
              'lambda':0,
              'n_estimators':1000
             }

    xgbtrain = xgboost.train(params, xgb_train, num_boost_round= 1000, 
              evals=[(xgb_train,'train')],early_stopping_rounds=200)
    xgb_pred+=xgbtrain.predict(xgb_test)/5




Exception in thread Thread-124:
Traceback (most recent call last):
  File "/Users/ahmedaleshinloye/anaconda/envs/py36/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/ahmedaleshinloye/anaconda/envs/py36/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/ahmedaleshinloye/anaconda/envs/py36/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




[0]	train-rmse:2.20717
Will train until train-rmse hasn't improved in 200 rounds.
[1]	train-rmse:1.99823
[2]	train-rmse:1.8114
[3]	train-rmse:1.64455
[4]	train-rmse:1.49581
[5]	train-rmse:1.36346
[6]	train-rmse:1.24597
[7]	train-rmse:1.14195
[8]	train-rmse:1.05014
[9]	train-rmse:0.969385
[10]	train-rmse:0.89866
[11]	train-rmse:0.836965
[12]	train-rmse:0.783415
[13]	train-rmse:0.737137
[14]	train-rmse:0.69738
[15]	train-rmse:0.663409
[16]	train-rmse:0.634543
[17]	train-rmse:0.610118
[18]	train-rmse:0.589551
[19]	train-rmse:0.572296
[20]	train-rmse:0.557924
[21]	train-rmse:0.545954
[22]	train-rmse:0.536001
[23]	train-rmse:0.527764
[24]	train-rmse:0.520988
[25]	train-rmse:0.515436
[26]	train-rmse:0.510852
[27]	train-rmse:0.507074
[28]	train-rmse:0.503984
[29]	train-rmse:0.501448
[30]	train-rmse:0.499326
[31]	train-rmse:0.497616
[32]	train-rmse:0.496162
[33]	train-rmse:0.494995
[34]	train-rmse:0.494009
[35]	train-rmse:0.493211
[36]	train-rmse:0.492522
[37]	train-rmse:0.491912
[38]	train-r

[319]	train-rmse:0.472057
[320]	train-rmse:0.471984
[321]	train-rmse:0.471934
[322]	train-rmse:0.471874
[323]	train-rmse:0.471814
[324]	train-rmse:0.471761
[325]	train-rmse:0.47176
[326]	train-rmse:0.471677
[327]	train-rmse:0.47167
[328]	train-rmse:0.471656
[329]	train-rmse:0.471612
[330]	train-rmse:0.471585
[331]	train-rmse:0.471508
[332]	train-rmse:0.47145
[333]	train-rmse:0.471375
[334]	train-rmse:0.471293
[335]	train-rmse:0.471171
[336]	train-rmse:0.471105
[337]	train-rmse:0.471077
[338]	train-rmse:0.471049
[339]	train-rmse:0.470996
[340]	train-rmse:0.470936
[341]	train-rmse:0.470851
[342]	train-rmse:0.470782
[343]	train-rmse:0.4707
[344]	train-rmse:0.470652
[345]	train-rmse:0.470558
[346]	train-rmse:0.470467
[347]	train-rmse:0.470436
[348]	train-rmse:0.470393
[349]	train-rmse:0.470352
[350]	train-rmse:0.470305
[351]	train-rmse:0.470252
[352]	train-rmse:0.470224
[353]	train-rmse:0.470183
[354]	train-rmse:0.470137
[355]	train-rmse:0.470096
[356]	train-rmse:0.47006
[357]	train-rmse:0

[636]	train-rmse:0.455513
[637]	train-rmse:0.455466
[638]	train-rmse:0.455421
[639]	train-rmse:0.455375
[640]	train-rmse:0.455343
[641]	train-rmse:0.455299
[642]	train-rmse:0.455237
[643]	train-rmse:0.455213
[644]	train-rmse:0.455181
[645]	train-rmse:0.455128
[646]	train-rmse:0.455094
[647]	train-rmse:0.455066
[648]	train-rmse:0.455056
[649]	train-rmse:0.455031
[650]	train-rmse:0.454972
[651]	train-rmse:0.454919
[652]	train-rmse:0.454836
[653]	train-rmse:0.454791
[654]	train-rmse:0.454768
[655]	train-rmse:0.454732
[656]	train-rmse:0.4547
[657]	train-rmse:0.454693
[658]	train-rmse:0.454662
[659]	train-rmse:0.454659
[660]	train-rmse:0.454652
[661]	train-rmse:0.454652
[662]	train-rmse:0.454614
[663]	train-rmse:0.454581
[664]	train-rmse:0.454539
[665]	train-rmse:0.45451
[666]	train-rmse:0.454457
[667]	train-rmse:0.454397
[668]	train-rmse:0.454342
[669]	train-rmse:0.454273
[670]	train-rmse:0.45425
[671]	train-rmse:0.454179
[672]	train-rmse:0.454109
[673]	train-rmse:0.454064
[674]	train-rmse

[953]	train-rmse:0.442726
[954]	train-rmse:0.442688
[955]	train-rmse:0.442636
[956]	train-rmse:0.442589
[957]	train-rmse:0.44256
[958]	train-rmse:0.442514
[959]	train-rmse:0.442463
[960]	train-rmse:0.442409
[961]	train-rmse:0.442362
[962]	train-rmse:0.442297
[963]	train-rmse:0.442217
[964]	train-rmse:0.44217
[965]	train-rmse:0.442126
[966]	train-rmse:0.442081
[967]	train-rmse:0.442038
[968]	train-rmse:0.44201
[969]	train-rmse:0.441973
[970]	train-rmse:0.441927
[971]	train-rmse:0.441888
[972]	train-rmse:0.441839
[973]	train-rmse:0.441785
[974]	train-rmse:0.441755
[975]	train-rmse:0.441714
[976]	train-rmse:0.441662
[977]	train-rmse:0.441614
[978]	train-rmse:0.441566
[979]	train-rmse:0.44154
[980]	train-rmse:0.441493
[981]	train-rmse:0.441453
[982]	train-rmse:0.441431
[983]	train-rmse:0.441406
[984]	train-rmse:0.441334
[985]	train-rmse:0.441295
[986]	train-rmse:0.441257
[987]	train-rmse:0.441233
[988]	train-rmse:0.44121
[989]	train-rmse:0.441189
[990]	train-rmse:0.441147
[991]	train-rmse:

[272]	train-rmse:0.47453
[273]	train-rmse:0.474485
[274]	train-rmse:0.474439
[275]	train-rmse:0.474399
[276]	train-rmse:0.474341
[277]	train-rmse:0.474306
[278]	train-rmse:0.474298
[279]	train-rmse:0.474233
[280]	train-rmse:0.474167
[281]	train-rmse:0.474074
[282]	train-rmse:0.474062
[283]	train-rmse:0.474033
[284]	train-rmse:0.473958
[285]	train-rmse:0.473882
[286]	train-rmse:0.473859
[287]	train-rmse:0.473846
[288]	train-rmse:0.473801
[289]	train-rmse:0.473736
[290]	train-rmse:0.473689
[291]	train-rmse:0.473658
[292]	train-rmse:0.473628
[293]	train-rmse:0.473565
[294]	train-rmse:0.473497
[295]	train-rmse:0.473448
[296]	train-rmse:0.473431
[297]	train-rmse:0.473348
[298]	train-rmse:0.473271
[299]	train-rmse:0.473233
[300]	train-rmse:0.473177
[301]	train-rmse:0.473137
[302]	train-rmse:0.473065
[303]	train-rmse:0.47299
[304]	train-rmse:0.472902
[305]	train-rmse:0.47285
[306]	train-rmse:0.472804
[307]	train-rmse:0.472716
[308]	train-rmse:0.472657
[309]	train-rmse:0.472587
[310]	train-rms

[589]	train-rmse:0.457869
[590]	train-rmse:0.457816
[591]	train-rmse:0.457777
[592]	train-rmse:0.457698
[593]	train-rmse:0.457618
[594]	train-rmse:0.457559
[595]	train-rmse:0.457512
[596]	train-rmse:0.457439
[597]	train-rmse:0.4574
[598]	train-rmse:0.45735
[599]	train-rmse:0.457294
[600]	train-rmse:0.457206
[601]	train-rmse:0.457144
[602]	train-rmse:0.457061
[603]	train-rmse:0.456996
[604]	train-rmse:0.45693
[605]	train-rmse:0.456858
[606]	train-rmse:0.456817
[607]	train-rmse:0.456731
[608]	train-rmse:0.456688
[609]	train-rmse:0.456654
[610]	train-rmse:0.456615
[611]	train-rmse:0.456598
[612]	train-rmse:0.456583
[613]	train-rmse:0.456566
[614]	train-rmse:0.456532
[615]	train-rmse:0.456528
[616]	train-rmse:0.456478
[617]	train-rmse:0.45642
[618]	train-rmse:0.456372
[619]	train-rmse:0.456348
[620]	train-rmse:0.456329
[621]	train-rmse:0.456268
[622]	train-rmse:0.4562
[623]	train-rmse:0.456152
[624]	train-rmse:0.456071
[625]	train-rmse:0.456035
[626]	train-rmse:0.456005
[627]	train-rmse:0.

[906]	train-rmse:0.444819
[907]	train-rmse:0.444746
[908]	train-rmse:0.444722
[909]	train-rmse:0.444645
[910]	train-rmse:0.444613
[911]	train-rmse:0.444546
[912]	train-rmse:0.444483
[913]	train-rmse:0.444406
[914]	train-rmse:0.44433
[915]	train-rmse:0.444196
[916]	train-rmse:0.444144
[917]	train-rmse:0.444102
[918]	train-rmse:0.444048
[919]	train-rmse:0.444022
[920]	train-rmse:0.443971
[921]	train-rmse:0.443934
[922]	train-rmse:0.443884
[923]	train-rmse:0.443852
[924]	train-rmse:0.443806
[925]	train-rmse:0.443753
[926]	train-rmse:0.443711
[927]	train-rmse:0.443669
[928]	train-rmse:0.443612
[929]	train-rmse:0.443576
[930]	train-rmse:0.443491
[931]	train-rmse:0.443417
[932]	train-rmse:0.443384
[933]	train-rmse:0.443354
[934]	train-rmse:0.443343
[935]	train-rmse:0.443341
[936]	train-rmse:0.443339
[937]	train-rmse:0.44333
[938]	train-rmse:0.443323
[939]	train-rmse:0.44329
[940]	train-rmse:0.443265
[941]	train-rmse:0.443206
[942]	train-rmse:0.443152
[943]	train-rmse:0.44311
[944]	train-rmse

[225]	train-rmse:0.477369
[226]	train-rmse:0.477284
[227]	train-rmse:0.477217
[228]	train-rmse:0.477099
[229]	train-rmse:0.477064
[230]	train-rmse:0.477034
[231]	train-rmse:0.476988
[232]	train-rmse:0.476926
[233]	train-rmse:0.476857
[234]	train-rmse:0.476783
[235]	train-rmse:0.476692
[236]	train-rmse:0.47664
[237]	train-rmse:0.476584
[238]	train-rmse:0.476513
[239]	train-rmse:0.476461
[240]	train-rmse:0.476363
[241]	train-rmse:0.476311
[242]	train-rmse:0.47624
[243]	train-rmse:0.47618
[244]	train-rmse:0.47613
[245]	train-rmse:0.476074
[246]	train-rmse:0.476016
[247]	train-rmse:0.475957
[248]	train-rmse:0.475884
[249]	train-rmse:0.475817
[250]	train-rmse:0.475762
[251]	train-rmse:0.475685
[252]	train-rmse:0.475622
[253]	train-rmse:0.475557
[254]	train-rmse:0.475513
[255]	train-rmse:0.475475
[256]	train-rmse:0.475383
[257]	train-rmse:0.475333
[258]	train-rmse:0.475262
[259]	train-rmse:0.475248
[260]	train-rmse:0.475217
[261]	train-rmse:0.475126
[262]	train-rmse:0.475062
[263]	train-rmse

[542]	train-rmse:0.460301
[543]	train-rmse:0.460292
[544]	train-rmse:0.460271
[545]	train-rmse:0.460257
[546]	train-rmse:0.460245
[547]	train-rmse:0.460196
[548]	train-rmse:0.460163
[549]	train-rmse:0.460135
[550]	train-rmse:0.460106
[551]	train-rmse:0.460075
[552]	train-rmse:0.460023
[553]	train-rmse:0.460023
[554]	train-rmse:0.459982
[555]	train-rmse:0.459936
[556]	train-rmse:0.459877
[557]	train-rmse:0.459824
[558]	train-rmse:0.459785
[559]	train-rmse:0.459757
[560]	train-rmse:0.459692
[561]	train-rmse:0.45963
[562]	train-rmse:0.459562
[563]	train-rmse:0.459544
[564]	train-rmse:0.459483
[565]	train-rmse:0.459408
[566]	train-rmse:0.459336
[567]	train-rmse:0.45928
[568]	train-rmse:0.459228
[569]	train-rmse:0.459178
[570]	train-rmse:0.459111
[571]	train-rmse:0.459013
[572]	train-rmse:0.458942
[573]	train-rmse:0.458884
[574]	train-rmse:0.458803
[575]	train-rmse:0.458736
[576]	train-rmse:0.458676
[577]	train-rmse:0.458601
[578]	train-rmse:0.458545
[579]	train-rmse:0.458484
[580]	train-rm

[859]	train-rmse:0.446802
[860]	train-rmse:0.446788
[861]	train-rmse:0.44675
[862]	train-rmse:0.446714
[863]	train-rmse:0.446712
[864]	train-rmse:0.446657
[865]	train-rmse:0.446652
[866]	train-rmse:0.446642
[867]	train-rmse:0.446638
[868]	train-rmse:0.446633
[869]	train-rmse:0.446619
[870]	train-rmse:0.446611
[871]	train-rmse:0.446598
[872]	train-rmse:0.446594
[873]	train-rmse:0.446593
[874]	train-rmse:0.446569
[875]	train-rmse:0.446503
[876]	train-rmse:0.446408
[877]	train-rmse:0.446332
[878]	train-rmse:0.446261
[879]	train-rmse:0.446214
[880]	train-rmse:0.446152
[881]	train-rmse:0.446103
[882]	train-rmse:0.446033
[883]	train-rmse:0.445989
[884]	train-rmse:0.445947
[885]	train-rmse:0.445905
[886]	train-rmse:0.445882
[887]	train-rmse:0.44585
[888]	train-rmse:0.445817
[889]	train-rmse:0.445784
[890]	train-rmse:0.445744
[891]	train-rmse:0.445705
[892]	train-rmse:0.445643
[893]	train-rmse:0.445642
[894]	train-rmse:0.445602
[895]	train-rmse:0.44556
[896]	train-rmse:0.445482
[897]	train-rms

[178]	train-rmse:0.480635
[179]	train-rmse:0.480589
[180]	train-rmse:0.480498
[181]	train-rmse:0.480405
[182]	train-rmse:0.480327
[183]	train-rmse:0.48026
[184]	train-rmse:0.480202
[185]	train-rmse:0.480188
[186]	train-rmse:0.480142
[187]	train-rmse:0.480136
[188]	train-rmse:0.480062
[189]	train-rmse:0.479992
[190]	train-rmse:0.479924
[191]	train-rmse:0.479901
[192]	train-rmse:0.479883
[193]	train-rmse:0.479837
[194]	train-rmse:0.479787
[195]	train-rmse:0.479695
[196]	train-rmse:0.479632
[197]	train-rmse:0.479557
[198]	train-rmse:0.479471
[199]	train-rmse:0.479358
[200]	train-rmse:0.479271
[201]	train-rmse:0.479186
[202]	train-rmse:0.479109
[203]	train-rmse:0.479005
[204]	train-rmse:0.478889
[205]	train-rmse:0.478757
[206]	train-rmse:0.478684
[207]	train-rmse:0.478651
[208]	train-rmse:0.478624
[209]	train-rmse:0.478503
[210]	train-rmse:0.478467
[211]	train-rmse:0.478431
[212]	train-rmse:0.478349
[213]	train-rmse:0.47826
[214]	train-rmse:0.478152
[215]	train-rmse:0.478034
[216]	train-rm

[495]	train-rmse:0.462702
[496]	train-rmse:0.462665
[497]	train-rmse:0.46263
[498]	train-rmse:0.462596
[499]	train-rmse:0.462561
[500]	train-rmse:0.462525
[501]	train-rmse:0.462496
[502]	train-rmse:0.462472
[503]	train-rmse:0.462431
[504]	train-rmse:0.462395
[505]	train-rmse:0.462332
[506]	train-rmse:0.462274
[507]	train-rmse:0.462217
[508]	train-rmse:0.462164
[509]	train-rmse:0.462074
[510]	train-rmse:0.462012
[511]	train-rmse:0.461927
[512]	train-rmse:0.461877
[513]	train-rmse:0.461826
[514]	train-rmse:0.461769
[515]	train-rmse:0.461716
[516]	train-rmse:0.461636
[517]	train-rmse:0.461548
[518]	train-rmse:0.461547
[519]	train-rmse:0.461518
[520]	train-rmse:0.461471
[521]	train-rmse:0.461428
[522]	train-rmse:0.46134
[523]	train-rmse:0.461294
[524]	train-rmse:0.461218
[525]	train-rmse:0.461194
[526]	train-rmse:0.461164
[527]	train-rmse:0.461141
[528]	train-rmse:0.461098
[529]	train-rmse:0.461036
[530]	train-rmse:0.461005
[531]	train-rmse:0.460946
[532]	train-rmse:0.460925
[533]	train-rm

[812]	train-rmse:0.448358
[813]	train-rmse:0.448346
[814]	train-rmse:0.448325
[815]	train-rmse:0.448318
[816]	train-rmse:0.448312
[817]	train-rmse:0.448295
[818]	train-rmse:0.448267
[819]	train-rmse:0.448207
[820]	train-rmse:0.448164
[821]	train-rmse:0.448059
[822]	train-rmse:0.448002
[823]	train-rmse:0.447977
[824]	train-rmse:0.447972
[825]	train-rmse:0.44797
[826]	train-rmse:0.447917
[827]	train-rmse:0.447865
[828]	train-rmse:0.447839
[829]	train-rmse:0.447839
[830]	train-rmse:0.44782
[831]	train-rmse:0.447782
[832]	train-rmse:0.447716
[833]	train-rmse:0.447688
[834]	train-rmse:0.447637
[835]	train-rmse:0.447583
[836]	train-rmse:0.447553
[837]	train-rmse:0.44751
[838]	train-rmse:0.447474
[839]	train-rmse:0.44743
[840]	train-rmse:0.447423
[841]	train-rmse:0.447401
[842]	train-rmse:0.447376
[843]	train-rmse:0.447326
[844]	train-rmse:0.447307
[845]	train-rmse:0.44728
[846]	train-rmse:0.447255
[847]	train-rmse:0.447232
[848]	train-rmse:0.447196
[849]	train-rmse:0.447179
[850]	train-rmse:

[131]	train-rmse:0.483536
[132]	train-rmse:0.483486
[133]	train-rmse:0.483481
[134]	train-rmse:0.483478
[135]	train-rmse:0.483411
[136]	train-rmse:0.483351
[137]	train-rmse:0.483268
[138]	train-rmse:0.483202
[139]	train-rmse:0.483124
[140]	train-rmse:0.483008
[141]	train-rmse:0.482954
[142]	train-rmse:0.482897
[143]	train-rmse:0.482836
[144]	train-rmse:0.48283
[145]	train-rmse:0.482733
[146]	train-rmse:0.482622
[147]	train-rmse:0.48254
[148]	train-rmse:0.482514
[149]	train-rmse:0.482448
[150]	train-rmse:0.482412
[151]	train-rmse:0.482355
[152]	train-rmse:0.482344
[153]	train-rmse:0.482287
[154]	train-rmse:0.482243
[155]	train-rmse:0.48219
[156]	train-rmse:0.482131
[157]	train-rmse:0.482048
[158]	train-rmse:0.481932
[159]	train-rmse:0.481817
[160]	train-rmse:0.481743
[161]	train-rmse:0.481647
[162]	train-rmse:0.481588
[163]	train-rmse:0.481513
[164]	train-rmse:0.481487
[165]	train-rmse:0.481476
[166]	train-rmse:0.481453
[167]	train-rmse:0.481448
[168]	train-rmse:0.481353
[169]	train-rms

[448]	train-rmse:0.464896
[449]	train-rmse:0.464843
[450]	train-rmse:0.464763
[451]	train-rmse:0.464726
[452]	train-rmse:0.464657
[453]	train-rmse:0.464583
[454]	train-rmse:0.464555
[455]	train-rmse:0.46446
[456]	train-rmse:0.46441
[457]	train-rmse:0.464358
[458]	train-rmse:0.464279
[459]	train-rmse:0.464241
[460]	train-rmse:0.464193
[461]	train-rmse:0.46414
[462]	train-rmse:0.464084
[463]	train-rmse:0.464053
[464]	train-rmse:0.464003
[465]	train-rmse:0.463966
[466]	train-rmse:0.463942
[467]	train-rmse:0.463894
[468]	train-rmse:0.46381
[469]	train-rmse:0.463747
[470]	train-rmse:0.463664
[471]	train-rmse:0.463651
[472]	train-rmse:0.463603
[473]	train-rmse:0.463542
[474]	train-rmse:0.463478
[475]	train-rmse:0.463427
[476]	train-rmse:0.463387
[477]	train-rmse:0.46336
[478]	train-rmse:0.463327
[479]	train-rmse:0.46329
[480]	train-rmse:0.463282
[481]	train-rmse:0.463246
[482]	train-rmse:0.463219
[483]	train-rmse:0.463199
[484]	train-rmse:0.463179
[485]	train-rmse:0.463156
[486]	train-rmse:0

[765]	train-rmse:0.450406
[766]	train-rmse:0.450355
[767]	train-rmse:0.45032
[768]	train-rmse:0.450279
[769]	train-rmse:0.450265
[770]	train-rmse:0.450217
[771]	train-rmse:0.450186
[772]	train-rmse:0.450153
[773]	train-rmse:0.450149
[774]	train-rmse:0.450141
[775]	train-rmse:0.450085
[776]	train-rmse:0.450037
[777]	train-rmse:0.449998
[778]	train-rmse:0.449926
[779]	train-rmse:0.449879
[780]	train-rmse:0.449856
[781]	train-rmse:0.449775
[782]	train-rmse:0.449728
[783]	train-rmse:0.449677
[784]	train-rmse:0.449625
[785]	train-rmse:0.449573
[786]	train-rmse:0.449531
[787]	train-rmse:0.449489
[788]	train-rmse:0.449438
[789]	train-rmse:0.449398
[790]	train-rmse:0.449324
[791]	train-rmse:0.449269
[792]	train-rmse:0.449201
[793]	train-rmse:0.449176
[794]	train-rmse:0.449134
[795]	train-rmse:0.449088
[796]	train-rmse:0.449031
[797]	train-rmse:0.448986
[798]	train-rmse:0.44892
[799]	train-rmse:0.448872
[800]	train-rmse:0.448815
[801]	train-rmse:0.448752
[802]	train-rmse:0.448734
[803]	train-rm

In [299]:
visitors_pred = np.expm1(xgb_pred)
y_stack = np.clip(visitors_pred,0., None)
sub_stacked = pd.DataFrame({'visitors':y_stack}, index=data['sample_submission']['id'])

In [None]:
sub_stacked.to_csv('submission_stacking.csv')

In [283]:
from __future__ import division
# from hklee
# https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code

dfs = {
    'air_visit_data': pd.read_csv('Data/air_visit_data.csv'),
    'air_store_info': pd.read_csv('Data/air_store_info.csv'),
    'hpg_store_info': pd.read_csv('Data/hpg_store_info.csv'),
    'air_reserve': pd.read_csv('Data/air_reserve.csv'),
    'hpg_reserve': pd.read_csv('Data/hpg_reserve.csv'),
    'store_id_relation': pd.read_csv('Data/store_id_relation.csv'),
    'sample_submission': pd.read_csv('Data/sample_submission.csv'),
    'date_info': pd.read_csv('Data/date_info.csv')
    }
for k, v in dfs.items(): locals()[k] = v

In [284]:
wkend_holidays = date_info.apply(
    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  

In [285]:
date_info.head()

Unnamed: 0,calendar_date,day_of_week,holiday_flg,weight
0,2016-01-01,Friday,1,2.707368e-14
1,2016-01-02,Saturday,0,8.663577e-13
2,2016-01-03,Sunday,0,6.578904e-12
3,2016-01-04,Monday,0,2.772345e-11
4,2016-01-05,Tuesday,0,8.460525e-11


In [286]:
visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

In [287]:
visit_data.head()

Unnamed: 0,air_store_id,visit_date,visitors,day_of_week,holiday_flg,weight
0,air_ba937bf13d40fb24,2016-01-13,3.258097,Wednesday,0,1.005227e-08
1,air_ba937bf13d40fb24,2016-01-14,3.496508,Thursday,0,1.456087e-08
2,air_ba937bf13d40fb24,2016-01-15,3.401197,Friday,0,2.055908e-08
3,air_ba937bf13d40fb24,2016-01-16,3.135494,Saturday,0,2.838881e-08
4,air_ba937bf13d40fb24,2016-01-18,1.94591,Monday,0,5.115756e-08


In [288]:
wmean = lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )
visitors = visit_data.groupby(['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors.rename(columns={0:'visitors'}, inplace=True) # cumbersome, should be better ways.

In [289]:
visitors.head(5)

Unnamed: 0,air_store_id,day_of_week,holiday_flg,visitors
0,air_00a91d42b08b08d9,Friday,0,3.583535
1,air_00a91d42b08b08d9,Monday,0,3.203625
2,air_00a91d42b08b08d9,Monday,1,3.091042
3,air_00a91d42b08b08d9,Saturday,0,2.524065
4,air_00a91d42b08b08d9,Sunday,0,1.098612


In [290]:
sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(visitors, on=[
    'air_store_id', 'day_of_week', 'holiday_flg'], how='left')

In [291]:
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), 
    how='left')['visitors_y'].values

In [292]:
sample_submission.head()

Unnamed: 0,id,air_store_id,calendar_date,day_of_week,holiday_flg,weight,visitors
0,air_00a91d42b08b08d9_2017-04-23,air_00a91d42b08b08d9,2017-04-23,Sunday,0,0.682692,1.098612
1,air_00a91d42b08b08d9_2017-04-24,air_00a91d42b08b08d9,2017-04-24,Monday,0,0.689848,3.203625
2,air_00a91d42b08b08d9_2017-04-25,air_00a91d42b08b08d9,2017-04-25,Tuesday,0,0.697064,3.325868
3,air_00a91d42b08b08d9_2017-04-26,air_00a91d42b08b08d9,2017-04-26,Wednesday,0,0.70434,3.353439
4,air_00a91d42b08b08d9_2017-04-27,air_00a91d42b08b08d9,2017-04-27,Thursday,0,0.711677,3.475056


In [293]:
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), 
    on='air_store_id', how='left')['visitors_y'].values

In [294]:
sample_submission['visitors'] = sample_submission.visitors.map(pd.np.expm1)

In [295]:
sub2 = sample_submission[['id', 'visitors']].copy()

In [304]:
sub_gb.reset_index(inplace=True)
sub_lgb.reset_index(inplace=True)
sub_xgbm.reset_index(inplace=True)

In [321]:
all_sub = pd.merge(pd.merge(pd.merge(pd.merge(sub_gb,sub_lgb,on='id').rename(columns={'visitors_x':'visitors_gb','visitors_y':'visitors_lgb'}),sub_xgbm,on='id').rename(columns={'visitors':'visitors_xgb'}),sub1_ml, on='id').rename(columns={'visitors':'visitors_ml'}),sub2, on='id').rename(columns={'visitors':'visitors_wm'})

In [326]:
all_sub.head()

Unnamed: 0,id,visitors_gb,visitors_lgb,visitors_xgb,visitors_ml,visitors_wm,visitors
0,air_00a91d42b08b08d9_2017-04-23,10.58954,7.878412,11.541053,6.773113,2.0,4.486556
1,air_00a91d42b08b08d9_2017-04-24,21.306686,21.954512,22.174768,14.875727,23.621632,20.429761
2,air_00a91d42b08b08d9_2017-04-25,18.716706,20.708854,22.54571,16.797157,26.82313,23.1513
3,air_00a91d42b08b08d9_2017-04-26,23.545714,26.206827,27.073663,19.424539,27.60092,24.892775
4,air_00a91d42b08b08d9_2017-04-27,29.271052,28.645922,28.614626,20.433269,31.299646,27.43144


In [327]:
all_sub['visitors'] = 0.5*all_sub['visitors_ml'] + 0.5*all_sub['visitors_wm']*1.1
#sub_merge = sub_merge.drop_duplicates(['id'], keep='first')
all_sub[['id', 'visitors_ml']].rename(columns={'visitors_ml':'visitors'}).to_csv('submissionml_wm.csv', index=False)

In [362]:
#0.48 submission
sub1_48 = pd.read_csv('submission_0.48.csv')
sub1_48.rename(columns={'visitors': 'visitors_48'}, inplace=True)
sub_merge = pd.merge(sub_merge, sub1_48, on='id', how='inner')

In [376]:

sub_merge['new_visitors'] = (sub_merge['visitors']*alpha) + (sub_merge['visitors_48']* (1-alpha))

In [377]:
sub_new = sub_merge[['id', 'new_visitors']].rename(columns ={'new_visitors': 'visitors'})

In [378]:
sub_new.to_csv('sub_new.csv', index=False)

In [344]:
sub_merge = pd.merge(sub_stacked, sub2, on='id', how='inner')
sub_merge['visitors'] = 0.5375*sub_merge['visitors_x'] + (0.55)*sub_merge['visitors_y']
sub_merge = sub_merge.drop_duplicates(['id'], keep='first')
sub_merge[['id', 'visitors']].to_csv('submission_1233v2.csv', index=False)