In [113]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, pipeline, metrics, grid_search


In [114]:
def rmsle(h, y): 
    """
    Compute the Root Mean Squared Log Error for hypthesis h and targets y

    Args:
        h - numpy array containing predictions with shape (n_samples, n_targets)
        y - numpy array containing targets with shape (n_samples, n_targets)
    """
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())

In [115]:
weather = pd.read_csv('weather/weather_ready_to_use.csv')
weather['visit_date'] = pd.to_datetime(weather['visit_date'],format= '%Y-%m-%d')

In [116]:
air_reserve = pd.read_csv('air_reserve.csv')
hpg_reserve = pd.read_csv('hpg_reserve.csv')

In [117]:
mean_day_wise = pd.read_csv('mean_day_wise.csv')

In [118]:
hpg_store_info = pd.read_csv('hpg_store_info.csv')
air_store_info = pd.read_csv('air_store_info.csv')

In [119]:
hpg_store_info.columns

Index([u'hpg_store_id', u'hpg_genre_name', u'hpg_area_name', u'latitude',
       u'longitude'],
      dtype='object')

In [120]:
n1 = hpg_store_info.groupby('hpg_genre_name').agg('count')['hpg_store_id'].reset_index()
n1.columns = ['hpg_genre_name','air_genre_name_c']

n2 = hpg_store_info.groupby('hpg_area_name').agg('count')['hpg_store_id'].reset_index()
n2.columns = ['hpg_area_name','air_area_name_c']

In [121]:
air_tf = list(air_store_info.apply(lambda x:'%s %s' % (x['air_area_name'],x['air_genre_name']),axis=1))

tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
tfv.fit(air_tf)



TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 5), norm=u'l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [122]:
air_visit_data = pd.read_csv('air_visit_data.csv')
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['air_store_id'] = sample_submission['id'].apply(lambda x: x.split('_')[0]+str('_') +x.split('_')[1])
sample_submission['visit_date'] = sample_submission['id'].apply(lambda x: x.split('_')[2])
sample_submission['visit_date'] = pd.to_datetime(sample_submission['visit_date'],format= '%Y-%m-%d %H:%M:%S')

In [123]:
air_visit_data = air_visit_data.merge(air_store_info,how = 'left',on= 'air_store_id')
sample_submission = sample_submission.merge(air_store_info,how = 'left',on= 'air_store_id')

In [124]:
air_visit_data['id'] = air_visit_data['air_store_id'] + str('_') + air_visit_data['visit_date']
new_train = pd.read_csv('new_train.csv')
air_visit_data = air_visit_data.merge(new_train,on='id',how= 'left')
air_visit_data = air_visit_data.drop('id',axis=1)
air_visit_data['visit_date'] = pd.to_datetime(air_visit_data['visit_date'],format= '%Y-%m-%d')
air_visit_data.visit_date.min()


Timestamp('2016-01-01 00:00:00')

In [125]:
new_test =  pd.read_csv('new_test.csv')
sample_submission =  sample_submission.merge(new_test,how = 'left',on='id')

In [126]:
"""
def f(x):
    try:
        return x.split('-')[2]
    except:
        return -1

air_store_info['lol3'] = air_store_info.air_area_name.apply(f)
air_store_info['lol1'] = air_store_info.air_area_name.apply(lambda x: x.split('-')[0])
air_store_info['lol2'] = air_store_info.air_area_name.apply(lambda x: x.split('-')[1])
"""

"\ndef f(x):\n    try:\n        return x.split('-')[2]\n    except:\n        return -1\n\nair_store_info['lol3'] = air_store_info.air_area_name.apply(f)\nair_store_info['lol1'] = air_store_info.air_area_name.apply(lambda x: x.split('-')[0])\nair_store_info['lol2'] = air_store_info.air_area_name.apply(lambda x: x.split('-')[1])\n"

In [127]:
air_store_info.columns

Index([u'air_store_id', u'air_genre_name', u'air_area_name', u'latitude',
       u'longitude'],
      dtype='object')

In [128]:
air_reserve['visit_datetime'] = pd.to_datetime(air_reserve['visit_datetime'],format= '%Y-%m-%d %H:%M:%S')
air_reserve['visit_date'] = air_reserve.visit_datetime.apply(lambda x: str(x).split(' ')[0])
air_reserve['visit_date'] = pd.to_datetime(air_reserve['visit_date'],format= '%Y-%m-%d %H:%M:%S')

In [129]:
hpg_reserve['visit_datetime'] = pd.to_datetime(hpg_reserve['visit_datetime'],format= '%Y-%m-%d %H:%M:%S')
hpg_reserve['visit_date'] = hpg_reserve.visit_datetime.apply(lambda x: str(x).split(' ')[0])
hpg_reserve['visit_date'] = pd.to_datetime(hpg_reserve['visit_date'],format= '%Y-%m-%d %H:%M:%S')

In [130]:
for i in ['reserve_datetime','reserve_visitors']:
        k = air_reserve[[i,'visit_date','air_store_id']].groupby(['visit_date','air_store_id'])[i].apply(lambda x: x.tolist()).reset_index()
        name = i + 'list'
        if i == 'reserve_datetime':
            k1  = k.copy()
        else:
            k1[name] = k[i].copy()
        print i


reserve_datetime
reserve_visitors


In [131]:

air_visit_data = air_visit_data.merge(k1,on=['air_store_id','visit_date'],how = 'left')
sample_submission = sample_submission.merge(k1,on=['air_store_id','visit_date'],how = 'left')

In [132]:
air_visit_data['visit_date_month'] =air_visit_data.visit_date.dt.month
air_visit_data['visit_date_dayofw'] =air_visit_data.visit_date.dt.dayofweek
air_visit_data['visit_date_year'] =air_visit_data.visit_date.dt.year
air_visit_data['visit_date_dayofm'] =air_visit_data.visit_date.dt.day
air_visit_data['weekofyear'] =air_visit_data.visit_date.dt.weekofyear
sample_submission['visit_date_month'] =sample_submission.visit_date.dt.month
sample_submission['visit_date_dayofw'] =sample_submission.visit_date.dt.dayofweek
sample_submission['visit_date_year'] =sample_submission.visit_date.dt.year
sample_submission['visit_date_dayofm'] =sample_submission.visit_date.dt.day
sample_submission['weekofyear'] =sample_submission.visit_date.dt.weekofyear
air_visit_data.loc[air_visit_data.weekofyear==53,'weekofyear'] =0 
sample_submission.loc[sample_submission.weekofyear==53,'weekofyear'] =0 


In [133]:
total_air_ids = list(air_store_info.air_store_id.unique())

df_total = pd.concat((air_visit_data,sample_submission))
df_total = df_total[['weekofyear','visit_date_year','air_store_id']]


In [134]:
df_total = df_total.sort_values(['visit_date_year','weekofyear'])
weekofyear = list(df_total['weekofyear'].unique())
year = list(df_total['visit_date_year'].unique())

In [135]:
week_open_restro = []
for i in year:
    for j in weekofyear:
        #print i,j,len(list(df_total.loc[(df_total.visit_date_year ==i) & (df_total.weekofyear ==j),'air_store_id'].unique()))
        l= len(list(df_total.loc[(df_total.visit_date_year ==i) & (df_total.weekofyear ==j),'air_store_id'].unique()))
        if (i==2017) & (j>22):
            break
        week_open_restro.append([i,j,l])
        

In [136]:
df_open_restro = pd.DataFrame(week_open_restro)
df_open_restro.columns = ['visit_date_year','weekofyear','no_open_restro']

In [137]:
def func1(x):
    try: 
        if pd.isnull(x):
            return 0
        else:
            return len(x)
                    
    except:
        return len(x)
def func(x):
    try: 
        if pd.isnull(x):
            return -1
        else:
            return sum(x)                
    except:
        return sum(x)

air_visit_data['total_reserve']= air_visit_data['reserve_visitorslist'].apply(func)
air_visit_data['numb_total_reserve'] = air_visit_data['reserve_visitorslist'].apply(func1)

sample_submission['total_reserve']= sample_submission['reserve_visitorslist'].apply(func)
sample_submission['numb_total_reserve'] = sample_submission['reserve_visitorslist'].apply(func1)


In [138]:
k = [i for i in air_visit_data.columns if i in sample_submission.columns]
train = air_visit_data.copy()
test = sample_submission.copy()

In [139]:
train.columns

Index([u'air_store_id', u'visit_date', u'visitors', u'air_genre_name',
       u'air_area_name', u'latitude', u'longitude', u'holiday_eve',
       u'non_working', u'genre_in_area', u'total_r_in_area',
       u'reserve_visitors', u'reserve_-12_h', u'reserve_12_37_h',
       u'reserve_37_59_h', u'reserve_59_85_h', u'reserve_85+_h',
       u'visitors_mean', u'visitors_median', u'visitors_max', u'visitors_min',
       u'reserve_datetime', u'reserve_visitorslist', u'visit_date_month',
       u'visit_date_dayofw', u'visit_date_year', u'visit_date_dayofm',
       u'weekofyear', u'total_reserve', u'numb_total_reserve'],
      dtype='object')

In [140]:
"""
l =  pd.to_datetime('2017-02-1',format='%Y-%m-%d')
train2 = train.loc[(train.visit_date>=l)]"""

"\nl =  pd.to_datetime('2017-02-1',format='%Y-%m-%d')\ntrain2 = train.loc[(train.visit_date>=l)]"

In [141]:
train.visit_date.min(),train.visit_date.max()

(Timestamp('2016-01-01 00:00:00'), Timestamp('2017-04-22 00:00:00'))

In [142]:
train1 = train.loc[(train.visit_date_year>=2017)].copy()
k1 = train1[['visitors','air_store_id']].groupby('air_store_id').agg('mean').reset_index()
k1.columns = ['air_store_id','mean_visitors']
k2 = train1[['visitors','air_store_id']].groupby('air_store_id').agg('median').reset_index()
k2.columns = ['air_store_id','median_visitors']

In [143]:
k4 = train[['visitors','visit_date_month']].groupby(['visit_date_month']).agg('mean').reset_index()
k4.columns = ['visit_date_month','mean_visitors2']

k5 = train1[['visitors','air_store_id','visit_date_dayofw']].groupby(['air_store_id','visit_date_dayofw']).agg('mean').reset_index()
k5.columns = ['air_store_id','visit_date_dayofw','xxx']

k6 = train1[['visitors','visit_date_dayofw']].groupby(['visit_date_dayofw']).agg('mean').reset_index()
k6.columns = ['visit_date_dayofw','mean_visitors4']


k7 = train[['visitors','visit_date_month']].groupby(['visit_date_month']).agg('median').reset_index()
k7.columns = ['visit_date_month','median_visitors2']

k8 = train1[['visitors','air_store_id','visit_date_dayofw']].groupby(['air_store_id','visit_date_dayofw']).agg('median').reset_index()
k8.columns = ['air_store_id','visit_date_dayofw','yyy']

k9 = train1[['visitors','visit_date_dayofw']].groupby(['visit_date_dayofw']).agg('median').reset_index()
k9.columns = ['visit_date_dayofw','median_visitors4']


"""
total = pd.concat((train[['air_store_id','air_genre_name','air_area_name']],test[['air_store_id','air_genre_name','air_area_name']]))


k10 = total.groupby(['air_genre_name']).agg('count')['air_store_id'].reset_index()
k10.columns = ['air_genre_name','count1']
k11 = total.groupby(['air_area_name']).agg('count')['air_store_id'].reset_index()
k11.columns = ['air_area_name','count2']

k12 = total.groupby(['air_store_id']).agg('count')['air_area_name'].reset_index()
k12.columns = ['air_store_id','count2']

"""

k10 = train[['visitors','air_store_id']].groupby('air_store_id').agg('mean').reset_index()
k10.columns = ['air_store_id','mean_visitors_f']
k11 = train[['visitors','air_store_id','visit_date_dayofw']].groupby(['air_store_id','visit_date_dayofw']).agg('mean').reset_index()
k11.columns = ['air_store_id','visit_date_dayofw','mean_visitors3_f']
k12 = train[['visitors','visit_date_dayofw']].groupby(['visit_date_dayofw']).agg('mean').reset_index()
k12.columns = ['visit_date_dayofw','mean_visitors4_f']






k13 = train[['visitors','weekofyear']].groupby(['weekofyear']).agg('mean').reset_index()
k13.columns = ['weekofyear','mean_visitors2_weekofyear']
k14 = train[['visitors','weekofyear']].groupby(['weekofyear']).agg('median').reset_index()
k14.columns = ['weekofyear','median_visitors2_weekofyear']



k15 = train[['visitors','air_store_id','visit_date_month']].groupby(['air_store_id','visit_date_month']).agg('mean').reset_index()


#k12 = train[['visitors','air_genre_name','air_area_name']].groupby(['air_genre_name','air_area_name']).agg('mean').reset_index()
#k12.columns = ['air_genre_name','air_area_name','mean_air_area_name_visitor']






In [144]:
"""
k15.columns = ['air_store_id','visit_date_month','xxx']
train = train.merge(k15,on=['air_store_id','visit_date_month'],how='left')
test = test.merge(k15,on= ['air_store_id','visit_date_month'],how='left')
k1.columns = ['air_store_id', 'yyy']
test= test.merge(k1,on = 'air_store_id',how = 'left')
test.loc[pd.isnull(test.xxx),'xxx'] = test.loc[pd.isnull(test.xxx),'yyy'].copy()
test = test.drop('yyy',axis=1)
"""

"\nk15.columns = ['air_store_id','visit_date_month','xxx']\ntrain = train.merge(k15,on=['air_store_id','visit_date_month'],how='left')\ntest = test.merge(k15,on= ['air_store_id','visit_date_month'],how='left')\nk1.columns = ['air_store_id', 'yyy']\ntest= test.merge(k1,on = 'air_store_id',how = 'left')\ntest.loc[pd.isnull(test.xxx),'xxx'] = test.loc[pd.isnull(test.xxx),'yyy'].copy()\ntest = test.drop('yyy',axis=1)\n"

In [145]:
train = air_visit_data.copy()
test = sample_submission.copy()
train = train[k]
test = test[k]



train = train.merge(k1,on='air_store_id',how='left')
test = test.merge(k1,on='air_store_id',how='left')
train = train.merge(k2,on='air_store_id',how='left')
test = test.merge(k2,on='air_store_id',how='left')
#train = train.merge(k3,on=['air_store_id','visit_date_month'],how='left')
#test = test.merge(k3,on= ['air_store_id','visit_date_month'],how='left')

train = train.merge(k4,on=['visit_date_month'],how='left')
test = test.merge(k4,on= ['visit_date_month'],how='left')
#train = train.merge(k5,on=['air_store_id','visit_date_dayofw'],how='left')
#test = test.merge(k5,on= ['air_store_id','visit_date_dayofw'],how='left')
train = train.merge(k6,on=['visit_date_dayofw'],how='left')
test = test.merge(k6,on= ['visit_date_dayofw'],how='left')


train = train.merge(k7,on=['visit_date_month'],how='left')
test = test.merge(k7,on= ['visit_date_month'],how='left')
#train = train.merge(k8,on=['air_store_id','visit_date_dayofw'],how='left')
#test = test.merge(k8,on= ['air_store_id','visit_date_dayofw'],how='left')
train = train.merge(k9,on=['visit_date_dayofw'],how='left')
test = test.merge(k9,on= ['visit_date_dayofw'],how='left')

train = train.merge(k10,on=['air_store_id'],how='left')
test = test.merge(k10,on= ['air_store_id'],how='left')
train = train.merge(k11,on=['air_store_id','visit_date_dayofw'],how='left')
test = test.merge(k11,on= ['air_store_id','visit_date_dayofw'],how='left')
train = train.merge(k12,on=['visit_date_dayofw'],how='left')
test = test.merge(k12,on= ['visit_date_dayofw'],how='left')




train = train.merge(df_open_restro,on=['visit_date_year','weekofyear'],how='left')
test = test.merge(df_open_restro,on= ['visit_date_year','weekofyear'],how='left')



#train = train.merge(k13,on=['weekofyear'],how='left')
#test = test.merge(k13,on= ['weekofyear'],how='left')
#train = train.merge(k14,on=['weekofyear'],how='left')
#test = test.merge(k14,on= ['weekofyear'],how='left')



"""
n1 = air_store_info.groupby(['air_genre_name','air_area_name']).agg('count')['air_store_id'].reset_index()
n1.columns = ['air_genre_name','air_area_name','air_genre_name_c']


n2 = n1.groupby('air_genre_name').agg('count')['air_area_name'].reset_index()
n2.columns = ['air_genre_name','air_area_name_unique']

n3 = n1.groupby('air_area_name').agg('count')['air_genre_name'].reset_index()
n3.columns = ['air_area_name','air_genre_name_unique']




train = train.merge(k14,on=['air_store_id','visit_date_dayofw'],how='left')
test = test.merge(k14,on= ['air_store_id','visit_date_dayofw'],how='left')
train = train.merge(k15,on=['visit_date_dayofw'],how='left')
test = test.merge(k15,on= ['visit_date_dayofw'],how='left')
"""
y = train.visitors.values
ids = sample_submission['id'].values



In [146]:
#test.loc[pd.isnull(test.xxx),['yyy','xxx']]

In [147]:

date_info = pd.read_csv('date_info.csv')
date_info['calendar_date']  = pd.to_datetime(date_info['calendar_date'],format= '%Y-%m-%d')
date_info.rename(columns = {'calendar_date':'visit_date'},inplace = True)

print('holidays at weekends are not special, right?')
wkend_holidays = date_info.apply((lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
date_info['weight'] = ((date_info.index + 1.0) / len(date_info)) ** 5.0


train = train.merge(date_info,on='visit_date',how='left')
test = test.merge(date_info,on='visit_date',how='left')


holidays at weekends are not special, right?


In [148]:
"""
k1 = train.loc[train.holiday_flg == 1].groupby(['air_store_id','visit_date_dayofw'])['visitors'].agg('mean').reset_index()
k2 = train.loc[train.holiday_flg != 1].groupby(['air_store_id','visit_date_dayofw'])['visitors'].agg('mean').reset_index()
k1.columns = ['air_store_id','visit_date_dayofw','holiday_flg_1']
k2.columns = ['air_store_id','visit_date_dayofw','holiday_flg_1']

k = k1.merge(k2,on=['air_store_id','visit_date_dayofw'],how='left')
train = train.merge(k,how='left',on=['air_store_id','visit_date_dayofw'])
test = test.merge(k,how='left',on=['air_store_id','visit_date_dayofw'])
k1 = train.loc[train.holiday_flg == 1].groupby(['air_area_name','visit_date_dayofw'])['visitors'].agg('mean').reset_index()
k2 = train.loc[train.holiday_flg != 1].groupby(['air_area_name','visit_date_dayofw'])['visitors'].agg('mean').reset_index()
k1.columns = ['air_area_name','visit_date_dayofw','holiday_flg_1']
k2.columns = ['air_area_name','visit_date_dayofw','holiday_flg_1']

k = k1.merge(k2,on=['air_area_name','visit_date_dayofw'],how='left')
train = train.merge(k,how='left',on=['air_area_name','visit_date_dayofw'])
test = test.merge(k,how='left',on=['air_area_name','visit_date_dayofw'])
"""


"\nk1 = train.loc[train.holiday_flg == 1].groupby(['air_store_id','visit_date_dayofw'])['visitors'].agg('mean').reset_index()\nk2 = train.loc[train.holiday_flg != 1].groupby(['air_store_id','visit_date_dayofw'])['visitors'].agg('mean').reset_index()\nk1.columns = ['air_store_id','visit_date_dayofw','holiday_flg_1']\nk2.columns = ['air_store_id','visit_date_dayofw','holiday_flg_1']\n\nk = k1.merge(k2,on=['air_store_id','visit_date_dayofw'],how='left')\ntrain = train.merge(k,how='left',on=['air_store_id','visit_date_dayofw'])\ntest = test.merge(k,how='left',on=['air_store_id','visit_date_dayofw'])\nk1 = train.loc[train.holiday_flg == 1].groupby(['air_area_name','visit_date_dayofw'])['visitors'].agg('mean').reset_index()\nk2 = train.loc[train.holiday_flg != 1].groupby(['air_area_name','visit_date_dayofw'])['visitors'].agg('mean').reset_index()\nk1.columns = ['air_area_name','visit_date_dayofw','holiday_flg_1']\nk2.columns = ['air_area_name','visit_date_dayofw','holiday_flg_1']\n\nk = k1.m

In [149]:
relation = pd.read_csv('store_id_relation.csv')
relation['both'] = 1

In [150]:
"""
o = hpg_reserve[['hpg_store_id','visit_date']]
o = o.merge(relation,on='hpg_store_id',how='left')
o  = o.loc[pd.notnull(o.air_store_id)]
o = o[['air_store_id','visit_date']]
p = air_reserve[['air_store_id','visit_date']]
q = pd.concat((o,p),axis=0)
d =q.groupby(['air_store_id', u'visit_date']).agg('count').reset_index()[['air_store_id','visit_date']]
d = d.merge(air_store_info,how='left',on='air_store_id')
d = d[['air_store_id','visit_date','air_area_name']]
l = d.groupby('visit_date').agg('count')['air_store_id'].reset_index()
l.columns = ['visit_date','restoopend']
m = air_store_info.groupby('air_area_name').agg('count')['air_store_id'].reset_index()
m.columns = ['air_area_name','restro_in_that_area']
air_visit_data = air_visit_data.merge(l, on = 'visit_date', how= 'left')
air_visit_data = air_visit_data.merge(m, on = 'air_area_name', how= 'left')
sample_submission = sample_submission.merge(l, on = 'visit_date', how= 'left')
sample_submission = sample_submission.merge(m, on = 'air_area_name', how= 'left')
n =d.groupby(['visit_date','air_area_name']).agg('count')['air_store_id'].reset_index()
n.columns = ['visit_date','air_area_name','date_are_restro_opened']
air_visit_data = air_visit_data.merge(n, on = ['visit_date','air_area_name'], how= 'left')
sample_submission = sample_submission.merge(n, on = ['visit_date','air_area_name'], how= 'left')
"""

"\no = hpg_reserve[['hpg_store_id','visit_date']]\no = o.merge(relation,on='hpg_store_id',how='left')\no  = o.loc[pd.notnull(o.air_store_id)]\no = o[['air_store_id','visit_date']]\np = air_reserve[['air_store_id','visit_date']]\nq = pd.concat((o,p),axis=0)\nd =q.groupby(['air_store_id', u'visit_date']).agg('count').reset_index()[['air_store_id','visit_date']]\nd = d.merge(air_store_info,how='left',on='air_store_id')\nd = d[['air_store_id','visit_date','air_area_name']]\nl = d.groupby('visit_date').agg('count')['air_store_id'].reset_index()\nl.columns = ['visit_date','restoopend']\nm = air_store_info.groupby('air_area_name').agg('count')['air_store_id'].reset_index()\nm.columns = ['air_area_name','restro_in_that_area']\nair_visit_data = air_visit_data.merge(l, on = 'visit_date', how= 'left')\nair_visit_data = air_visit_data.merge(m, on = 'air_area_name', how= 'left')\nsample_submission = sample_submission.merge(l, on = 'visit_date', how= 'left')\nsample_submission = sample_submission.me

In [151]:
sample_submission

Unnamed: 0,id,visitors,air_store_id,visit_date,air_genre_name,air_area_name,latitude,longitude,holiday_eve,non_working,...,visitors_min,reserve_datetime,reserve_visitorslist,visit_date_month,visit_date_dayofw,visit_date_year,visit_date_dayofm,weekofyear,total_reserve,numb_total_reserve
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,1,...,1.098612,,,4,6,2017,23,16,-1,0
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,0,...,0.693147,,,4,0,2017,24,17,-1,0
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,0,...,0.693147,,,4,1,2017,25,17,-1,0
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,0,...,0.693147,,,4,2,2017,26,17,-1,0
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,0,...,0.693147,,,4,3,2017,27,17,-1,0
5,air_00a91d42b08b08d9_2017-04-28,0,air_00a91d42b08b08d9,2017-04-28,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,0,...,0.693147,,,4,4,2017,28,17,-1,0
6,air_00a91d42b08b08d9_2017-04-29,0,air_00a91d42b08b08d9,2017-04-29,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,1,...,1.098612,,,4,5,2017,29,17,-1,0
7,air_00a91d42b08b08d9_2017-04-30,0,air_00a91d42b08b08d9,2017-04-30,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,1,...,1.098612,,,4,6,2017,30,17,-1,0
8,air_00a91d42b08b08d9_2017-05-01,0,air_00a91d42b08b08d9,2017-05-01,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,0,...,0.693147,,,5,0,2017,1,18,-1,0
9,air_00a91d42b08b08d9_2017-05-02,0,air_00a91d42b08b08d9,2017-05-02,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,0,...,0.693147,,,5,1,2017,2,18,-1,0


In [152]:

train = train.merge(relation,how='left',on='air_store_id')
test = test.merge(relation,how='left',on='air_store_id')



train = train.merge(hpg_store_info,how='left',on='hpg_store_id')
test = test.merge(hpg_store_info,how='left',on='hpg_store_id')

train = train.merge(weather,on=['air_store_id','visit_date'],how='left')
test = test.merge(weather,on= ['air_store_id','visit_date'],how='left')


"""
train = train.merge(mean_day_wise,on=['air_store_id'],how='left')
test = test.merge(mean_day_wise,on= ['air_store_id'],how='left')



train = train.merge(n2,on=['air_genre_name'],how='left')
test = test.merge(n2,on= ['air_genre_name'],how='left')
train = train.merge(n3,on=['air_area_name'],how='left')
test = test.merge(n3,on= ['air_area_name'],how='left')
"""


"\ntrain = train.merge(mean_day_wise,on=['air_store_id'],how='left')\ntest = test.merge(mean_day_wise,on= ['air_store_id'],how='left')\n\n\n\ntrain = train.merge(n2,on=['air_genre_name'],how='left')\ntest = test.merge(n2,on= ['air_genre_name'],how='left')\ntrain = train.merge(n3,on=['air_area_name'],how='left')\ntest = test.merge(n3,on= ['air_area_name'],how='left')\n"

In [153]:
ind1 = train.loc[train.visit_date <= pd.to_datetime('2017-03-30',format= '%Y-%m-%d')].index
ind2 = train.loc[train.visit_date > pd.to_datetime('2017-03-30',format= '%Y-%m-%d')].index


In [154]:
"""
k14 = train.loc[ind1,['visitors','visit_date_dayofw','air_area_name','visit_date_year']].groupby(['visit_date_dayofw','air_area_name','visit_date_year']).agg('mean').reset_index()
k14.columns = ['visit_date_dayofw','air_area_name','visit_date_year','mean_visitors7']
train = train.merge(k14,on=['visit_date_dayofw','air_area_name','visit_date_year'],how='left')
test = test.merge(k14,on= ['visit_date_dayofw','air_area_name','visit_date_year'],how='left')
"""

"\nk14 = train.loc[ind1,['visitors','visit_date_dayofw','air_area_name','visit_date_year']].groupby(['visit_date_dayofw','air_area_name','visit_date_year']).agg('mean').reset_index()\nk14.columns = ['visit_date_dayofw','air_area_name','visit_date_year','mean_visitors7']\ntrain = train.merge(k14,on=['visit_date_dayofw','air_area_name','visit_date_year'],how='left')\ntest = test.merge(k14,on= ['visit_date_dayofw','air_area_name','visit_date_year'],how='left')\n"

In [155]:
train_tf = list(train.apply(lambda x:'%s %s' % (x['air_area_name'],x['air_genre_name']),axis=1))
test_tf = list(test.apply(lambda x:'%s %s' % (x['air_area_name'],x['air_genre_name']),axis=1))
train_tf_vec =  tfv.transform(train_tf) 
test_tf_vec = tfv.transform(test_tf)

In [156]:
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
svd.fit(train_tf_vec)
train_tf_vec = svd.transform(train_tf_vec)
test_tf_vec = svd.transform(test_tf_vec)



In [157]:
"""
auto_data = pd.read_csv('auto_data.csv')
auto_data_area = pd.read_csv('auto_data_area_wise.csv')
train = train.merge(auto_data,how='left',on = 'air_store_id')
test = test.merge(auto_data,how='left',on = 'air_store_id')
train = train.merge(auto_data_area,how='left',on = 'air_area_name')
test = test.merge(auto_data_area,how='left',on = 'air_area_name')
"""

"\nauto_data = pd.read_csv('auto_data.csv')\nauto_data_area = pd.read_csv('auto_data_area_wise.csv')\ntrain = train.merge(auto_data,how='left',on = 'air_store_id')\ntest = test.merge(auto_data,how='left',on = 'air_store_id')\ntrain = train.merge(auto_data_area,how='left',on = 'air_area_name')\ntest = test.merge(auto_data_area,how='left',on = 'air_area_name')\n"

In [158]:
train_tf_vec = pd.DataFrame(train_tf_vec)
test_tf_vec = pd.DataFrame(test_tf_vec)

In [159]:
air_month_mean = pd.read_csv('air_store_month_wise_mean.csv')
air_week_mean = pd.read_csv('air_store_day_of_week_wise_mean.csv')

train = train.merge(air_month_mean, how='left',on='air_store_id')
test = test.merge(air_month_mean, how='left',on='air_store_id')
train = train.merge(air_week_mean, how='left',on='air_store_id')
test = test.merge(air_week_mean, how='left',on='air_store_id')

In [160]:
train = train.drop(['hpg_area_name','hpg_genre_name','reserve_visitorslist','reserve_datetime','visitors','visit_date'],axis =1)
test = test.drop(['hpg_area_name','hpg_genre_name','reserve_visitorslist','reserve_datetime','visitors','visit_date'],axis =1)
from sklearn import ensemble, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
train.replace(np.nan,-1,inplace=True)
test.replace(np.nan,-1,inplace=True)
text_columns = []
for f in train.columns:
    if (train[f].dtype == 'object'):  
        print f
        text_columns.append(f)            
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))
    
        
        

air_store_id
air_genre_name
air_area_name
day_of_week
hpg_store_id


In [161]:
"""
df = pd.concat((train[['air_store_id','hpg_store_id']],test[['air_store_id','hpg_store_id']]))
print df.shape,train.shape,test.shape
one_hot = pd.get_dummies(df['air_store_id'], drop_first=False)
k = (one_hot - one_hot.mean()) / one_hot.std()
k_train.shape,k_test.shape
k_train = k[:train.shape[0]]
k_test = k[train.shape[0]:]
k = [str(i)+'_col' for i in k_train.columns ]
k_train.columns = k
k_test.columns = k
train = pd.concat((train,k_train),axis=1)
test = pd.concat((test,k_test),axis=1)
test.columns
[i for i in train.columns if i not in test.columns]
"""

"\ndf = pd.concat((train[['air_store_id','hpg_store_id']],test[['air_store_id','hpg_store_id']]))\nprint df.shape,train.shape,test.shape\none_hot = pd.get_dummies(df['air_store_id'], drop_first=False)\nk = (one_hot - one_hot.mean()) / one_hot.std()\nk_train.shape,k_test.shape\nk_train = k[:train.shape[0]]\nk_test = k[train.shape[0]:]\nk = [str(i)+'_col' for i in k_train.columns ]\nk_train.columns = k\nk_test.columns = k\ntrain = pd.concat((train,k_train),axis=1)\ntest = pd.concat((test,k_test),axis=1)\ntest.columns\n[i for i in train.columns if i not in test.columns]\n"

In [162]:
train = pd.concat((train,train_tf_vec),axis=1)
test = pd.concat((test,test_tf_vec),axis=1)
train.replace(np.nan,-1,inplace=True)
test.replace(np.nan,-1,inplace=True)


In [163]:
"""
test['id'] = ids
train['visitors'] = y
train.to_csv('final_train.csv',index = False)
test.to_csv('final_test.csv',index = False)
"""

"\ntest['id'] = ids\ntrain['visitors'] = y\ntrain.to_csv('final_train.csv',index = False)\ntest.to_csv('final_test.csv',index = False)\n"

In [164]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0,depth =8):
        params = {}
        params["objective"] = "reg:linear"
        params['eval_metric'] = 'rmse'
        params["eta"] = 0.01 #0.00334
        params["min_child_weight"] = 1
        params["subsample"] = 0.7
        params["colsample_bytree"] = 0.3
        params["silent"] = 1
        params["max_depth"] = depth
        params["seed"] = seed_val
        #params["max_delta_step"] = 2
        #params["gamma"] = 0.5
        num_rounds = 5000 #2500

        plst = list(params.items())
        xgtrain = xgb.DMatrix(train_X, label=train_y)

        if test_y is not None:
                xgtest = xgb.DMatrix(test_X, label=test_y)
                watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
                model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds= 300)
        else:
                xgtest = xgb.DMatrix(test_X)
                #xgtest1 = xgb.DMatrix(test_X1)
                model = xgb.train(plst, xgtrain, 3200)

        if feature_names:
                        create_feature_map(feature_names)
                        model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
                        importance = model.get_fscore(fmap='xgb.fmap')
                        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
                        imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
                        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
                        imp_df.to_csv("imp_feat.txt", index=False)

        pred_test_y = model.predict(xgtest)
        if test_y is not None:
                loss = rmsle(np.expm1(test_y), np.expm1(pred_test_y))
        	return loss
	else:
		return pred_test_y

In [165]:
from sklearn.model_selection import train_test_split
X_train = train.iloc[ind1]
X_test = train.iloc[ind2]
y_train = y[ind1]
y_test  = y[ind2]
k = runXGB(X_train, np.log1p(y_train), X_test, np.log1p(y_test))
#k = runXGB(X_train, np.log1p(y_train), X_test, np.log1p(y_test))#0.454932


[0]	train-rmse:2.41612	test-rmse:2.43399
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 300 rounds.
[1]	train-rmse:2.3933	test-rmse:2.41092
[2]	train-rmse:2.3706	test-rmse:2.38792
[3]	train-rmse:2.34827	test-rmse:2.36526
[4]	train-rmse:2.32594	test-rmse:2.34272
[5]	train-rmse:2.30393	test-rmse:2.32046
[6]	train-rmse:2.28204	test-rmse:2.2982
[7]	train-rmse:2.26045	test-rmse:2.27634
[8]	train-rmse:2.23913	test-rmse:2.25475
[9]	train-rmse:2.21812	test-rmse:2.23346
[10]	train-rmse:2.19722	test-rmse:2.21227
[11]	train-rmse:2.17666	test-rmse:2.19141
[12]	train-rmse:2.1562	test-rmse:2.17064
[13]	train-rmse:2.13605	test-rmse:2.15021
[14]	train-rmse:2.11594	test-rmse:2.12945
[15]	train-rmse:2.09618	test-rmse:2.1095
[16]	train-rmse:2.07662	test-rmse:2.0899
[17]	train-rmse:2.05721	test-rmse:2.07024
[18]	train-rmse:2.03803	test-rmse:2.05088
[19]	train-rmse:2.01905	test-rmse:2.03168
[20]	train-rmse:2.00035	test-rms

[188]	train-rmse:0.617215	test-rmse:0.614793
[189]	train-rmse:0.615114	test-rmse:0.612783
[190]	train-rmse:0.613012	test-rmse:0.610872
[191]	train-rmse:0.610936	test-rmse:0.608953
[192]	train-rmse:0.608898	test-rmse:0.607001
[193]	train-rmse:0.606806	test-rmse:0.604915
[194]	train-rmse:0.604869	test-rmse:0.603075
[195]	train-rmse:0.602895	test-rmse:0.601098
[196]	train-rmse:0.600996	test-rmse:0.599296
[197]	train-rmse:0.599122	test-rmse:0.597515
[198]	train-rmse:0.597176	test-rmse:0.595606
[199]	train-rmse:0.595347	test-rmse:0.593739
[200]	train-rmse:0.593459	test-rmse:0.591878
[201]	train-rmse:0.591689	test-rmse:0.590105
[202]	train-rmse:0.589872	test-rmse:0.588357
[203]	train-rmse:0.588067	test-rmse:0.586645
[204]	train-rmse:0.586378	test-rmse:0.585012
[205]	train-rmse:0.584641	test-rmse:0.583409
[206]	train-rmse:0.582927	test-rmse:0.581773
[207]	train-rmse:0.581314	test-rmse:0.5802
[208]	train-rmse:0.579649	test-rmse:0.578613
[209]	train-rmse:0.578098	test-rmse:0.577155
[210]	train-

[372]	train-rmse:0.487306	test-rmse:0.499492
[373]	train-rmse:0.487168	test-rmse:0.499428
[374]	train-rmse:0.487026	test-rmse:0.49936
[375]	train-rmse:0.486907	test-rmse:0.499305
[376]	train-rmse:0.486789	test-rmse:0.499261
[377]	train-rmse:0.486647	test-rmse:0.499189
[378]	train-rmse:0.486535	test-rmse:0.499148
[379]	train-rmse:0.486411	test-rmse:0.49909
[380]	train-rmse:0.486277	test-rmse:0.499036
[381]	train-rmse:0.486171	test-rmse:0.499006
[382]	train-rmse:0.486056	test-rmse:0.498983
[383]	train-rmse:0.4859	test-rmse:0.498883
[384]	train-rmse:0.485752	test-rmse:0.4988
[385]	train-rmse:0.485627	test-rmse:0.498769
[386]	train-rmse:0.48549	test-rmse:0.498655
[387]	train-rmse:0.485364	test-rmse:0.498555
[388]	train-rmse:0.485225	test-rmse:0.498484
[389]	train-rmse:0.485091	test-rmse:0.498441
[390]	train-rmse:0.484972	test-rmse:0.498356
[391]	train-rmse:0.484857	test-rmse:0.498308
[392]	train-rmse:0.484708	test-rmse:0.49823
[393]	train-rmse:0.484607	test-rmse:0.498183
[394]	train-rmse:0

[556]	train-rmse:0.472928	test-rmse:0.492916
[557]	train-rmse:0.472897	test-rmse:0.492907
[558]	train-rmse:0.472833	test-rmse:0.492871
[559]	train-rmse:0.472798	test-rmse:0.492857
[560]	train-rmse:0.472739	test-rmse:0.492833
[561]	train-rmse:0.472697	test-rmse:0.492834
[562]	train-rmse:0.472643	test-rmse:0.492818
[563]	train-rmse:0.472577	test-rmse:0.492776
[564]	train-rmse:0.472523	test-rmse:0.492768
[565]	train-rmse:0.472477	test-rmse:0.492759
[566]	train-rmse:0.472411	test-rmse:0.49271
[567]	train-rmse:0.472357	test-rmse:0.492682
[568]	train-rmse:0.472309	test-rmse:0.49266
[569]	train-rmse:0.472286	test-rmse:0.49265
[570]	train-rmse:0.472233	test-rmse:0.492634
[571]	train-rmse:0.472201	test-rmse:0.492633
[572]	train-rmse:0.472142	test-rmse:0.492592
[573]	train-rmse:0.472102	test-rmse:0.492586
[574]	train-rmse:0.472049	test-rmse:0.492551
[575]	train-rmse:0.471992	test-rmse:0.492525
[576]	train-rmse:0.471948	test-rmse:0.492523
[577]	train-rmse:0.471896	test-rmse:0.492516
[578]	train-r

[739]	train-rmse:0.465145	test-rmse:0.48936
[740]	train-rmse:0.465091	test-rmse:0.489334
[741]	train-rmse:0.46505	test-rmse:0.489308
[742]	train-rmse:0.464999	test-rmse:0.489259
[743]	train-rmse:0.46496	test-rmse:0.48925
[744]	train-rmse:0.464916	test-rmse:0.489223
[745]	train-rmse:0.46488	test-rmse:0.489201
[746]	train-rmse:0.464869	test-rmse:0.489202
[747]	train-rmse:0.46483	test-rmse:0.489187
[748]	train-rmse:0.464797	test-rmse:0.489181
[749]	train-rmse:0.464758	test-rmse:0.489174
[750]	train-rmse:0.464722	test-rmse:0.489151
[751]	train-rmse:0.464686	test-rmse:0.489149
[752]	train-rmse:0.46466	test-rmse:0.489138
[753]	train-rmse:0.464624	test-rmse:0.489125
[754]	train-rmse:0.464585	test-rmse:0.489125
[755]	train-rmse:0.464567	test-rmse:0.489118
[756]	train-rmse:0.464514	test-rmse:0.489096
[757]	train-rmse:0.464487	test-rmse:0.489082
[758]	train-rmse:0.464447	test-rmse:0.489071
[759]	train-rmse:0.464403	test-rmse:0.489041
[760]	train-rmse:0.464382	test-rmse:0.48903
[761]	train-rmse:0

[923]	train-rmse:0.459092	test-rmse:0.486874
[924]	train-rmse:0.459063	test-rmse:0.486812
[925]	train-rmse:0.459035	test-rmse:0.486811
[926]	train-rmse:0.458987	test-rmse:0.486792
[927]	train-rmse:0.458958	test-rmse:0.486793
[928]	train-rmse:0.458936	test-rmse:0.486791
[929]	train-rmse:0.45889	test-rmse:0.486743
[930]	train-rmse:0.458872	test-rmse:0.486732
[931]	train-rmse:0.458851	test-rmse:0.486736
[932]	train-rmse:0.458832	test-rmse:0.486724
[933]	train-rmse:0.458795	test-rmse:0.486719
[934]	train-rmse:0.458761	test-rmse:0.486706
[935]	train-rmse:0.458747	test-rmse:0.48671
[936]	train-rmse:0.458713	test-rmse:0.486647
[937]	train-rmse:0.45869	test-rmse:0.486648
[938]	train-rmse:0.458663	test-rmse:0.486616
[939]	train-rmse:0.458636	test-rmse:0.486605
[940]	train-rmse:0.458598	test-rmse:0.486585
[941]	train-rmse:0.458556	test-rmse:0.486566
[942]	train-rmse:0.458521	test-rmse:0.486564
[943]	train-rmse:0.458503	test-rmse:0.486558
[944]	train-rmse:0.458473	test-rmse:0.486555
[945]	train-r

[1104]	train-rmse:0.454038	test-rmse:0.484694
[1105]	train-rmse:0.454014	test-rmse:0.484691
[1106]	train-rmse:0.45397	test-rmse:0.48468
[1107]	train-rmse:0.453951	test-rmse:0.484681
[1108]	train-rmse:0.453918	test-rmse:0.484659
[1109]	train-rmse:0.453897	test-rmse:0.48466
[1110]	train-rmse:0.453888	test-rmse:0.484655
[1111]	train-rmse:0.453862	test-rmse:0.484657
[1112]	train-rmse:0.453842	test-rmse:0.484627
[1113]	train-rmse:0.453817	test-rmse:0.484626
[1114]	train-rmse:0.453797	test-rmse:0.484626
[1115]	train-rmse:0.453766	test-rmse:0.484623
[1116]	train-rmse:0.453735	test-rmse:0.484619
[1117]	train-rmse:0.453704	test-rmse:0.484602
[1118]	train-rmse:0.453662	test-rmse:0.484595
[1119]	train-rmse:0.453637	test-rmse:0.484596
[1120]	train-rmse:0.453612	test-rmse:0.484595
[1121]	train-rmse:0.453588	test-rmse:0.48459
[1122]	train-rmse:0.453555	test-rmse:0.484588
[1123]	train-rmse:0.453533	test-rmse:0.484589
[1124]	train-rmse:0.453511	test-rmse:0.484587
[1125]	train-rmse:0.453492	test-rmse:0

[1284]	train-rmse:0.449752	test-rmse:0.483431
[1285]	train-rmse:0.449744	test-rmse:0.483427
[1286]	train-rmse:0.449727	test-rmse:0.483423
[1287]	train-rmse:0.449708	test-rmse:0.48341
[1288]	train-rmse:0.449691	test-rmse:0.48341
[1289]	train-rmse:0.449664	test-rmse:0.483408
[1290]	train-rmse:0.449636	test-rmse:0.483389
[1291]	train-rmse:0.449617	test-rmse:0.48338
[1292]	train-rmse:0.449606	test-rmse:0.483379
[1293]	train-rmse:0.449583	test-rmse:0.483378
[1294]	train-rmse:0.449562	test-rmse:0.483373
[1295]	train-rmse:0.449556	test-rmse:0.483375
[1296]	train-rmse:0.449547	test-rmse:0.483371
[1297]	train-rmse:0.449532	test-rmse:0.483371
[1298]	train-rmse:0.449508	test-rmse:0.483369
[1299]	train-rmse:0.449493	test-rmse:0.483361
[1300]	train-rmse:0.44947	test-rmse:0.483355
[1301]	train-rmse:0.449438	test-rmse:0.483336
[1302]	train-rmse:0.449407	test-rmse:0.483333
[1303]	train-rmse:0.449389	test-rmse:0.483334
[1304]	train-rmse:0.449344	test-rmse:0.483276
[1305]	train-rmse:0.449325	test-rmse:0

[1463]	train-rmse:0.445897	test-rmse:0.482424
[1464]	train-rmse:0.445873	test-rmse:0.482424
[1465]	train-rmse:0.445833	test-rmse:0.482427
[1466]	train-rmse:0.445819	test-rmse:0.482422
[1467]	train-rmse:0.445793	test-rmse:0.482414
[1468]	train-rmse:0.445777	test-rmse:0.48241
[1469]	train-rmse:0.445753	test-rmse:0.482405
[1470]	train-rmse:0.44574	test-rmse:0.482393
[1471]	train-rmse:0.445721	test-rmse:0.48239
[1472]	train-rmse:0.445704	test-rmse:0.482378
[1473]	train-rmse:0.445679	test-rmse:0.48231
[1474]	train-rmse:0.445662	test-rmse:0.48231
[1475]	train-rmse:0.445654	test-rmse:0.482315
[1476]	train-rmse:0.445643	test-rmse:0.482309
[1477]	train-rmse:0.445615	test-rmse:0.4823
[1478]	train-rmse:0.445598	test-rmse:0.482293
[1479]	train-rmse:0.445564	test-rmse:0.482286
[1480]	train-rmse:0.44554	test-rmse:0.482294
[1481]	train-rmse:0.445504	test-rmse:0.482291
[1482]	train-rmse:0.44549	test-rmse:0.482281
[1483]	train-rmse:0.445454	test-rmse:0.482273
[1484]	train-rmse:0.445435	test-rmse:0.4822

[1643]	train-rmse:0.442379	test-rmse:0.48145
[1644]	train-rmse:0.442359	test-rmse:0.481445
[1645]	train-rmse:0.442345	test-rmse:0.48145
[1646]	train-rmse:0.442314	test-rmse:0.481424
[1647]	train-rmse:0.442307	test-rmse:0.481421
[1648]	train-rmse:0.442296	test-rmse:0.481414
[1649]	train-rmse:0.442279	test-rmse:0.481402
[1650]	train-rmse:0.442263	test-rmse:0.481399
[1651]	train-rmse:0.44225	test-rmse:0.481396
[1652]	train-rmse:0.442226	test-rmse:0.481419
[1653]	train-rmse:0.442197	test-rmse:0.481396
[1654]	train-rmse:0.442176	test-rmse:0.481387
[1655]	train-rmse:0.442168	test-rmse:0.481385
[1656]	train-rmse:0.442149	test-rmse:0.481376
[1657]	train-rmse:0.442132	test-rmse:0.48138
[1658]	train-rmse:0.442113	test-rmse:0.481384
[1659]	train-rmse:0.442095	test-rmse:0.481392
[1660]	train-rmse:0.442075	test-rmse:0.481384
[1661]	train-rmse:0.44205	test-rmse:0.48139
[1662]	train-rmse:0.442022	test-rmse:0.481386
[1663]	train-rmse:0.442006	test-rmse:0.481384
[1664]	train-rmse:0.441985	test-rmse:0.4

[1823]	train-rmse:0.438982	test-rmse:0.480936
[1824]	train-rmse:0.438971	test-rmse:0.480928
[1825]	train-rmse:0.438947	test-rmse:0.480928
[1826]	train-rmse:0.438929	test-rmse:0.480916
[1827]	train-rmse:0.438919	test-rmse:0.480913
[1828]	train-rmse:0.438895	test-rmse:0.480909
[1829]	train-rmse:0.438873	test-rmse:0.480906
[1830]	train-rmse:0.43886	test-rmse:0.480908
[1831]	train-rmse:0.438839	test-rmse:0.480911
[1832]	train-rmse:0.43882	test-rmse:0.480881
[1833]	train-rmse:0.438806	test-rmse:0.480884
[1834]	train-rmse:0.438784	test-rmse:0.480887
[1835]	train-rmse:0.438767	test-rmse:0.480883
[1836]	train-rmse:0.438749	test-rmse:0.480884
[1837]	train-rmse:0.438734	test-rmse:0.480877
[1838]	train-rmse:0.438713	test-rmse:0.480883
[1839]	train-rmse:0.438695	test-rmse:0.480884
[1840]	train-rmse:0.438682	test-rmse:0.480888
[1841]	train-rmse:0.438655	test-rmse:0.480875
[1842]	train-rmse:0.438632	test-rmse:0.480876
[1843]	train-rmse:0.438616	test-rmse:0.480875
[1844]	train-rmse:0.438598	test-rmse

[2002]	train-rmse:0.435943	test-rmse:0.48035
[2003]	train-rmse:0.43593	test-rmse:0.480343
[2004]	train-rmse:0.43591	test-rmse:0.480342
[2005]	train-rmse:0.435895	test-rmse:0.480339
[2006]	train-rmse:0.435874	test-rmse:0.480337
[2007]	train-rmse:0.435835	test-rmse:0.480312
[2008]	train-rmse:0.435814	test-rmse:0.480307
[2009]	train-rmse:0.435799	test-rmse:0.480298
[2010]	train-rmse:0.43578	test-rmse:0.480293
[2011]	train-rmse:0.43576	test-rmse:0.480294
[2012]	train-rmse:0.435742	test-rmse:0.480292
[2013]	train-rmse:0.435726	test-rmse:0.48029
[2014]	train-rmse:0.435712	test-rmse:0.480289
[2015]	train-rmse:0.435689	test-rmse:0.480291
[2016]	train-rmse:0.43568	test-rmse:0.480292
[2017]	train-rmse:0.435663	test-rmse:0.480294
[2018]	train-rmse:0.435642	test-rmse:0.480283
[2019]	train-rmse:0.435631	test-rmse:0.480284
[2020]	train-rmse:0.43562	test-rmse:0.480285
[2021]	train-rmse:0.435612	test-rmse:0.480282
[2022]	train-rmse:0.435597	test-rmse:0.480283
[2023]	train-rmse:0.435586	test-rmse:0.480

[2181]	train-rmse:0.432902	test-rmse:0.47998
[2182]	train-rmse:0.432894	test-rmse:0.479979
[2183]	train-rmse:0.432864	test-rmse:0.479973
[2184]	train-rmse:0.432842	test-rmse:0.479973
[2185]	train-rmse:0.432822	test-rmse:0.479969
[2186]	train-rmse:0.432807	test-rmse:0.47997
[2187]	train-rmse:0.432788	test-rmse:0.479973
[2188]	train-rmse:0.432769	test-rmse:0.479975
[2189]	train-rmse:0.432756	test-rmse:0.479972
[2190]	train-rmse:0.432745	test-rmse:0.479971
[2191]	train-rmse:0.432732	test-rmse:0.479969
[2192]	train-rmse:0.432718	test-rmse:0.47997
[2193]	train-rmse:0.432704	test-rmse:0.479972
[2194]	train-rmse:0.432692	test-rmse:0.47997
[2195]	train-rmse:0.432675	test-rmse:0.479965
[2196]	train-rmse:0.43266	test-rmse:0.479972
[2197]	train-rmse:0.432639	test-rmse:0.47997
[2198]	train-rmse:0.432632	test-rmse:0.47997
[2199]	train-rmse:0.432623	test-rmse:0.479969
[2200]	train-rmse:0.43262	test-rmse:0.479962
[2201]	train-rmse:0.432606	test-rmse:0.479966
[2202]	train-rmse:0.432595	test-rmse:0.479

[2361]	train-rmse:0.430181	test-rmse:0.479649
[2362]	train-rmse:0.43017	test-rmse:0.479645
[2363]	train-rmse:0.430158	test-rmse:0.479646
[2364]	train-rmse:0.430142	test-rmse:0.479645
[2365]	train-rmse:0.430124	test-rmse:0.479643
[2366]	train-rmse:0.430116	test-rmse:0.479646
[2367]	train-rmse:0.4301	test-rmse:0.479641
[2368]	train-rmse:0.430083	test-rmse:0.479643
[2369]	train-rmse:0.430062	test-rmse:0.479631
[2370]	train-rmse:0.430044	test-rmse:0.479634
[2371]	train-rmse:0.430033	test-rmse:0.479639
[2372]	train-rmse:0.430023	test-rmse:0.479637
[2373]	train-rmse:0.430006	test-rmse:0.479628
[2374]	train-rmse:0.429988	test-rmse:0.47963
[2375]	train-rmse:0.429969	test-rmse:0.479631
[2376]	train-rmse:0.429955	test-rmse:0.479626
[2377]	train-rmse:0.429936	test-rmse:0.479626
[2378]	train-rmse:0.42992	test-rmse:0.479625
[2379]	train-rmse:0.429911	test-rmse:0.479616
[2380]	train-rmse:0.429888	test-rmse:0.47963
[2381]	train-rmse:0.429871	test-rmse:0.479632
[2382]	train-rmse:0.429865	test-rmse:0.4

[2541]	train-rmse:0.427433	test-rmse:0.479399
[2542]	train-rmse:0.427413	test-rmse:0.479379
[2543]	train-rmse:0.427399	test-rmse:0.479378
[2544]	train-rmse:0.427381	test-rmse:0.479371
[2545]	train-rmse:0.427372	test-rmse:0.479372
[2546]	train-rmse:0.427351	test-rmse:0.47937
[2547]	train-rmse:0.427339	test-rmse:0.479363
[2548]	train-rmse:0.42732	test-rmse:0.479344
[2549]	train-rmse:0.427313	test-rmse:0.479345
[2550]	train-rmse:0.4273	test-rmse:0.479352
[2551]	train-rmse:0.427285	test-rmse:0.479351
[2552]	train-rmse:0.427272	test-rmse:0.479355
[2553]	train-rmse:0.427256	test-rmse:0.479356
[2554]	train-rmse:0.427236	test-rmse:0.479349
[2555]	train-rmse:0.427224	test-rmse:0.479348
[2556]	train-rmse:0.427208	test-rmse:0.479342
[2557]	train-rmse:0.427191	test-rmse:0.47935
[2558]	train-rmse:0.427171	test-rmse:0.479354
[2559]	train-rmse:0.427142	test-rmse:0.479338
[2560]	train-rmse:0.427121	test-rmse:0.479339
[2561]	train-rmse:0.427109	test-rmse:0.479339
[2562]	train-rmse:0.427097	test-rmse:0.

[2721]	train-rmse:0.42484	test-rmse:0.479276
[2722]	train-rmse:0.424825	test-rmse:0.479272
[2723]	train-rmse:0.424811	test-rmse:0.479271
[2724]	train-rmse:0.424794	test-rmse:0.479268
[2725]	train-rmse:0.424781	test-rmse:0.479274
[2726]	train-rmse:0.424764	test-rmse:0.479264
[2727]	train-rmse:0.424749	test-rmse:0.479263
[2728]	train-rmse:0.424731	test-rmse:0.479263
[2729]	train-rmse:0.424717	test-rmse:0.479264
[2730]	train-rmse:0.424705	test-rmse:0.47925
[2731]	train-rmse:0.424688	test-rmse:0.47925
[2732]	train-rmse:0.424676	test-rmse:0.479247
[2733]	train-rmse:0.42467	test-rmse:0.479249
[2734]	train-rmse:0.424651	test-rmse:0.479247
[2735]	train-rmse:0.424646	test-rmse:0.479256
[2736]	train-rmse:0.424638	test-rmse:0.479262
[2737]	train-rmse:0.424623	test-rmse:0.479259
[2738]	train-rmse:0.424616	test-rmse:0.479257
[2739]	train-rmse:0.424608	test-rmse:0.479255
[2740]	train-rmse:0.424595	test-rmse:0.47925
[2741]	train-rmse:0.424579	test-rmse:0.47925
[2742]	train-rmse:0.424569	test-rmse:0.4

[2901]	train-rmse:0.422433	test-rmse:0.479101
[2902]	train-rmse:0.422413	test-rmse:0.479097
[2903]	train-rmse:0.422398	test-rmse:0.479089
[2904]	train-rmse:0.422381	test-rmse:0.479094
[2905]	train-rmse:0.422368	test-rmse:0.479103
[2906]	train-rmse:0.422351	test-rmse:0.479098
[2907]	train-rmse:0.422334	test-rmse:0.479096
[2908]	train-rmse:0.422323	test-rmse:0.479097
[2909]	train-rmse:0.422313	test-rmse:0.479098
[2910]	train-rmse:0.422297	test-rmse:0.479095
[2911]	train-rmse:0.422285	test-rmse:0.479094
[2912]	train-rmse:0.422266	test-rmse:0.479079
[2913]	train-rmse:0.422249	test-rmse:0.479089
[2914]	train-rmse:0.422238	test-rmse:0.479088
[2915]	train-rmse:0.422225	test-rmse:0.479093
[2916]	train-rmse:0.422206	test-rmse:0.479099
[2917]	train-rmse:0.422189	test-rmse:0.4791
[2918]	train-rmse:0.422179	test-rmse:0.479102
[2919]	train-rmse:0.422166	test-rmse:0.479099
[2920]	train-rmse:0.422153	test-rmse:0.479102
[2921]	train-rmse:0.422139	test-rmse:0.479093
[2922]	train-rmse:0.422134	test-rmse

[3080]	train-rmse:0.419881	test-rmse:0.479106
[3081]	train-rmse:0.419867	test-rmse:0.47909
[3082]	train-rmse:0.419855	test-rmse:0.479085
[3083]	train-rmse:0.419843	test-rmse:0.479084
[3084]	train-rmse:0.419829	test-rmse:0.479076
[3085]	train-rmse:0.419813	test-rmse:0.479087
[3086]	train-rmse:0.4198	test-rmse:0.479086
[3087]	train-rmse:0.419785	test-rmse:0.479084
[3088]	train-rmse:0.419767	test-rmse:0.479082
[3089]	train-rmse:0.419755	test-rmse:0.479053
[3090]	train-rmse:0.419739	test-rmse:0.479052
[3091]	train-rmse:0.41973	test-rmse:0.479052
[3092]	train-rmse:0.419713	test-rmse:0.479056
[3093]	train-rmse:0.419696	test-rmse:0.479056
[3094]	train-rmse:0.419687	test-rmse:0.479059
[3095]	train-rmse:0.41968	test-rmse:0.479059
[3096]	train-rmse:0.419667	test-rmse:0.479066
[3097]	train-rmse:0.419656	test-rmse:0.47907
[3098]	train-rmse:0.419644	test-rmse:0.479068
[3099]	train-rmse:0.41963	test-rmse:0.479068
[3100]	train-rmse:0.419611	test-rmse:0.479066
[3101]	train-rmse:0.419601	test-rmse:0.47

[3260]	train-rmse:0.417431	test-rmse:0.479109
[3261]	train-rmse:0.41742	test-rmse:0.47911
[3262]	train-rmse:0.417411	test-rmse:0.479107
[3263]	train-rmse:0.417391	test-rmse:0.479102
[3264]	train-rmse:0.417387	test-rmse:0.479104
[3265]	train-rmse:0.417365	test-rmse:0.4791
[3266]	train-rmse:0.417353	test-rmse:0.4791
[3267]	train-rmse:0.417331	test-rmse:0.479103
[3268]	train-rmse:0.417322	test-rmse:0.479102
[3269]	train-rmse:0.41731	test-rmse:0.479116
[3270]	train-rmse:0.417299	test-rmse:0.479117
[3271]	train-rmse:0.41729	test-rmse:0.479118
[3272]	train-rmse:0.417278	test-rmse:0.47912
[3273]	train-rmse:0.417264	test-rmse:0.479122
[3274]	train-rmse:0.417253	test-rmse:0.479115
[3275]	train-rmse:0.417241	test-rmse:0.479114
[3276]	train-rmse:0.417227	test-rmse:0.479116
[3277]	train-rmse:0.417209	test-rmse:0.479116
[3278]	train-rmse:0.417199	test-rmse:0.479112
[3279]	train-rmse:0.417188	test-rmse:0.479109
[3280]	train-rmse:0.417174	test-rmse:0.479091
[3281]	train-rmse:0.417162	test-rmse:0.4790

[3440]	train-rmse:0.415039	test-rmse:0.479047
[3441]	train-rmse:0.41503	test-rmse:0.479049
[3442]	train-rmse:0.415015	test-rmse:0.479051
[3443]	train-rmse:0.415002	test-rmse:0.47905
[3444]	train-rmse:0.414987	test-rmse:0.479049
[3445]	train-rmse:0.414974	test-rmse:0.479048
[3446]	train-rmse:0.414967	test-rmse:0.479042
[3447]	train-rmse:0.414957	test-rmse:0.479033
[3448]	train-rmse:0.414948	test-rmse:0.479032
[3449]	train-rmse:0.414934	test-rmse:0.479037
[3450]	train-rmse:0.414927	test-rmse:0.479036
[3451]	train-rmse:0.414914	test-rmse:0.479035
[3452]	train-rmse:0.414897	test-rmse:0.479037
[3453]	train-rmse:0.414884	test-rmse:0.479031
[3454]	train-rmse:0.414872	test-rmse:0.479033
[3455]	train-rmse:0.414862	test-rmse:0.479033
[3456]	train-rmse:0.414847	test-rmse:0.47903
[3457]	train-rmse:0.414837	test-rmse:0.479024
[3458]	train-rmse:0.414821	test-rmse:0.479029
[3459]	train-rmse:0.414806	test-rmse:0.47903
[3460]	train-rmse:0.414787	test-rmse:0.479037
[3461]	train-rmse:0.414772	test-rmse:0

[3620]	train-rmse:0.412679	test-rmse:0.478987
[3621]	train-rmse:0.412663	test-rmse:0.478982
[3622]	train-rmse:0.412653	test-rmse:0.478985
[3623]	train-rmse:0.412648	test-rmse:0.478986
[3624]	train-rmse:0.412638	test-rmse:0.478983
[3625]	train-rmse:0.412626	test-rmse:0.47898
[3626]	train-rmse:0.412613	test-rmse:0.478975
[3627]	train-rmse:0.4126	test-rmse:0.478976
[3628]	train-rmse:0.412585	test-rmse:0.478979
[3629]	train-rmse:0.412567	test-rmse:0.478962
[3630]	train-rmse:0.412555	test-rmse:0.478962
[3631]	train-rmse:0.412543	test-rmse:0.478962
[3632]	train-rmse:0.412533	test-rmse:0.478975
[3633]	train-rmse:0.41252	test-rmse:0.478977
[3634]	train-rmse:0.412511	test-rmse:0.478964
[3635]	train-rmse:0.412489	test-rmse:0.478967
[3636]	train-rmse:0.412478	test-rmse:0.478967
[3637]	train-rmse:0.412464	test-rmse:0.478969
[3638]	train-rmse:0.41245	test-rmse:0.478963
[3639]	train-rmse:0.412443	test-rmse:0.478958
[3640]	train-rmse:0.412435	test-rmse:0.478961
[3641]	train-rmse:0.41242	test-rmse:0.4

[3799]	train-rmse:0.410485	test-rmse:0.479069
[3800]	train-rmse:0.410475	test-rmse:0.479069
[3801]	train-rmse:0.410462	test-rmse:0.479065
[3802]	train-rmse:0.41045	test-rmse:0.479067
[3803]	train-rmse:0.410434	test-rmse:0.479072
[3804]	train-rmse:0.410424	test-rmse:0.479067
[3805]	train-rmse:0.410411	test-rmse:0.479065
[3806]	train-rmse:0.410403	test-rmse:0.479062
[3807]	train-rmse:0.410393	test-rmse:0.479064
[3808]	train-rmse:0.410386	test-rmse:0.479062
[3809]	train-rmse:0.410378	test-rmse:0.479061
[3810]	train-rmse:0.410365	test-rmse:0.479063
[3811]	train-rmse:0.410355	test-rmse:0.479062
[3812]	train-rmse:0.410344	test-rmse:0.479061
[3813]	train-rmse:0.410329	test-rmse:0.479062
[3814]	train-rmse:0.410321	test-rmse:0.479066
[3815]	train-rmse:0.410305	test-rmse:0.479064
[3816]	train-rmse:0.41029	test-rmse:0.479072
[3817]	train-rmse:0.410278	test-rmse:0.479071
[3818]	train-rmse:0.410264	test-rmse:0.47907
[3819]	train-rmse:0.410256	test-rmse:0.479068
[3820]	train-rmse:0.410239	test-rmse:

In [166]:
"""
Stopping. Best iteration:
[3213]	train-rmse:0.415767	test-rmse:0.468939
Stopping. Best iteration:
[3281]	train-rmse:0.410538	test-rmse:0.451937
[2872]	train-rmse:0.423973	test-rmse:0.451862
[3020]	train-rmse:0.418175	test-rmse:0.452131
Stopping. Best iteration:
[2866]	train-rmse:0.418617	test-rmse:0.452214
[3211]	train-rmse:0.434274	test-rmse:0.457787
[3302]	train-rmse:0.432564	test-rmse:0.457957
[241]train-rmse:0.441116	test-rmse:0.459099
[240]	train-rmse:0.441266	test-rmse:0.453867
[300]	train-rmse:0.444027	test-rmse:0.460116
[290]	train-rmse:0.46005	test-rmse:0.469023
train-rmse:0.443398 test-rmse:0.471787
[730,6]	train-rmse:0.451892	test-rmse:0.471121
"""

air_visit_data.total_reserve.unique()

array([  -1,   12,    2,    3,   25,    5,    6,    1,   15,   21,    9,
         20,   18,   19,   24,    8,    4,   13,   45,   35,   11,   30,
         23,   37,   34,    7,   38,   33,   22,   32,   57,   26,   10,
         27,   14,   16,   17,   74,   31,   39,   58,   29,   28,   44,
         40,   43,   62,   36,   48,   94,   98,   47,   88,   86,   50,
        107,  109,   75,   69,   41,   42,   60,   51,   49,   59,   79,
         65,   71,   53,   64,   80,   46,   66,   56,   67,   55,  664,
        641, 1633,  557,   61,   52,   54,   70,   78,   63,   73,   82,
         91,  148,  153,  155,  110,   68,   85,   87,   95,   81,   90,
        122,   72,   83,   77,   93,  466,  100,  104,  142,   92,   76,
         89,   84,  116,   97,   99,   96,  108])

In [167]:
y1 = np.log1p(y+1)
pred1 = runXGB(train,y1, test,depth = 8)

p= np.expm1(pred1)-1
p[p<0] = 0



In [168]:
pd.DataFrame({'id':ids,'visitors':p}).to_csv('last.csv',index =False)


In [169]:
train.columns.values

array(['air_store_id', 'air_genre_name', 'air_area_name', 'latitude_x',
       'longitude_x', 'holiday_eve', 'non_working', 'genre_in_area',
       'total_r_in_area', 'reserve_visitors', 'reserve_-12_h',
       'reserve_12_37_h', 'reserve_37_59_h', 'reserve_59_85_h',
       'reserve_85+_h', 'visitors_mean', 'visitors_median', 'visitors_max',
       'visitors_min', 'visit_date_month', 'visit_date_dayofw',
       'visit_date_year', 'visit_date_dayofm', 'weekofyear',
       'total_reserve', 'numb_total_reserve', 'mean_visitors',
       'median_visitors', 'mean_visitors2', 'mean_visitors4',
       'median_visitors2', 'median_visitors4', 'mean_visitors_f',
       'mean_visitors3_f', 'mean_visitors4_f', 'no_open_restro',
       'day_of_week', 'holiday_flg', 'weight', 'hpg_store_id', 'both',
       'latitude_y', 'longitude_y', 'avg_temperature', 'high_temperature',
       'low_temperature', 'precipitation', 'm0', 'm1', 'm2', 'm3', 'm6',
       'm7', 'm8', 'm9', 'm10', 'm11', 'w1', 'w2', 'w3',

In [170]:
p

array([  1.76433253,  21.7926712 ,  25.1543045 , ...,   3.44913435,
         3.64120197,   3.79860067], dtype=float32)