In [285]:
%matplotlib inline
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime
from matplotlib import pyplot as plt
import seaborn as sns
from xgboost import *
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read all the csv files
dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
    pd.read_csv(fn)for fn in glob.glob('input/*.csv')}
for k, v in dfs.items(): locals()[k] = v

In [3]:
# The holidays happened on weekends have no significant difference in visit
wkend_holidays = date_info.apply(lambda x: (x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1, axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
# Earlier record has little weight, because the reservation population are generally smaller than later
# We want to take a basis of the visit number
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  

In [4]:
# Merge visit_data and date_info on the visit date
visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
# Process by log(1+x) for visit number
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

In [5]:
# We get a visitors df to store the weighted visitor number for each group of store_id, day_of_week and holiday_flg
visitors = visit_data.groupby(['air_store_id', 'day_of_week', 'holiday_flg']).apply(lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )).reset_index()
# 829 unique store id
visitors.rename(columns={0:'visitors'}, inplace=True) 

In [6]:
# Get a basis visit number for similar record
# Parse the store_id and date from the column id
sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
# Drop the default column
sample_submission.drop('visitors', axis=1, inplace=True)
# To fill the other group columns, dow and holiday
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
# Merge them to get the basis
# 821 unique store id, all covered in visitors
sample_submission = sample_submission.merge(visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')

In [93]:
set(visitors.air_store_id.unique()) - set(sample_submission.air_store_id.unique())

{'air_0ead98dd07e7a82a',
 'air_229d7e508d9f1b5e',
 'air_2703dcb33192b181',
 'air_b2d8bc9c88b85f96',
 'air_cb083b4789a8d3a2',
 'air_cf22e368c1a71d53',
 'air_d0a7bd3339c3d12a',
 'air_d63cfa6d6ab78446'}

In [7]:
# If there are missing value(actually a lot), merge them with same weekday and store_id, only take none-holiday, otherwise
# there will be duplicates
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), how='left')['visitors_y'].values

In [8]:
# If still have null value, merge with the mean visit number of all records
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), on='air_store_id', how='left')['visitors_y'].values

In [9]:
# e^y - 1
test_visit_var = sample_submission.visitors.map(pd.np.expm1)

In [10]:
# Merge target visit data(label) with date_info
data = {
    'tra': pd.read_csv('input/air_visit_data.csv')
    }

data['tra'] = data['tra'].merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
data['tra'] = data['tra'].merge(visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')


In [11]:
# Actually, there would be no missing value here
missings = data['tra'].visitors_y.isnull()
data['tra'].loc[missings, 'visitors_y'] = data['tra'][missings].merge(visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), how='left')['visitors_y'].values

missings = data['tra'].visitors_y.isnull()
data['tra'].loc[missings, 'visitors_y'] = data['tra'][missings].merge(visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), on='air_store_id', how='left')['visitors_y'].values

train_visit_var = data['tra'].visitors_y.map(pd.np.expm1)

In [12]:
data = {
    'tra': pd.read_csv('input/air_visit_data.csv'),
    'as': pd.read_csv('input/air_store_info.csv'),
    'hs': pd.read_csv('input/hpg_store_info.csv'),
    'ar': pd.read_csv('input/air_reserve.csv'),
    'hr': pd.read_csv('input/hpg_reserve.csv'),
    'id': pd.read_csv('input/store_id_relation.csv'),
    'tes': pd.read_csv('input/sample_submission.csv'),
    'hol': pd.read_csv('input/date_info.csv').rename(columns={'calendar_date':'visit_date'})
    }

## About the reservation in test data

1. The last reservation time is 2017-04-22 23:00 while the last visit time is 2017-05-31
2. This model just merge test data with the reservation features, of which the lag is far larger than the part in training data
3. We only consider reservation taken 40 days before the visit date to avoid overfitting.

In [13]:
# Merge only on the hotels that both have hpg and air id, so the number of data decreases a lot
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

for df in ['ar','hr']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime']).dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime']).dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    # 1: sum  2: mean of diff and visitor number
    # Only take 40 days diff
    data[df] = data[df][data[df]['reserve_datetime_diff'] >= 40]
    tmp1 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1', 'reserve_visitors':'rv1'})
    tmp2 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])

In [322]:
# Extract day features from visit_date column
data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
data['tra']['year'] = data['tra']['visit_date'].dt.year
data['tra']['month'] = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

In [15]:
# Extract day features from sample submission
data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

In [16]:
# Group by store and dow, get the min, mean ... of each group
unique_stores = data['tes']['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)
#OPTIMIZED BY JEROME VALLET
tmp = data['tra'].groupby(['air_store_id','dow']).agg({'visitors' : [np.min,np.mean,np.median,np.max,np.size]}).reset_index()
tmp.columns = ['air_store_id', 'dow', 'min_visitors', 'mean_visitors', 'median_visitors','max_visitors','count_observations']
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

In [17]:
stores = pd.merge(stores, data['as'], how='left', on=['air_store_id']) 
# NEW FEATURES FROM Georgii Vyshnia
stores['air_genre_name'] = stores['air_genre_name'].map(lambda x: str(str(x).replace('/',' ')))
stores['air_area_name'] = stores['air_area_name'].map(lambda x: str(str(x).replace('-',' ')))
lbl = preprocessing.LabelEncoder()
for i in range(10):
    stores['air_genre_name'+str(i)] = lbl.fit_transform(stores['air_genre_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    stores['air_area_name'+str(i)] = lbl.fit_transform(stores['air_area_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

In [372]:
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date']) 
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date']) 

In [373]:
train = pd.merge(train, stores, how='left', on=['air_store_id','dow']) 
test = pd.merge(test, stores, how='left', on=['air_store_id','dow'])

In [326]:
train.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,...,air_genre_name5,air_area_name5,air_genre_name6,air_area_name6,air_genre_name7,air_area_name7,air_genre_name8,air_area_name8,air_genre_name9,air_area_name9
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1,6,0,7.0,23.84375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1,4,0,2.0,20.292308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1,0,0,4.0,34.738462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1,2,0,6.0,27.651515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1,1,0,2.0,13.754386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [374]:
# Add new year and golden week feature
newyear_dates = pd.date_range(datetime(2016,1,1),datetime(2016,1,5)).append(pd.date_range(datetime(2016,12,31),datetime(2017,1,5)))
goldweek = pd.date_range(datetime(2016,4,29),datetime(2016,5,5)).append(pd.date_range(datetime(2017,4,29),datetime(2017,5,5)))
train['visit_date'] = pd.to_datetime(train['visit_date'].values)
test['visit_date'] = pd.to_datetime(test['visit_date'].values)
train['newyear'] = train['visit_date'].isin(newyear_dates)
train['goldweek'] = train['visit_date'].isin(goldweek)
test['newyear'] = test['visit_date'].isin(newyear_dates)
test['goldweek'] = test['visit_date'].isin(goldweek)

In [375]:
# Merge true visit, holiday, store with air_reserve and hpg_reserve
# Big problem in this step
for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

In [376]:
# After merge air and hpg, we have rv and rs from two systems
train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1)

train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

In [377]:
# NEW FEATURES FROM JMBULL
# Interesting feature, take date as feature, because later time, the number of visitors is larger
train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']

In [378]:
# NEW FEATURES FROM Georgii Vyshnia
train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']

lbl = preprocessing.LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2'] = lbl.transform(test['air_store_id'])

train['hklee_feature'] = train_visit_var 
test['hklee_feature'] = test_visit_var

col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors']]

In [427]:
model3.booster().get_score(importance_type='weight')

{'air_area_name': 6949,
 'air_area_name0': 533,
 'air_area_name1': 615,
 'air_area_name2': 2405,
 'air_area_name3': 352,
 'air_area_name4': 3717,
 'air_area_name5': 497,
 'air_area_name6': 144,
 'air_genre_name': 3573,
 'air_genre_name0': 445,
 'air_genre_name1': 1914,
 'air_genre_name2': 114,
 'air_store_id2': 7420,
 'avg_humidity': 2027,
 'avg_local_pressure': 3564,
 'avg_sea_pressure': 1031,
 'avg_temperature': 3476,
 'avg_vapor_pressure': 2065,
 'avg_wind_speed': 3550,
 'cloud_cover': 1663,
 'count_observations': 6771,
 'date_int': 8281,
 'day_of_week': 1404,
 'dow': 1160,
 'goldweek': 107,
 'high_temperature': 2883,
 'hklee_feature': 10409,
 'holiday_flg': 325,
 'hours_sunlight': 2847,
 'latitude': 2252,
 'lon_plus_lat': 1290,
 'longitude': 1073,
 'low_temperature': 2617,
 'max_visitors': 7206,
 'mean_visitors': 7773,
 'median_visitors': 3657,
 'min_visitors': 3008,
 'month': 1452,
 'month0': 40,
 'month1': 188,
 'month10': 174,
 'month11': 61,
 'month2': 225,
 'month3': 140,
 'mo

In [412]:
index = len(train)
total = pd.concat([train, test], ignore_index=True)
temp = pd.get_dummies(total['dow'])
temp.columns = ["weekday" + str(i) for i in range(7)]
total = pd.concat([total, temp], axis = 1)
temp = pd.get_dummies(total['month'])
temp.columns = ["month" + str(i) for i in range(12)]
total = pd.concat([total, temp], axis = 1)
train, test = total.iloc[:index], total.iloc[index:]

In [312]:
train.drop(['dow','month'], axis=1, inplace=True)

In [417]:
# Why -1
train = train.fillna(-1)
test = test.fillna(-1)

In [418]:
weather_store = pd.read_csv('input/air_store_weather.csv',parse_dates=['calendar_date'])
weather_store = weather_store.rename(columns={'calendar_date':'visit_date'})

In [419]:
train['visit_date'] = pd.to_datetime(train['visit_date'].values)
test['visit_date'] = pd.to_datetime(test['visit_date'].values)

In [420]:
train = pd.merge(train, weather_store, on = ['air_store_id', 'visit_date'])
test = pd.merge(test, weather_store, on = ['air_store_id', 'visit_date'])

In [None]:
train.to_csv('input/train1.csv', index = False)
test.to_csv('input/test1.csv', index = False)

In [421]:
reserve_features = ['rs1_x', 'rv1_x', 'rs2_x', 'rv2_x',
       'rs1_y', 'rv1_y', 'rs2_y', 'rv2_y', 'id', 'total_reserv_sum',
       'total_reserv_mean', 'total_reserv_dt_diff_mean']
genre_area_features = ['air_genre_name3', 'air_area_name3', 'air_genre_name4',
       'air_area_name4', 'air_genre_name5', 'air_area_name5',
       'air_genre_name6', 'air_area_name6', 'air_genre_name7',
       'air_area_name7', 'air_genre_name8', 'air_area_name8',
       'air_genre_name9', 'air_area_name9']
#sub_train = train[[col for col in train.columns if col not in reserve_features and col not in genre_area_features]]
col = [c for c in train.columns if c not in ['id', 'air_store_id', 'visit_date','visitors']]

In [70]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

In [422]:
train.head()

Unnamed: 0,air_area_name,air_area_name0,air_area_name1,air_area_name2,air_area_name3,air_area_name4,air_area_name5,air_area_name6,air_area_name7,air_area_name8,...,low_temperature,precipitation,hours_sunlight,solar_radiation,avg_wind_speed,avg_vapor_pressure,avg_local_pressure,avg_humidity,avg_sea_pressure,cloud_cover
0,62.0,7.0,6.0,26.0,6.0,78.0,0.0,0.0,0.0,0.0,...,-0.766667,0.0,8.7,10.86,1.7,4.9,1010.1,60.0,1013.1,2.5
1,62.0,7.0,6.0,26.0,6.0,78.0,0.0,0.0,0.0,0.0,...,0.1,0.0,9.05,12.09,1.966667,4.6,1008.9,52.0,1011.9,0.5
2,62.0,7.0,6.0,26.0,6.0,78.0,0.0,0.0,0.0,0.0,...,2.533333,0.0,7.5,11.67,3.133333,5.7,1013.1,64.0,1016.1,6.0
3,62.0,7.0,6.0,26.0,6.0,78.0,0.0,0.0,0.0,0.0,...,2.266667,0.0,8.85,12.41,2.4,5.1,1015.8,54.0,1018.8,2.3
4,62.0,7.0,6.0,26.0,6.0,78.0,0.0,0.0,0.0,0.0,...,0.333333,54.3,1.25,2.4,4.433333,7.1,995.9,95.0,998.9,7.5


In [282]:
model3 = XGBRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8, colsample_bytree=0.8, max_depth =8)
model3.fit(train[sub_col], np.log1p(train['visitors'].values))
print('RMSE XgboostRegressor:', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[sub_col])))

RMSE XgboostRegressor: 0.468338734751


In [424]:
model1 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, max_depth =10)

model2 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=5)

model3 = XGBRegressor(learning_rate=0.2, n_estimators=200, subsample=0.8, colsample_bytree=0.8, max_depth =10)

model1.fit(train[col], np.log1p(train['visitors'].values))
print("Model1 fitted")
model2.fit(train[col], np.log1p(train['visitors'].values))
print("Model2 fitted")
model3.fit(train[col], np.log1p(train['visitors'].values))
print("Model3 fitted")
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), model1.predict(train[col])))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col])))
print('RMSE XgboostRegressor:', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[col])))

Model1 fitted
Model2 fitted
Model3 fitted
RMSE GradientBoostingRegressor:  0.313813030539
RMSE KNeighborsRegressor:  0.449829342612
RMSE XgboostRegressor: 0.342089287158


Knn 
K = 3: 0.614
K = 10: 0.539
K = 5: 0.482

In [449]:
# Cross validation on three random selection
model2 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, max_depth =10)
model1 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=5)
model3 = XGBRegressor(learning_rate=0.2, n_estimators=200, subsample=0.8, colsample_bytree=0.8, max_depth =10)
models = [model1, model2, model3]

train_index = np.random.randint(0, len(train), int(len(train)*0.85))
val_index = np.setdiff1d(np.arange(0, len(train)), train_index)
X_val, y_val = train.iloc[val_index][col], np.log1p(train.iloc[val_index]['visitors'].values)

for i in range(3):
    model = models[i]
    
    print(i)
    
    X_train, y_train = train.iloc[train_index][col], np.log1p(train.iloc[train_index]['visitors'].values)
    model.fit(X_train[col], y_train)
    print('RMSLE: ', RMSLE(y_val, model.predict(X_val)))

    
    

0
RMSLE:  0.590998219748
1
RMSLE:  0.516366702972
2
RMSLE:  0.510421400952


In [450]:
predKnn, predGb, predXg = models[0].predict(X_val), models[1].predict(X_val), models[2].predict(X_val)
hklee = np.log1p(X_val.hklee_feature)
ensemble_df = pd.DataFrame({"knn":predKnn, "gb":predGb, "xg":predXg, "hk":hklee})


In [447]:
en_model = XGBRegressor(learning_rate=0.2, n_estimators=100, subsample=0.8, colsample_bytree=0.8, max_depth = 10)
en_model.fit(ensemble_df.values, y_val)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [451]:
RMSLE(y_val, en_model.predict(ensemble_df.values))

0.50310988304320903

In [437]:
models[0].predict(X_val)

array([ 3.2672598 ,  2.57132926,  3.16675444, ...,  1.98544082,
        2.07966724,  2.19722458])

In [438]:
models[1].predict(X_val)

array([ 3.16590172,  2.25917637,  3.11480641, ...,  2.15567371,
        2.08111022,  1.41808837])

In [439]:
models[2].predict(X_val)

array([ 3.39294076,  2.2871356 ,  3.09318137, ...,  2.06250572,
        1.88974106,  1.78981686], dtype=float32)

In [440]:
y_val

array([ 3.40119738,  1.94591015,  3.29583687, ...,  1.09861229,
        1.94591015,  1.79175947])

In [442]:
np.log1p(X_val.hklee_feature)

2         3.519119
4         2.200934
9         3.184366
11        2.650192
12        2.915902
13        2.739233
16        2.915902
19        3.184366
20        2.200934
21        2.650192
22        2.915902
24        3.519119
26        2.200934
27        2.650192
28        2.915902
31        3.184366
32        2.200934
33        2.650192
43        3.184366
47        2.739233
50        2.200934
52        2.915902
56        2.650192
59        3.519119
60        3.184366
62        2.200934
63        2.650192
64        2.915902
67        3.184366
69        2.650192
            ...   
252035    1.928282
252037    1.889312
252044    1.743142
252051    1.791401
252052    1.928282
252055    1.889312
252056    1.743142
252059    1.917453
252060    1.906838
252061    1.889312
252062    1.743142
252064    1.928282
252065    1.917453
252069    1.791401
252071    1.917453
252073    1.889312
252074    1.743142
252076    1.928282
252079    1.889312
252083    1.906838
252084    1.889312
252085    1.

In [284]:
print(datetime.datetime.now())
#Bojan Tunguz / Surprise Me 2!
model1 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, max_depth =10)
model2 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=4)
#model3 = XGBRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, colsample_bytree=0.8, max_depth =10)

model1.fit(train[col], np.log1p(train['visitors'].values))
model2.fit(train[col], np.log1p(train['visitors'].values))
#model3.fit(train[col], np.log1p(train['visitors'].values))
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), model1.predict(train[col])))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col])))
#print('RMSE XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[col])))
#test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col]) + model3.predict(test[col])) / 3
test['visitors'] = model1.predict(test[col])*0.3 + model2.predict(test[col])*0.3 + model3.predict(test[col])*0.4
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
del train; del data;
print(datetime.datetime.now())

2018-01-29 15:58:36.000453
RMSE GradientBoostingRegressor:  0.344157162162
RMSE KNeighborsRegressor:  0.415170186118
2018-01-29 16:28:42.332897


0.3, 0.3, 0.4: 0.496

In [432]:
#test['visitors'] = model1.predict(test[col])*0.2 + model2.predict(test[col])*0.5 + model3.predict(test[col])*0.3
test['visitors'] = model1.predict(test[col])*0 + model2.predict(test[col])*0.5 + model3.predict(test[col])*0.5
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)

In [433]:
# from hklee
# https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
sub1 = test[['id','visitors']].copy()
dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
    pd.read_csv(fn)for fn in glob.glob('input/*.csv')}

for k, v in dfs.items(): locals()[k] = v

wkend_holidays = date_info.apply(
    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  

visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

wmean = lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )
visitors = visit_data.groupby(['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors.rename(columns={0:'visitors'}, inplace=True) # cumbersome, should be better ways.

sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(visitors, on=[
    'air_store_id', 'day_of_week', 'holiday_flg'], how='left')

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), 
    how='left')['visitors_y'].values

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), 
    on='air_store_id', how='left')['visitors_y'].values

sample_submission['visitors'] = sample_submission.visitors.map(pd.np.expm1)
sub2 = sample_submission[['id', 'visitors']].copy()
sub_merge = pd.merge(sub1, sub2, on='id', how='inner')

sub_merge['visitors'] = (sub_merge['visitors_x'] + sub_merge['visitors_y']* 1.1)/2
sub_merge[['id', 'visitors']].to_csv('submission.csv', index=False)