In [1]:
import time
import numpy as np
import pandas as pd
from dateutil.parser import parse
from datetime import date, timedelta
from sklearn.preprocessing import LabelEncoder

In [4]:
# Relative path of the directory that holds the raw CSV files read below.
data_path = "../rawdata/"

# 导入原始数据，预处理

In [5]:
# Load the raw tables. The AIR-/HPG-specific id columns are renamed to a common
# `store_id` so that the reservation, store and visit tables share one key name.
air_reserve = pd.read_csv(data_path + 'air_reserve.csv')
air_reserve = air_reserve.rename(columns={'air_store_id': 'store_id'})

hpg_reserve = pd.read_csv(data_path + 'hpg_reserve.csv')
hpg_reserve = hpg_reserve.rename(columns={'hpg_store_id': 'store_id'})

air_store = pd.read_csv(data_path + 'air_store_info.csv')
air_store = air_store.rename(columns={'air_store_id': 'store_id'})

hpg_store = pd.read_csv(data_path + 'hpg_store_info.csv')
hpg_store = hpg_store.rename(columns={'hpg_store_id': 'store_id'})

air_visit = pd.read_csv(data_path + 'air_visit_data.csv')
air_visit = air_visit.rename(columns={'air_store_id': 'store_id'})

# Relation table indexed by the HPG id; drop=False keeps `hpg_store_id`
# available as a regular column as well.
store_id_map = pd.read_csv(data_path + 'store_id_relation.csv')
store_id_map = store_id_map.set_index('hpg_store_id', drop=False)

# Calendar table: key renamed to `visit_date`, the textual weekday is dropped.
date_info = pd.read_csv(data_path + 'date_info.csv')
date_info = date_info.rename(columns={'calendar_date': 'visit_date'})
date_info = date_info.drop('day_of_week', axis=1)

submission = pd.read_csv(data_path + 'sample_submission.csv')

In [8]:
store_id_map.head()

Unnamed: 0_level_0,air_store_id,hpg_store_id
hpg_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1
hpg_4bc649e72e2a239a,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a
hpg_c34b496d0305a809,air_a24bf50c3e90d583,hpg_c34b496d0305a809
hpg_cd8ae0d9bbd58ff9,air_c7f78b4f3cba33ff,hpg_cd8ae0d9bbd58ff9
hpg_de24ea49dc25d6b8,air_947eb2cae4f3e8f2,hpg_de24ea49dc25d6b8
hpg_653238a84804d8e7,air_965b2e0cf4119003,hpg_653238a84804d8e7


In [12]:
air_reserve.head()

Unnamed: 0,store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,reserve_date,dow
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1,2016-01-01,2016-01-01,4
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3,2016-01-01,2016-01-01,4
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6,2016-01-01,2016-01-01,4
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2,2016-01-01,2016-01-01,4
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5,2016-01-01,2016-01-01,4


In [11]:
# Visit date, reservation date and day of week.
# A submission id is the store id, an underscore and a 10-character date, so
# the last 10 characters are the date and everything before the separator is
# the store id.
submission['visit_date'] = submission['id'].str[-10:]
submission['store_id'] = submission['id'].str[:-11]

# Both reservation tables get the same three derived columns: the date part
# (first 10 characters) of each timestamp string and the visit day of week.
for reserve in (air_reserve, hpg_reserve):
    reserve['visit_date'] = reserve['visit_datetime'].str[:10]
    reserve['reserve_date'] = reserve['reserve_datetime'].str[:10]
    reserve['dow'] = pd.to_datetime(reserve['visit_date']).dt.dayofweek

In [13]:
# Give the visit table an `id` column in the same store_id + '_' + visit_date
# form that the submission ids use.
# NOTE(review): presumably so both frames expose the same columns when they
# are concatenated later - confirm.
air_visit['id'] = air_visit['store_id'].str.cat(air_visit['visit_date'], sep='_')

In [24]:
# Replace HPG store ids with AIR ids where a relation exists.
# store_id_map['air_store_id'] is a Series of AIR ids indexed by HPG id, so
# Series.map translates each HPG id; ids without an AIR counterpart map to NaN
# and fall back to the original HPG id.
hpg_to_air = store_id_map['air_store_id']
mapped_reserve_ids = hpg_reserve['store_id'].map(hpg_to_air)
hpg_reserve['store_id'] = mapped_reserve_ids.fillna(hpg_reserve['store_id'])

In [26]:
# Same id translation for the HPG store table, then rename its genre/area
# columns to the AIR column names so no HPG-specific naming remains.
mapped_store_ids = hpg_store['store_id'].map(store_id_map['air_store_id'])
hpg_store['store_id'] = mapped_store_ids.fillna(hpg_store['store_id'])
hpg_store.rename(
    columns={
        'hpg_genre_name': 'air_genre_name',
        'hpg_area_name': 'air_area_name',
    },
    inplace=True,
)

In [29]:
# Stack the training visits and the submission rows so that both receive the
# same preprocessing.
# The two frames list their columns in a different order. `sort=True` pins the
# alphabetical column order that older pandas produced by default, so the
# result no longer depends on the pandas version and the FutureWarning about
# the changing default is gone.
# pd.concat already returns a new object, so no extra .copy() is needed.
data = pd.concat([air_visit, submission], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [30]:
# Day of week of each visit as an integer (pandas convention: Monday=0).
visit_day = pd.to_datetime(data['visit_date'])
data['dow'] = visit_day.dt.dayofweek

In [31]:
data.head()

Unnamed: 0,id,store_id,visit_date,visitors,dow
0,air_ba937bf13d40fb24_2016-01-13,air_ba937bf13d40fb24,2016-01-13,25,2
1,air_ba937bf13d40fb24_2016-01-14,air_ba937bf13d40fb24,2016-01-14,32,3
2,air_ba937bf13d40fb24_2016-01-15,air_ba937bf13d40fb24,2016-01-15,29,4
3,air_ba937bf13d40fb24_2016-01-16,air_ba937bf13d40fb24,2016-01-16,22,5
4,air_ba937bf13d40fb24_2016-01-18,air_ba937bf13d40fb24,2016-01-18,6,0


In [32]:
date_info.head()

Unnamed: 0,visit_date,holiday_flg
0,2016-01-01,1
1,2016-01-02,1
2,2016-01-03,1
3,2016-01-04,0
4,2016-01-05,0


In [33]:
# holiday_flg2 is 1 when the date is a weekend day (dayofweek > 4) or is
# flagged as a holiday in the calendar table, otherwise 0.
is_weekend = pd.to_datetime(date_info['visit_date']).dt.dayofweek > 4
is_holiday = date_info['holiday_flg'] == 1
date_info['holiday_flg2'] = (is_weekend | is_holiday).astype(int)

In [34]:
# Keep only the first space-separated token of the area name
# (e.g. 'Tōkyō-to' from 'Tōkyō-to Minato-ku Shibakōen').
air_store['air_area_name0'] = air_store['air_area_name'].map(
    lambda area: area.split(' ', 1)[0]
)

In [36]:
# Integer-encode the restaurant genre and the leading area token.
# The encoder is re-fitted for each column, so after this cell `lbl` holds the
# classes of the last column only.
lbl = LabelEncoder()
for column in ('air_genre_name', 'air_area_name0'):
    air_store[column] = lbl.fit_transform(air_store[column])

In [38]:
data.head()

Unnamed: 0,id,store_id,visit_date,visitors,dow
0,air_ba937bf13d40fb24_2016-01-13,air_ba937bf13d40fb24,2016-01-13,25,2
1,air_ba937bf13d40fb24_2016-01-14,air_ba937bf13d40fb24,2016-01-14,32,3
2,air_ba937bf13d40fb24_2016-01-15,air_ba937bf13d40fb24,2016-01-15,29,4
3,air_ba937bf13d40fb24_2016-01-16,air_ba937bf13d40fb24,2016-01-16,22,5
4,air_ba937bf13d40fb24_2016-01-18,air_ba937bf13d40fb24,2016-01-18,6,0


In [39]:
# Replace the visitor counts by log(1 + x).
# Not idempotent: re-running this cell applies the transform a second time.
visitors_log = np.log1p(data['visitors'])
data['visitors'] = visitors_log

In [40]:
air_store.head()

Unnamed: 0,store_id,air_genre_name,air_area_name,latitude,longitude,air_area_name0
0,air_0f0cdeee6c9bf3d7,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
1,air_7cc17a324ae5c7dc,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
2,air_fee8dcf4d619598e,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
3,air_a17f0778617c76e2,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
4,air_83db5aff8f50478e,6,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7


In [41]:
# Attach the store attributes to every row by store id; the left join keeps
# all rows of `data`.
data = pd.merge(data, air_store, how='left', on='store_id')

In [42]:
data.head()

Unnamed: 0,id,store_id,visit_date,visitors,dow,air_genre_name,air_area_name,latitude,longitude,air_area_name0
0,air_ba937bf13d40fb24_2016-01-13,air_ba937bf13d40fb24,2016-01-13,3.258097,2,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7
1,air_ba937bf13d40fb24_2016-01-14,air_ba937bf13d40fb24,2016-01-14,3.496508,3,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7
2,air_ba937bf13d40fb24_2016-01-15,air_ba937bf13d40fb24,2016-01-15,3.401197,4,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7
3,air_ba937bf13d40fb24_2016-01-16,air_ba937bf13d40fb24,2016-01-16,3.135494,5,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7
4,air_ba937bf13d40fb24_2016-01-18,air_ba937bf13d40fb24,2016-01-18,1.94591,0,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7


In [43]:
# Attach the two holiday flags to every row by visit date (left join).
holiday_cols = date_info[['visit_date', 'holiday_flg', 'holiday_flg2']]
data = pd.merge(data, holiday_cols, how='left', on=['visit_date'])

In [44]:
data

Unnamed: 0,id,store_id,visit_date,visitors,dow,air_genre_name,air_area_name,latitude,longitude,air_area_name0,holiday_flg,holiday_flg2
0,air_ba937bf13d40fb24_2016-01-13,air_ba937bf13d40fb24,2016-01-13,3.258097,2,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
1,air_ba937bf13d40fb24_2016-01-14,air_ba937bf13d40fb24,2016-01-14,3.496508,3,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
2,air_ba937bf13d40fb24_2016-01-15,air_ba937bf13d40fb24,2016-01-15,3.401197,4,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
3,air_ba937bf13d40fb24_2016-01-16,air_ba937bf13d40fb24,2016-01-16,3.135494,5,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,1
4,air_ba937bf13d40fb24_2016-01-18,air_ba937bf13d40fb24,2016-01-18,1.945910,0,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
5,air_ba937bf13d40fb24_2016-01-19,air_ba937bf13d40fb24,2016-01-19,2.302585,1,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
6,air_ba937bf13d40fb24_2016-01-20,air_ba937bf13d40fb24,2016-01-20,3.465736,2,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
7,air_ba937bf13d40fb24_2016-01-21,air_ba937bf13d40fb24,2016-01-21,3.091042,3,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
8,air_ba937bf13d40fb24_2016-01-22,air_ba937bf13d40fb24,2016-01-22,2.944439,4,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
9,air_ba937bf13d40fb24_2016-01-23,air_ba937bf13d40fb24,2016-01-23,3.295837,5,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,1


# 工具函数

In [45]:
# Column-wise combination of feature frames.
# Frames later in the list overwrite same-named columns written by earlier ones.
def concat(L):
    """Merge a sequence of DataFrames column-wise into the first one.

    The first non-None element of ``L`` becomes the result and is modified in
    place: every following frame has its columns assigned onto it, so a later
    frame overwrites columns of the same name. Assignment goes through
    ``result[cols] = frame``, i.e. pandas aligns the values on the index.

    The merge is best-effort: an element that cannot be assigned is reported
    (error plus a preview of the element) and skipped instead of aborting the
    whole merge.

    Parameters
    ----------
    L : iterable of pandas.DataFrame
        Frames to combine, in priority order (last one wins).

    Returns
    -------
    pandas.DataFrame or None
        The first frame, extended in place; ``None`` when ``L`` is empty.
    """
    result = None
    for frame in L:
        if result is None:
            result = frame
            continue
        try:
            result[frame.columns.tolist()] = frame
        except Exception as err:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt/SystemExit still stop a long feature build.
            print('concat: skipped a frame ({}: {})'.format(type(err).__name__, err))
            # The offending element may not be a DataFrame at all; make sure
            # the diagnostic print cannot raise on its own.
            if hasattr(frame, 'head'):
                print(frame.head())
            else:
                print(repr(frame))
    return result

In [None]:

def left_merge(data1, data2, on):
    """Return the non-key columns of ``data2`` aligned to the rows of ``data1``.

    ``data1`` is left-joined with ``data2`` on ``on`` and only the columns that
    come from ``data2`` (excluding the keys) are returned.

    Only the key columns of ``data1`` take part in the join. This keeps a
    non-key column name shared by both frames from being renamed with the
    ``_x``/``_y`` suffixes, which previously made the final column selection
    fail with ``KeyError``.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Left frame; must contain every key in ``on`` as a column.
    data2 : pandas.DataFrame
        Right frame; the keys may be columns or live in its index.
    on : label or list of labels
        Join key(s). A single label is wrapped into a list.

    Returns
    -------
    pandas.DataFrame
        ``data2``'s non-key columns in left-join row order, with a fresh
        integer index. Rows of ``data1`` without a match hold NaN; duplicated
        keys in ``data2`` produce additional rows, as with any left join.
    """
    if not isinstance(on, list):
        on = [on]
    # When some key is not a column of data2, assume the keys are stored in
    # its index and turn the index into regular columns first.
    if set(on).issubset(data2.columns):
        data2_temp = data2
    else:
        data2_temp = data2.reset_index()
    columns = [f for f in data2.columns if f not in on]
    result = data1[on].merge(data2_temp, on=on, how='left')
    return result[columns]

In [48]:
# Shift a date string by a number of days and return the new date string.
def date_add_days(start_date, days):
    """Return the date ``days`` days after ``start_date`` as 'YYYY-MM-DD'.

    Only the first 10 characters of ``start_date`` are parsed, so any trailing
    time part is ignored. ``days`` may be negative to move backwards.
    """
    base_day = parse(start_date[:10])
    offset = timedelta(days=days)
    return (base_day + offset).strftime('%Y-%m-%d')

In [49]:
# A bare comma-separated expression builds a tuple, so `a` is (1, 2).
a = (1, 2)

In [None]:
def make_feats(end_date,n_day):
    """Assemble the feature table for one (end_date, n_day) window.

    Steps: build the label rows with ``get_label``, compute every feature
    group from them, combine all frames column-wise with ``concat`` and pass
    the result through ``second_feat``. Progress and timing are printed.

    ``end_date`` and ``n_day`` are forwarded to ``get_label`` and packed into
    the tuple ``key`` that every feature builder receives; their exact meaning
    is defined by those helpers, which are not part of this cell.
    NOTE(review): the third argument of the builders looks like a look-back
    window in days - confirm against the helper definitions.

    Returns the object produced by ``second_feat``.
    """
    started = time.time()
    key = (end_date, n_day)
    print('data key为：{}'.format(key))
    print('add label')
    label = get_label(end_date, n_day)

    print('make feature...')
    frames = [
        label,
        # store features
        get_store_visitor_feat(label, key, 1000),
        get_store_visitor_feat(label, key, 56),
        get_store_visitor_feat(label, key, 28),
        get_store_visitor_feat(label, key, 14),
        # store exp features
        get_store_exp_visitor_feat(label, key, 1000),
        # store dow features
        get_store_week_feat(label, key, 1000),
        get_store_week_feat(label, key, 56),
        get_store_week_feat(label, key, 28),
        get_store_week_feat(label, key, 14),
        # store dow diff features
        # NOTE(review): 58 here while the sibling calls use 56 - confirm intent.
        get_store_week_diff_feat(label, key, 58),
        get_store_week_diff_feat(label, key, 1000),
        # store all week feat
        get_store_all_week_feat(label, key, 1000),
        # store dow exp feat
        get_store_week_exp_feat(label, key, 1000),
        # store holiday feat
        get_store_holiday_feat(label, key, 1000),
        # genre features
        get_genre_visitor_feat(label, key, 1000),
        get_genre_visitor_feat(label, key, 56),
        get_genre_visitor_feat(label, key, 28),
        get_genre_exp_visitor_feat(label, key, 1000),
        # genre dow features
        get_genre_week_feat(label, key, 1000),
        get_genre_week_feat(label, key, 56),
        get_genre_week_feat(label, key, 28),
        # genre dow exp feature
        get_genre_week_exp_feat(label, key, 1000),
        # air_reserve
        get_reserve_feat(label, key),
        # first time and last time
        get_first_last_time(label, key, 1000),
        # The label frame goes in once more at the end: concat lets later
        # frames win, so the label columns are the last ones written.
        label,
    ]

    print('merge...')
    result = second_feat(concat(frames))

    print('data shape：{}'.format(result.shape))
    print('spending {}s'.format(time.time() - started))
    return result

# 开始训练

In [46]:
import datetime
import lightgbm as lgb

In [47]:
# Anchor date of the most recent training window, and the (initially empty)
# frame that collects the training features.
start_date = '2017-03-12'
train_feat = pd.DataFrame()

In [None]:
# Build one feature table per weekly window: window i ends i*7 days before
# `start_date`, for 58 windows, each created with n_day=39.
# The tables are collected in a list and concatenated once, instead of calling
# pd.concat inside the loop, which re-copies the accumulated frame every time.
train_feat_parts = [train_feat]
for i in range(58):
    train_feat_sub = make_feats(date_add_days(start_date, i*(-7)),39)
    train_feat_parts.append(train_feat_sub)
train_feat = pd.concat(train_feat_parts)