In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.preprocessing import *
import datetime
import gc
from numpy import inf
from tqdm import tqdm

In [2]:
# Load the pre-transformed inputs. `link_ID` is forced to str in every file so
# that the later merges on `link_ID` compare like with like.
train_df = pd.read_csv(
    './data/transformed/train_data.csv',
    parse_dates=['date', 'time_start'],
    dtype={'link_ID': str},
)
link_top = pd.read_csv(
    './data/transformed/link_top.csv',
    dtype={'link_ID': str},
)
link_info = pd.read_csv(
    './data/transformed/link_info.csv',
    dtype={'link_ID': str},
)

In [3]:
# Order rows by link and then by start time; the groupby first()/last()/tail()
# calls further down depend on the row order inside each group.
train_df = train_df.sort_values(by=['link_ID', 'time_start'])

In [4]:
# One group per (link, day, hour); consumed by the previous-hour feature loop.
grp = train_df.groupby(
    by=['link_ID', 'date', 'time_start_hour'],
)

In [5]:
## Add the statistics of the previous hour
def last_hour_stat(grp,prefix):
    """Summarise `travel_time` per group and shift the result one hour forward.

    Parameters
    ----------
    grp : DataFrame groupby keyed by `link_ID`, `date`, `time_start_hour`
        and exposing a `travel_time` column.
    prefix : value appended (as `_<prefix>`) to every statistic column name.

    Returns
    -------
    DataFrame with the three key columns followed by `mean`, `max`, `min`,
    `median`, `quater` (25% quantile), `quater3` (75% quantile) and
    `increment` (last minus first value of the group), each carrying the
    `_<prefix>` suffix. `time_start_hour` is increased by one so the row can
    be joined onto the following hour; hour 24 becomes hour 0 of the next day.
    """
    keys = ['link_ID', 'date', 'time_start_hour']
    travel = grp['travel_time']

    # Every piece is indexed by the same group keys, so a column-wise concat
    # lines them up without any key-based merge.
    stats = pd.concat(
        [
            travel.agg(['mean', 'max', 'min', 'median']),
            travel.quantile(0.25).rename('quater'),
            travel.quantile(0.75).rename('quater3'),
            (travel.last() - travel.first()).rename('increment'),
        ],
        axis=1,
    ).reset_index()

    # Shift to the next hour; an hour of 24 rolls over to 0 on the next date.
    stats['time_start_hour'] += 1
    rollover = stats['time_start_hour'] == 24
    stats.loc[rollover, 'time_start_hour'] = 0
    stats.loc[rollover, 'date'] += pd.Timedelta('1 days')

    # Tag every non-key column with the caller-supplied suffix.
    return stats.rename(
        columns=lambda name: name if name in keys else name + '_{0}'.format(prefix)
    )

In [6]:
def hist_time_stat(data,groupby_key=['time_start_hour','time_start_min'],time_window=7):
    """Summarise the last `time_window` `travel_time` rows of each monthly slot.

    Rows are grouped by `link_ID`, `month` and the columns in `groupby_key`;
    only the final `time_window` rows of each group (in the current row order
    of `data`) enter the statistics.

    Parameters
    ----------
    data : DataFrame containing `link_ID`, `month`, `travel_time` and every
        column named in `groupby_key`.
    groupby_key : extra grouping columns appended to `link_ID` and `month`.
        The default list is only read, never modified.
    time_window : number of trailing rows kept per group.

    Returns
    -------
    DataFrame with the key columns followed by `mean`, `max`, `min`,
    `median`, `quater` (25% quantile), `quater3` (75% quantile) and
    `increment` (last minus first), each suffixed `_min_<time_window>` when
    there are four key columns and `_quater_<time_window>` otherwise.
    `month` is increased by one so the row joins onto the following month;
    there is no wrap-around from 12 back to 1.
    """
    key = ['link_ID', 'month'] + list(groupby_key)

    recent = data.groupby(key).tail(time_window)
    travel = recent.groupby(key)['travel_time']

    # All pieces share the group-key index, so concat aligns them directly.
    stats = pd.concat(
        [
            travel.agg(['mean', 'max', 'min', 'median']),
            travel.quantile(0.25).rename('quater'),
            travel.quantile(0.75).rename('quater3'),
            (travel.last() - travel.first()).rename('increment'),
        ],
        axis=1,
    ).reset_index()

    # Attach the statistics to the month after the one they were computed on.
    stats['month'] += 1

    # The tag is chosen purely from the number of grouping columns.
    prefix = 'min' if len(key) == 4 else 'quater'
    suffix = '_' + prefix + '_{0}'.format(time_window)
    return stats.rename(
        columns=lambda name: name if name in key else name + suffix
    )

In [7]:
# Previous-hour features. For each window size i the statistics are taken over
# the last i rows of every (link, day, hour) group; the final `None` entry
# uses the whole hour and is suffixed `_0`.
#
# The trailing rows are selected with GroupBy.tail(i) rather than
# GroupBy.nth([-i, ..., -1]).reset_index(): since pandas 2.0 `nth` keeps the
# original row index instead of the group keys, so the old expression no
# longer exposes link_ID/date/time_start_hour for the second groupby.
# tail(i) picks the same rows on every pandas version and returns the key
# columns along with them.
hour_keys = ['link_ID', 'date', 'time_start_hour']
bar = tqdm([3,7,12,16,23,None])
for i in bar:
    if i is None:
        x = last_hour_stat(grp, 0)
    else:
        x = last_hour_stat(grp.tail(i).groupby(hour_keys), i)
    # last_hour_stat already shifted the keys by one hour, so this left join
    # attaches each hour's statistics to the rows of the following hour.
    train_df = pd.merge(train_df, x, how='left', on=hour_keys)

100%|██████████| 6/6 [11:02<00:00, 110.56s/it]


In [8]:
# Drop every row whose start hour is 6, 13 or 16.
# NOTE(review): presumably these are hours with no usable previous-hour
# statistics -- confirm against the data's time coverage.
train_df = train_df[~train_df.time_start_hour.isin([6, 13, 16])]

In [9]:
# Attach the static per-link attributes from link_info to every row.
train_df = train_df.merge(link_info, how='left', on=['link_ID'])

In [10]:
# Geometry features built from the `length` and `width` columns.
length = train_df['length']
width = train_df['width']

# Descending ranks (ties receive the highest rank of their tie group).
# NOTE(review): the rank is taken over every row of train_df, not over unique
# links, and the int32 cast will fail if either column contains NaN -- confirm
# both are intended.
train_df['length_rank'] = length.rank(method='max', ascending=False).astype('int32')
train_df['width_rank'] = width.rank(method='max', ascending=False).astype('int32')

# NOTE(review): only `length` is cast to int16 here, not the product --
# confirm that is the intended precedence.
train_df['length*width'] = width * length.astype('int16')

# NOTE(review): despite the column name this stores width / length.
train_df['length/width'] = width / length

In [11]:
# Attach in/out link counts from link_top and derive topology features.
train_df = train_df.merge(
    link_top[['link_ID', 'in_link_num', 'out_link_num']],
    how='left',
    on=['link_ID'],
)
in_num = train_df['in_link_num']
out_num = train_df['out_link_num']

train_df['in*out'] = in_num * out_num
train_df['out-in'] = out_num - in_num
train_df['out-in/length'] = (out_num - in_num) / train_df['length']
# NOTE(review): the divisor is the existing 'length/width' column, not a
# length*width product as the new column's name suggests -- confirm.
train_df['out-in/length*width'] = (out_num - in_num) / train_df['length/width']
# A division by zero yields +inf, which is mapped to 0; NaN is left as is.
train_df['out/in'] = (out_num / in_num).replace({inf: 0})

In [12]:
# Previous-month features for several trailing-window sizes. Both statistic
# tables are computed from train_df as it stands at the start of the
# iteration and are then left-joined onto it: one keyed by
# (link, month, hour, minute), the other by (link, month, quarter_trans).
window_list = tqdm([3,7,12,20,27])
for t in window_list:
    min_stat = hist_time_stat(train_df, time_window=t)
    quater_stat = hist_time_stat(train_df, groupby_key=['quarter_trans'], time_window=t)
    train_df = (
        train_df
        .merge(min_stat, how='left',
               on=['link_ID', 'month', 'time_start_hour', 'time_start_min'])
        .merge(quater_stat, how='left',
               on=['link_ID', 'month', 'quarter_trans'])
    )

100%|██████████| 5/5 [07:35<00:00, 91.52s/it]


In [13]:
# Persist the finished feature table; the row index is not written.
train_df.to_csv(
    path_or_buf='./data/feature/train.csv',
    index=False,
)