In [1]:
import pandas as pd
import numpy as np
import os
from scipy.interpolate import UnivariateSpline
from sklearn import linear_model
import xgboost as xgb
from sklearn.externals import joblib
from sklearn.utils import *



In [2]:
pd.set_option('display.max_rows',150)
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)

In [3]:
# 读取处理好的特征数据
df = pd.read_csv('data/trainning.txt', delimiter=';',parse_dates=['time_interval_begin'],dtype={'link_ID':object})
df.head()

Unnamed: 0,link_ID,date,time_interval_begin,travel_time,imputation1,lagging1,lagging2,lagging3,lagging4,lagging5,length,area,vacation,minute_series,day_of_week,day_of_week_en,hour_en,"week_hour_1.0,1.0","week_hour_1.0,2.0","week_hour_1.0,3.0","week_hour_2.0,1.0","week_hour_2.0,2.0","week_hour_2.0,3.0","week_hour_3.0,1.0","week_hour_3.0,2.0","week_hour_3.0,3.0",links_num_2,links_num_3,links_num_4,links_num_5,width_3,width_6,width_9,width_12,width_15,link_ID_en
0,3377906280028510514,2017-03-01,2017-03-01 06:00:00,1.659311,True,,,,,,48,144,0.0,0.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
1,3377906280028510514,2017-03-01,2017-03-01 06:02:00,1.664941,True,1.659311,,,,,48,144,0.0,2.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
2,3377906280028510514,2017-03-01,2017-03-01 06:04:00,1.671675,True,1.664941,1.659311,,,,48,144,0.0,4.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
3,3377906280028510514,2017-03-01,2017-03-01 06:06:00,1.676886,True,1.671675,1.664941,1.659311,,,48,144,0.0,6.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
4,3377906280028510514,2017-03-01,2017-03-01 06:08:00,1.682314,True,1.676886,1.671675,1.664941,1.659311,,48,144,0.0,8.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47


In [4]:
# 时间序列特征
lagging = 5
lagging_feature = ['lagging%01d' % e for e in range(lagging, 0, -1)]
lagging_feature

['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']

In [5]:
base_feature = [x for x in df.columns.values.tolist() if x not in ['time_interval_begin',
                                                                  'link_ID','link_ID_int',
                                                                  'date','travel_time',
                                                                  'imputationl','minute_series',
                                                                  'area','hour_en',
                                                                   'day_of_week']]

In [6]:
base_feature = [x for x in base_feature if x not in lagging_feature]

In [7]:
train_feature = list(base_feature)
train_feature.extend(lagging_feature)
valid_feature = list(base_feature)
valid_feature.extend(['minute_series', 'travel_time'])
print(train_feature)

['imputation1', 'length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']


xgboost训练参数：

In [8]:
params_grid = {
    'learning_rate':[0.05],
    'n_estimators':[100],
    'subsample':[0.6],
    'colsample_bytree':[0.6],
    'max_depth':[7],
    'min_child_weight':[1],
    'reg_alpha':[2],
    'gamma':[0]
}

In [9]:
from sklearn.model_selection import ParameterGrid
grid = ParameterGrid(params_grid)

In [10]:
def bucket_data(lines):
    bucket = {}
    for line in lines:
        time_series = line[-2]
        bucket[time_series] = []
    for line in lines:
        time_series, y1 = line[-2:]
        line = np.delete(line, -2, axis=0)
        bucket[time_series].append(line)
    return bucket


def cross_valid(regressor, bucket, lagging):
    valid_loss = []
    last = [[] for i in range(len(bucket[list(bucket.keys())[0]]))]
    for time_series in sorted(bucket.keys(), key=float):
        if time_series >= 120:
            if int(time_series) in range(120,120+lagging*2,2):
                last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1,1)),axis=1)
            else:
                batch = np.array(bucket[time_series], dtype=float)
                y = batch[:,-1]
                batch = np.delete(batch, -1, axis=1)
                batch = np.concatenate((batch, last), axis=1)
                y_pre = regressor.predict(batch)
                last = np.delete(last, 0, axis=1)
                last = np.concatenate((last, y_pre.reshape(-1,1)),axis=1)
                loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre))/np.expm1(y))
                valid_loss.append(loss)
    return np.mean(valid_loss)


def mape_ln(y, d):
    c = d.get_label()
    result = np.sum(np.abs((np.expm1(y)-np.expm1(c))/np.expm1(c)))/len(c)
    return 'mape', result


def submission(train_feature, regressor,df, file1,file2,file3,file4):
    test_df = df.loc[((df['time_interval_begin'].dt.year==2017)&(df['time_interval_begin'].dt.month==7)
                     &(df['time_interval_begin'].dt.hour.isin([7,14,17]))
                      &(df['time_interval_begin'].dt.minute==58))].copy()
    test_df['lagging5'] = test_df['lagging4']
    test_df['lagging4'] = test_df['lagging3']
    test_df['lagging3'] = test_df['lagging2']
    test_df['lagging2'] = test_df['lagging1']
    test_df['lagging1'] = test_df['travel_time']
    with open(file1, 'w'):
        pass
    with open(file2, 'w'):
        pass
    with open(file3, 'w'):
        pass
    with open(file4, 'w'):
        pass
    for i in range(30):
        test_X = test_df[train_feature]
        y_prediction = regressor.predict(test_X.values)
        test_df['lagging5'] = test_df['lagging4']
        test_df['lagging4'] = test_df['lagging3']
        test_df['lagging3'] = test_df['lagging2']
        test_df['lagging2'] = test_df['lagging1']
        test_df['lagging1'] = y_prediction
        
        test_df['prediction'] = np.expm1(y_prediction)
        test_df['time_interval_begin'] = test_df['time_interval_begin']+pd.DateOffset(minutes=2)
        test_df['time_interval'] = test_df['time_interval_begin'].map(
            lambda x: '[' + str(x)+','+str(x+pd.DateOffset(minutes=2))+')')
        test_df.time_interval = test_df.time_interval.astype(object)
        if i < 7:
            test_df[['link_ID','date','time_interval','prediction']].to_csv(file1,mode='a',
                                                                          header=False,
                                                                          index=False,
                                                                          sep=';')
        elif (7 <= i) and (i < 14):
            test_df[['link_ID','date','time_interval','prediction']].to_csv(file2,mode='a',
                                                                          header=False,
                                                                          index=False,
                                                                          sep=';')
        elif (14 <= i) and (i < 22):
            test_df[['link_ID','date','time_interval','prediction']].to_csv(file1,mode='a',
                                                                          header=False,
                                                                          index=False,
                                                                          sep=';')
        else:
            test_df[['link_ID','date','time_interval','prediction']].to_csv(file4,mode='a',
                                                                          header=False,
                                                                          index=False,
                                                                          sep=';')

训练模块

In [11]:
from sklearn.model_selection import train_test_split
def fit_evaluate(df, df_test, params):
    df = df.dropna()
    X = df[train_feature].values
    y = df['travel_time'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    
    df_test = df_test[valid_feature].values
    valid_data = bucket_data(df_test)
    
    eval_set = [(X_test, y_test)]
    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],
                             n_estimators=params['n_estimators'],
                            booster='gbtree', objective='reg:linear',
                            n_jobs=-1,subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            random_state=0,max_depth=params['max_depth'],
                            gamma=params['gamma'],
                             min_child_weight=params['min_child_weight'],
                            reg_alpha=params['reg_alpha'])
    regressor.fit(X_train,y_train,verbose=False,early_stopping_rounds=10,eval_set=eval_set)
    return regressor, cross_valid(regressor, valid_data, lagging=lagging), regressor.best_iteration,regressor.best_score

In [12]:
def train(df, params, best, vis=False):
    train1 = df.loc[df['time_interval_begin'] <= pd.to_datetime('2017-03-24')]
    train2 = df.loc[
        (df['time_interval_begin']>pd.to_datetime('2017-03-24'))&(
        df['time_interval_begin'] <= pd.to_datetime('2017-04-18'))]
    train3 = df.loc[
        (df['time_interval_begin']>pd.to_datetime('2017-04-18'))&(
        df['time_interval_begin'] <= pd.to_datetime('2017-05-12'))]
    train4 = df.loc[
        (df['time_interval_begin']>pd.to_datetime('2017-05-12'))&(
        df['time_interval_begin'] <= pd.to_datetime('2017-06-06'))]
    train5 = df.loc[
        (df['time_interval_begin']>pd.to_datetime('2017-06-06'))&(
        df['time_interval_begin'] <= pd.to_datetime('2017-06-30'))]
    
    regressor, loss1, best_iteration1,best_score1 = fit_evaluate(pd.concat([train1,
                                                                            train2,
                                                                           train3,
                                                                           train4]),train5,
                                                                params)
    print(best_iteration1,best_score1,loss1)
    
    regressor, loss2, best_iteration2,best_score2 = fit_evaluate(pd.concat([train1,
                                                                            train2,
                                                                           train3,
                                                                           train5]),train4,
                                                                params)    
    print(best_iteration2,best_score2,loss2)
    
    regressor, loss3, best_iteration3,best_score3 = fit_evaluate(pd.concat([train1,
                                                                            train2,
                                                                           train4,
                                                                           train5]),train3,
                                                                params)    
    print(best_iteration3,best_score3,loss3) 

    regressor, loss4, best_iteration4,best_score4 = fit_evaluate(pd.concat([train1,
                                                                           train3,
                                                                           train4,
                                                                           train5]),train2,
                                                                params) 
    print(best_iteration4,best_score4,loss4)     

    regressor, loss5, best_iteration5,best_score5 = fit_evaluate(pd.concat([train2,
                                                                           train3,
                                                                           train4,
                                                                           train5]),train1,
                                                                params)
    print(best_iteration5,best_score5,loss5) 
    
    loss = [loss1,loss2, loss3, loss4, loss5]
    params['loss_std'] = np.std(loss)
    params['loss'] = str(loss)
    params['mean_loss'] = np.mean(loss)
    params['n_estimators'] = str([best_iteration1, best_iteration2, best_iteration3,
                                 best_iteration4, best_iteration5])
    params['best_score'] = str([best_score1, best_score2, best_score3,
                                 best_score4, best_score5])
    
    print(str(params))
    if np.mean(loss) <= best:
        best = np.mean(loss)
        print('best with:' + str(params))
    return best

In [13]:
best = 1
for params in grid:
    best = train(df, params, best)

99 0.231729 0.09787323564628972
99 0.211948 0.22588986922596394
99 0.207832 0.269828138777363
99 0.205743 0.27878690843594917
99 0.206546 0.2825731100341743
{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743, 0.206546]'}
best with:{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[99, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06956988861011186, 'loss': '[0.09787323564628972, 0.22588986922596394, 0.269828138777363, 0.27878690843594917, 0.2825731100341743]', 'mean_loss': 0.23099025242394805, 'best_score': '[0.231729, 0.211948, 0.207832, 0.205743,

In [14]:
submit_params = {
    'learning_rate':0.05,
    'n_estimators':100,
    'subsample':0.6,
    'colsample_bytree':0.6,
    'max_depth':7,
    'min_child_weight':1,
    'reg_alpha':2,
    'gamma':0
}

In [15]:
def xgboost_submit(df, params):
    train_df = df.loc[df['time_interval_begin']<pd.to_datetime('2017-07-01')]
    
    train_df = train_df.dropna()
    X = train_df[train_feature].values
    y = train_df['travel_time'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    
    eval_set = [(X_test, y_test)]
    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'],
                             n_estimators=params['n_estimators'],
                            booster='gbtree', objective='reg:linear',
                            n_jobs=-1,subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            random_state=0,max_depth=params['max_depth'],
                            gamma=params['gamma'],
                             min_child_weight=params['min_child_weight'],
                            reg_alpha=params['reg_alpha'])
    regressor.fit(X_train,y_train,verbose=True,early_stopping_rounds=10,
                  eval_metric=mape_ln,eval_set=eval_set)
    try:
        os.mkdir("model/")  # 尝试创建相对目录，有则跳过
    except:
        pass
    joblib.dump(regressor, 'model/xgbr.pkl')
    print(regressor)
    try:
        os.mkdir("submission/")  # 尝试创建相对目录，有则跳过
    except:
        pass
    submission(train_feature, regressor,df, 
               'submission/xgbrl.txt','submission/xgbr2.txt',
              'submission/xgbr3.txt','submission/xgbr4.txt')

In [16]:
xgboost_submit(df, submit_params)

[0]	validation_0-rmse:2.02747	validation_0-mape:0.867894
Multiple eval metrics have been passed: 'validation_0-mape' will be used for early stopping.

Will train until validation_0-mape hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.92734	validation_0-mape:0.850712
[2]	validation_0-rmse:1.83231	validation_0-mape:0.83309
[3]	validation_0-rmse:1.74204	validation_0-mape:0.815116
[4]	validation_0-rmse:1.65635	validation_0-mape:0.796798
[5]	validation_0-rmse:1.57575	validation_0-mape:0.777818
[6]	validation_0-rmse:1.49911	validation_0-mape:0.758668
[7]	validation_0-rmse:1.4258	validation_0-mape:0.739614
[8]	validation_0-rmse:1.35624	validation_0-mape:0.720407
[9]	validation_0-rmse:1.29025	validation_0-mape:0.701088
[10]	validation_0-rmse:1.22764	validation_0-mape:0.681732
[11]	validation_0-rmse:1.16884	validation_0-mape:0.662053
[12]	validation_0-rmse:1.11249	validation_0-mape:0.642755
[13]	validation_0-rmse:1.05955	validation_0-mape:0.62329
[14]	validation_0-rmse:1.00937	validation_