In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.neighbors import KNeighborsRegressor
import gc
import tensorflow as tf
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
from keras.models import Sequential,load_model
from keras.layers import LSTM, GRU, Dense, Activation, Dropout ,regularizers
from keras import optimizers,metrics,backend
from sklearn.preprocessing import  StandardScaler
from sklearn.datasets import  make_regression

Using TensorFlow backend.


In [2]:
# Load the engineered feature table. link_ID is read as a string, date and
# time_start are parsed as datetimes, and rows are ordered by link, then time.
train = pd.read_csv(
    './data/feature/train.csv',
    parse_dates=['date', 'time_start'],
    dtype={'link_ID': str},
).sort_values(by=['link_ID', 'time_start'])

In [None]:
# Number of distinct links with at least one non-zero travel_time in month 7.
train.loc[(train.month == 7) & (train.travel_time != 0), 'link_ID'].nunique()

In [3]:
# Dates (yyyymmdd) flagged as holidays. After the left merge every other date
# has no match, so its holiday value is filled with 0.
hol = [
    pd.to_datetime(day, format='%Y%m%d')
    for day in (
        '20170402', '20170403', '20170404',
        '20170429', '20170430', '20170501',
        '20170528', '20170529', '20170530',
        '20170601', '20170308',
    )
]
holiday = pd.DataFrame({'date': hol, 'holiday': np.ones(len(hol))})
train = pd.merge(train, holiday, how='left', on=['date'])
train['holiday'] = train['holiday'].fillna(0)

In [4]:
def data_split(train,test_hour=[8],predict_month=6,start_month=5,feature_clip=False):
    """Split the feature table into training rows and validation rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature table. ``link_ID``, ``time_start_hour`` and ``month`` are
        read; ``missing`` is read only when ``predict_month == 7``.
        The caller's frame is never modified.
    test_hour : list of int
        Hours whose rows in ``predict_month`` become the validation set.
        Rows of each listed hour and of its two neighbours (h-1, h+1) are
        kept. The default list is only read, never mutated.
    predict_month : int
        Month held out for validation; rows of later months are discarded.
    start_month : int
        Earliest month kept.
    feature_clip : bool
        When true, the last 70 columns of ``train`` are dropped first.

    Returns
    -------
    (train_data, val_data) : tuple of pandas.DataFrame
        Both carry an added integer ``link_ID_Encode`` column. The rows of
        ``val_data`` are excluded from ``train_data``.
    """
    train_hour = []
    for hour in test_hour:
        train_hour.extend([hour, hour - 1, hour + 1])
    if predict_month == 7:
        train = train[train.missing == 0]
    if feature_clip:
        train = train.iloc[:, :(len(train.columns) - 70)]
    # Encode link IDs as 0..n-1 in sorted order. ``assign`` builds a new frame,
    # so the caller's DataFrame does not silently gain a column; otherwise a
    # later feature_clip=True call would clip a different set of columns
    # depending on whether this function had been called before.
    link_codes = pd.factorize(train['link_ID'], sort=True)[0]
    train = train.assign(link_ID_Encode=link_codes)
    train_df = train[train.time_start_hour.isin(train_hour)]
    train_df = train_df[train_df.month >= start_month]
    # Forward-fill gaps in row order.
    # NOTE(review): the fill is not grouped by link, so a value can be carried
    # from the last row of one link into the next - confirm this is intended.
    train_df = train_df.ffill()
    is_val = train_df.time_start_hour.isin(test_hour) & (train_df.month == predict_month)
    val_data = train_df[is_val]
    train_data = train_df[train_df.month <= predict_month]
    # Drop the validation rows from the training set by index label.
    train_data = train_data[~train_data.index.isin(val_data.index)]
    return train_data, val_data

In [5]:
def data_cluster(train_data,val_data,cluster_num,test_hour,is_log=False,is_cluster=True):
    """Convert the split frames into arrays for model fitting.

    When ``is_cluster`` is true, both frames are first restricted to rows whose
    ``c<test_hour>_cluster`` column equals ``cluster_num``. The identifier and
    target columns are then removed to form the feature matrices.

    Returns ``(train_X, val_X, train_y, val_y, col_list)``: the feature
    matrices as arrays, the training target taken from ``travel_time``, the
    validation target taken from ``true_label``, and the feature column names
    (from the training frame). With ``is_log`` both targets are passed through
    ``np.log1p``.
    """
    non_feature_cols = ['link_ID','date','time_start','travel_time','true_label']
    if is_cluster:
        cluster_col = "c{0}_cluster".format(test_hour)
        train_data = train_data[train_data[cluster_col] == cluster_num]
        val_data = val_data[val_data[cluster_col] == cluster_num]
    train_features = train_data.drop(non_feature_cols, axis=1)
    val_features = val_data.drop(non_feature_cols, axis=1)
    train_y = train_data.travel_time.values
    val_y = val_data.true_label.values
    if is_log:
        train_y, val_y = np.log1p(train_y), np.log1p(val_y)
    return (train_features.values, val_features.values,
            train_y, val_y, train_features.columns.tolist())

In [6]:
def mape(x,y):
    """Mean absolute percentage error of x relative to y: mean(|x / y - 1|)."""
    relative_error = x / y - 1
    return np.fabs(relative_error).mean()

In [7]:
def mape_eval(pred,dtrain):
    """Custom evaluation function returning (name, value, is_higher_better).

    The value is mean(|pred - label| / label), with the labels taken from
    ``dtrain.get_label()``. The trailing False marks lower values as better.
    """
    truth = dtrain.get_label()
    relative_error = np.abs(pred - truth) / truth
    return 'mape', relative_error.mean(), False

In [8]:
def mape_eval_xgb(pred,dtrain):
    """Custom evaluation function returning (name, value).

    The value is mean(|pred - label| / label), with the labels taken from
    ``dtrain.get_label()``.
    """
    truth = dtrain.get_label()
    relative_error = np.abs(pred - truth) / truth
    return 'mape', relative_error.mean()

In [9]:
def WriteTxt(dir, x, delimiter=" "):
    """Write a 2-D array-like to a text file, one row per line.

    Parameters
    ----------
    dir : str
        Path of the file to create or overwrite.
    x : 2-D array-like
        Rows to write. Every value is converted with ``str``.
    delimiter : str
        Separator placed between the values of a row.

    Rows are separated by a newline and no newline follows the last row.
    Empty input produces an empty file.
    """
    # Built-in iteration replaces np.alen, which current NumPy no longer provides.
    lines = (delimiter.join(str(value) for value in row) for row in x)
    # ``with`` closes the handle even if converting or writing a value fails.
    with open(dir, "w") as text_file:
        text_file.write("\n".join(lines))

In [10]:
# Hold out hour 15 of the default predict_month (6); keep months >= 5 and all
# feature columns.
train_data, val_data = data_split(train, test_hour=[15], start_month=5, feature_clip=False)
# is_cluster=False keeps every row, so the cluster number (2) has no effect here.
train_data, val_data, train_y, val_y, col_list = data_cluster(
    train_data, val_data, cluster_num=2, test_hour=15, is_cluster=False)

In [None]:
# LightGBM datasets for the full-feature split. Both declare link_ID_Encode as
# categorical and keep the raw arrays (free_raw_data=False).
dtrain = lgb.Dataset(
    train_data, label=train_y, feature_name=col_list,
    categorical_feature=['link_ID_Encode'], free_raw_data=False)
dval = lgb.Dataset(
    val_data, label=val_y, feature_name=col_list,
    categorical_feature=['link_ID_Encode'], free_raw_data=False)

In [None]:
# DART model 1 on the full feature set, evaluated on dval with the custom MAPE.
# NOTE(review): newer LightGBM releases disable early stopping for
# boosting='dart' - confirm best_iteration is meaningful on the installed version.
param_dart1 = dict(
    num_leaves=64, learning_rate=0.0025, objective='regression_l1',
    num_threads=32, feature_fraction=0.6, bagging_fraction=0.6, gaussian_eta=1,
    bagging_freq=1, max_bin=25, lambda_l1=10, boosting='dart', device='cpu',
    min_data_in_leaf=150, skip_drop=0.5, max_drop=50, xgboost_dart_mode=False,
    drop_rate=0.3)
dart1 = lgb.train(
    param_dart1, dtrain, num_boost_round=1500, valid_sets=[dval],
    feval=mape_eval, early_stopping_rounds=10, verbose_eval=True)

In [None]:
# DART model 2: larger trees and a higher learning rate than dart1.
# NOTE(review): newer LightGBM releases disable early stopping for
# boosting='dart' - confirm best_iteration is meaningful on the installed version.
param_dart2 = dict(
    num_leaves=128, learning_rate=0.005, objective='regression_l1',
    num_threads=32, feature_fraction=0.9, bagging_fraction=0.85, gaussian_eta=1,
    bagging_freq=1, max_bin=20, lambda_l1=10, boosting='dart', device='cpu',
    min_data_in_leaf=100)
dart2 = lgb.train(
    param_dart2, dtrain, num_boost_round=1500, valid_sets=[dval],
    feval=mape_eval, early_stopping_rounds=20, verbose_eval=True)

In [None]:
# GOSS model on the full feature set, evaluated on dval with the custom MAPE.
param_goss = dict(
    num_leaves=192, learning_rate=0.003, objective='regression_l1',
    num_threads=32, feature_fraction=0.7, bagging_fraction=0.7, gaussian_eta=1,
    max_bin=25, lambda_l1=10, boosting='goss', device='cpu',
    min_data_in_leaf=150)
goss = lgb.train(
    param_goss, dtrain, num_boost_round=1500, valid_sets=[dval],
    feval=mape_eval, early_stopping_rounds=20, verbose_eval=True)

In [None]:
# GBDT model on the full feature set, evaluated on dval with the custom MAPE.
# 'num_trees' is deliberately NOT part of the params: LightGBM treats it as an
# alias of the number of boosting iterations, so keeping it in the dict competes
# with the explicit num_boost_round passed to lgb.train - both the 2500 used
# here and any round count passed when these params are reused for a retrain.
param_gbdt = {'num_leaves':64, 'objective':'regression_l1','learning_rate':0.005,'num_threads':32,
        'feature_fraction':0.8,'bagging_fraction':0.8,'bagging_freq':10,'max_bin':20,'gaussian_eta':1.1,
        'lambda_l2':3,'boosting':'gbdt','device':'cpu'}
gbdt = lgb.train(param_gbdt, dtrain, feval = mape_eval,valid_sets=[dval],verbose_eval=True,num_boost_round=2500,early_stopping_rounds=20)

In [None]:
# Second split: months >= 4 with the last 70 columns clipped (feature_clip=True).
train_data1, val_data1 = data_split(train, test_hour=[15], start_month=4, feature_clip=True)
train_data1, val_data1, train_y1, val_y1, col_list1 = data_cluster(
    train_data1, val_data1, cluster_num=2, test_hour=15, is_cluster=False)
# NOTE(review): unlike dtrain/dval, no categorical_feature is declared for these
# datasets - confirm that is intentional.
dtrain1 = lgb.Dataset(train_data1, label=train_y1, feature_name=col_list1, free_raw_data=False)
dval1 = lgb.Dataset(val_data1, label=val_y1, feature_name=col_list1, free_raw_data=False)

In [None]:
# The dart1 / dart2 / goss counterparts trained on the clipped-feature datasets.
# NOTE(review): newer LightGBM releases disable early stopping for
# boosting='dart' - confirm best_iteration is meaningful on the installed version.
param_dart1_ = dict(
    num_leaves=96, learning_rate=0.0025, objective='regression_l1',
    num_threads=32, feature_fraction=0.6, bagging_fraction=0.75, gaussian_eta=1,
    bagging_freq=2, max_bin=20, lambda_l2=3, boosting='dart', device='cpu',
    min_data_in_leaf=100, skip_drop=0.5, max_drop=80, xgboost_dart_mode=False,
    drop_rate=0.35)
dart1_ = lgb.train(
    param_dart1_, dtrain1, num_boost_round=1500, valid_sets=[dval1],
    feval=mape_eval, early_stopping_rounds=15, verbose_eval=True)

param_dart2_ = dict(
    num_leaves=192, learning_rate=0.004, objective='regression_l1',
    num_threads=32, feature_fraction=0.95, bagging_fraction=0.85, gaussian_eta=1,
    bagging_freq=1, max_bin=20, lambda_l2=3, boosting='dart', device='cpu',
    min_data_in_leaf=100, boost_from_average=True)
dart2_ = lgb.train(
    param_dart2_, dtrain1, num_boost_round=1500, valid_sets=[dval1],
    feval=mape_eval, early_stopping_rounds=20, verbose_eval=True)

param_goss_ = dict(
    num_leaves=160, learning_rate=0.005, objective='regression_l1',
    num_threads=32, feature_fraction=0.8, bagging_fraction=0.8, gaussian_eta=1,
    max_bin=25, lambda_l1=10, boosting='goss', device='cpu',
    min_data_in_leaf=150)
goss_ = lgb.train(
    param_goss_, dtrain1, num_boost_round=1500, valid_sets=[dval1],
    feval=mape_eval, early_stopping_rounds=5, verbose_eval=True)

# GBDT model on the clipped-feature datasets.
# 'num_trees' is deliberately NOT part of the params: LightGBM treats it as an
# alias of the number of boosting iterations, so keeping it in the dict competes
# with the explicit num_boost_round passed to lgb.train - both the 2500 used
# here and any round count passed when these params are reused for a retrain.
param_gbdt_ = {'num_leaves':80, 'objective':'regression_l1','learning_rate':0.005,'num_threads':32,
        'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':10,'max_bin':20,'gaussian_eta':1.1,
        'lambda_l2':3,'boosting':'gbdt','device':'cpu'}
gbdt_ = lgb.train(param_gbdt_, dtrain1, feval = mape_eval,valid_sets=[dval1],verbose_eval=True,num_boost_round=2500,early_stopping_rounds=5)

In [None]:
# Retrain each model without a validation set for the number of rounds found by
# its early-stopping run. Each name is rebound to the retrained booster, so this
# cell reads best_iteration from the previous binding and is not meant to be
# executed twice in a row.
dart1 = lgb.train(param_dart1, dtrain, num_boost_round=dart1.best_iteration)
dart2 = lgb.train(param_dart2, dtrain, num_boost_round=dart2.best_iteration)
gbdt = lgb.train(param_gbdt, dtrain, num_boost_round=gbdt.best_iteration)
goss = lgb.train(param_goss, dtrain, num_boost_round=goss.best_iteration)
dart1_ = lgb.train(param_dart1_, dtrain1, num_boost_round=dart1_.best_iteration)
dart2_ = lgb.train(param_dart2_, dtrain1, num_boost_round=dart2_.best_iteration)
goss_ = lgb.train(param_goss_, dtrain1, num_boost_round=goss_.best_iteration)
gbdt_ = lgb.train(param_gbdt_, dtrain1, num_boost_round=gbdt_.best_iteration)
# e1 holds the full-feature models, e2 the clipped-feature models.
e1 = [dart1, dart2, goss, gbdt]
e2 = [dart1_, dart2_, goss_, gbdt_]
# Predictions on the matching validation matrix, e1 models first, then e2.
p = [clf.predict(dval.data) for clf in e1] + [clf.predict(dval1.data) for clf in e2]
# Earlier weighting experiments, kept for reference:
#w = np.array([0.292,0.29,0.327,0.325,0.3,0.294,0.322,0.321])
#w = np.exp(np.ones(8)/w)
#w = (w/w.sum()).reshape((1,-1))
#mape(s,dval.label)
#w = np.array([0.23,0.22,0.05,0.05,0.15,0.2,0.05,0.05]).reshape((1,-1))

In [None]:
# Blend weights as a (1, 8) row vector, one weight per model.
w = np.array([[0.20, 0.4, 0.0, 0.0, 0.1, 0.3, 0.0, 0.0]])

In [None]:
# Stack the per-model prediction vectors into a (n_models, n_rows) matrix.
# The row count comes from len(p) instead of a hard-coded 8, so a different
# number of models produces a shape error in the weighted dot product rather
# than a silently scrambled matrix.
p1 = np.concatenate(p).reshape((len(p),-1))

In [None]:
# Weighted blend of the stacked model predictions (matrix product of w and p1).
p2 = np.dot(w,p1)

In [None]:
# Validation MAPE of every individual model, in the order stored in p.
# The function-call form of print runs on both Python 2 and Python 3.
for i in p:
    print(mape(i,dval.label))

In [None]:
# MAPE of the blended prediction p2 against the labels stored on dval.
mape(p2,dval.label)

In [11]:
# Standardise every feature using statistics learned from the training rows only.
# StandardScaler fits each column independently, so a single fit over the whole
# matrix replaces the per-column loop. The scaled matrices are bound to the names
# instead of being written into the existing arrays.
# NOTE(review): dtrain/dval were created from these arrays with
# free_raw_data=False and presumably still reference them - the in-place version
# would have changed what dval.data returns; confirm.
sc = StandardScaler()
train_data = sc.fit_transform(train_data)
val_data = sc.transform(val_data)

In [12]:
def build_model(timesteps):
    """Build and compile a fully connected regression network.

    ``timesteps`` is the number of input features per sample (the input_dim of
    the first layer). The network is Dense 128 -> 192 -> 256, each with ReLU and
    followed by dropout (0.3, 0.3, 0.4), then a single ReLU output unit. It is
    compiled with Adam (lr=0.001) and mean absolute percentage error as both
    loss and metric. Returns the compiled ``Sequential`` model.
    """
    hidden_spec = [(128, 0.3), (192, 0.3), (256, 0.4)]  # (units, dropout rate)
    model = Sequential()
    fan_in = timesteps
    for units, drop_rate in hidden_spec:
        model.add(Dense(input_dim=fan_in, activation='relu', units=units))
        model.add(Dropout(drop_rate))
        fan_in = units
    model.add(Dense(input_dim=fan_in, units=1, activation='relu'))
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='mean_absolute_percentage_error', optimizer=adam, metrics=[metrics.mape])
    return model

In [13]:
# One input per feature column of the training matrix.
model = build_model(train_data.shape[1])

In [13]:
# Ten epochs of shuffled mini-batch training; no validation data is passed here.
model.fit(x=train_data, y=train_y, batch_size=64, epochs=10, shuffle=True, verbose=1)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f272bb58350>

In [None]:
# Validation MAPE of the network; predictions are flattened to 1-D so they line
# up element-wise with val_y.
pred = model.predict(val_data).flatten()
mape(pred, val_y)

In [None]:
# Train for a fixed 265 rounds (third positional argument of lgb.train).
# NOTE(review): param_dart and dtrain_all are not defined anywhere in the visible
# notebook - presumably left over from another session; confirm before re-running.
dart_ = lgb.train(param_dart,dtrain_all,265)

In [None]:
# Train with param_gbdt for a requested 415 rounds; this rebinds gbdt_, which
# earlier held the clipped-feature GBDT model.
# NOTE(review): dtrain_all is not defined anywhere in the visible notebook -
# confirm where it comes from.
gbdt_ = lgb.train(param_gbdt,dtrain_all,415)

In [None]:
# Predict, apply expm1 and round to 3 decimals.
# NOTE(review): expm1 presumably undoes a log1p applied to the training target
# (is_log=True in data_cluster) - confirm the model was trained that way.
# NOTE(review): dtest is not defined anywhere in the visible notebook.
pred1 = np.around(np.expm1(gbdt_.predict(dtest)),3)

In [None]:
# Round the predictions to 3 decimals, reshape them to (132, 31, 30) and save.
# NOTE(review): `dart` and `may_dtest` are not defined anywhere in the visible
# notebook - confirm where they come from before re-running.
may_pred1 = np.around(dart.predict(may_dtest), 3).reshape((132, 31, 30))
np.save('./may_pred.npy', may_pred1)

In [None]:
# Re-order an earlier submission by link (position of the link in link_info),
# then date and time_slice, and store the travel times as a (132, 30, 30) array.
ans = pd.read_csv('./submission/t_neo_20170811_dart_.txt', sep='#', dtype={'link_ID': str})
link_info = pd.read_csv('./data/gy_contest_link_info.txt', sep=';', dtype={'link_ID': str})
# Map each of the first 132 link IDs to its row position in link_info.
sorter = {link_info['link_ID'].values[i]: i for i in range(132)}
ans['rank'] = ans['link_ID'].map(sorter)
ans = ans.sort_values(['rank', 'date', 'time_slice'])
ans_pred = ans['travel_time'].values.reshape((132, 30, 30))
np.save('./june_pred.npy', ans_pred)

In [None]:
# Predict with dart_, apply expm1 and round to 3 decimals.
# NOTE(review): expm1 presumably undoes a log1p applied to the training target -
# confirm. dtest is not defined anywhere in the visible notebook.
pred2 = np.around(np.expm1(dart_.predict(dtest)),3)
# Disabled blend of the two models' predictions:
#pred = pred2*0.70 + (1-0.70)*pred1

In [None]:
# Attach the predictions to the test frame.
# NOTE(review): test_data is not defined anywhere in the visible notebook, and
# its row order is assumed to match pred2 - confirm.
test_data['travel_time'] = pred2

In [None]:
# Build the submission columns: a "[start,end)" interval string for each
# 2-minute slot and the date formatted as YYYY-MM-DD.
test_data['time_end'] = test_data['time_start'] + pd.Timedelta('2 minute')
# The pieces are concatenated on test_data's own index. Chaining Series.str.cat
# from freshly built pd.Series([...]) objects pairs rows by index label on
# current pandas, which mismatches whenever test_data does not carry a default
# 0..n-1 index.
test_data['time_interval'] = (
    '[' + test_data['time_start'].astype('str')
    + ',' + test_data['time_end'].astype('str') + ')'
).values
test_data['date'] = test_data['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
# Keep only the four submission columns, in output order.
test_data = test_data.loc[:, ['link_ID', 'date', 'time_interval', 'travel_time']]

In [None]:
# Snapshot of the formatted predictions, presumably for test hour 8.
# NOTE(review): pred_8, pred_15 and pred_18 all copy the same test_data, so the
# upstream cells are presumably re-run by hand for each hour in between - a
# straight top-to-bottom run would store three identical frames; confirm.
pred_8 = test_data.copy()

In [None]:
# Snapshot of the formatted predictions, presumably for test hour 15.
# NOTE(review): copies whatever test_data currently holds - confirm the upstream
# cells were re-run for this hour first.
pred_15 = test_data.copy()

In [None]:
# Snapshot of the formatted predictions, presumably for test hour 18.
# NOTE(review): copies whatever test_data currently holds - confirm the upstream
# cells were re-run for this hour first.
pred_18 = test_data.copy()

In [None]:
# Stack the three per-hour frames in order, keeping their original index labels.
# pd.concat replaces the chained DataFrame.append calls, which pandas 2.0 removed.
x = pd.concat([pred_8, pred_15, pred_18])

In [None]:
# Write the combined predictions as '#'-separated rows (WriteTxt emits no header line).
WriteTxt('./submission/t_neo_20170908_dart_.txt',x.values,'#')