In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,GridSearchCV



In [2]:
# Dates, written as 'YYYYMMDD' strings, that the `is_public_holiday` feature
# flags with 1. The string form must match the strftime('%Y%m%d') value the
# 'date' column is converted to before the membership lookup.
# NOTE(review): these look like Hong Kong general holidays for 2017-2018 —
# confirm against the data source.
public_vacation_list = [
    # 2017
    '20170102', '20170128', '20170130', '20170131', '20170404', '20170414',
    '20170415', '20170417', '20170501', '20170503', '20170530', '20170701',
    '20171002', '20171005', '20171028', '20171225', '20171226',
    # 2018
    '20180101', '20180216', '20180217', '20180219', '20180330', '20180331',
    '20180402', '20180405', '20180501', '20180522', '20180618', '20180702',
    '20180925', '20181001', '20181017', '20181225', '20181226',
]

In [3]:
# Read the training rows (with the `speed` target) and the test rows from the
# current working directory.
training_data, testing_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [4]:
def preprocess_data(data):
    """Split the raw 'date' strings of ``data`` into a date and an hour.

    Every value in ``data['date']`` is expected to look like
    ``"<date> <hour>:<rest>"``: the text before the first space is parsed as
    the calendar date, and the integer before the first ':' of the second
    token becomes the hour.

    The frame is modified in place: 'date' is overwritten with the parsed
    timestamps and a 'time' column holding the hour is added.

    Returns:
        A ``(date, time)`` tuple of the two resulting columns of ``data``.
    """
    raw_stamps = data['date'].tolist()
    # The date part is everything before the first space ...
    day_parts = [stamp.split(" ")[0] for stamp in raw_stamps]
    # ... and the hour is the leading field of the clock part after it.
    hour_parts = [int(stamp.split(" ")[1].split(':')[0]) for stamp in raw_stamps]
    data['date'] = pd.to_datetime(day_parts)
    data['time'] = hour_parts
    return data['date'], data['time']

In [5]:
# Derive the calendar features of the training frame.
# NOTE: this cell cannot be re-run without reloading train.csv first. Once it
# has run, 'date' holds 'YYYYMMDD' strings with no space-separated clock part,
# which is what preprocess_data needs to extract the hour.
training_data['date'], training_data['time'] = preprocess_data(training_data)
train_dates = training_data['date']  # parsed timestamps, kept for the .dt accessors
training_data['year'] = train_dates.dt.year
training_data['month'] = train_dates.dt.month
training_data['day'] = train_dates.dt.day
training_data['dayofweek'] = train_dates.dt.dayofweek
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement and yields the same ISO-8601 week
# number. It returns the nullable UInt32 dtype, so cast back to plain int64.
training_data["weekofyear"] = train_dates.dt.isocalendar().week.astype('int64')
training_data["quarter"] = train_dates.dt.quarter
training_data["dayofyear"] = train_dates.dt.dayofyear
# Replace the timestamps by 'YYYYMMDD' strings so they can be matched against
# the entries of public_vacation_list.
training_data['date'] = train_dates.dt.strftime('%Y%m%d')
training_data['is_public_holiday'] = (
    training_data['date'].isin(public_vacation_list).astype('int64')
)
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
training_data['is_weekend'] = training_data['dayofweek'].isin([5, 6]).astype('int64')

In [6]:
# Display the training frame (bare last expression -> notebook table output)
# to check the feature columns added to it.
training_data

Unnamed: 0,id,date,speed,time,year,month,day,dayofweek,weekofyear,quarter,dayofyear,is_public_holiday,is_weekend
0,0,20170101,43.002930,0,2017,1,1,6,52,1,1,0,1
1,1,20170101,46.118696,1,2017,1,1,6,52,1,1,0,1
2,2,20170101,44.294158,2,2017,1,1,6,52,1,1,0,1
3,3,20170101,41.067468,3,2017,1,1,6,52,1,1,0,1
4,4,20170101,46.448653,4,2017,1,1,6,52,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14001,14001,20181231,19.865269,12,2018,12,31,0,1,4,365,0,0
14002,14002,20181231,17.820375,15,2018,12,31,0,1,4,365,0,0
14003,14003,20181231,12.501851,16,2018,12,31,0,1,4,365,0,0
14004,14004,20181231,15.979319,18,2018,12,31,0,1,4,365,0,0


In [7]:
# Derive the calendar features of the test frame.
# NOTE: this cell cannot be re-run without reloading test.csv first. Once it
# has run, 'date' holds 'YYYYMMDD' strings with no space-separated clock part,
# which is what preprocess_data needs to extract the hour.
testing_data['date'], testing_data['time'] = preprocess_data(testing_data)
test_dates = testing_data['date']  # parsed timestamps, kept for the .dt accessors
testing_data['year'] = test_dates.dt.year
testing_data['month'] = test_dates.dt.month
testing_data['day'] = test_dates.dt.day
testing_data['dayofweek'] = test_dates.dt.dayofweek
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement and yields the same ISO-8601 week
# number. It returns the nullable UInt32 dtype, so cast back to plain int64.
testing_data["weekofyear"] = test_dates.dt.isocalendar().week.astype('int64')
testing_data["quarter"] = test_dates.dt.quarter
testing_data["dayofyear"] = test_dates.dt.dayofyear
# Replace the timestamps by 'YYYYMMDD' strings so they can be matched against
# the entries of public_vacation_list.
testing_data['date'] = test_dates.dt.strftime('%Y%m%d')
testing_data['is_public_holiday'] = (
    testing_data['date'].isin(public_vacation_list).astype('int64')
)
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
testing_data['is_weekend'] = testing_data['dayofweek'].isin([5, 6]).astype('int64')

In [8]:
# Display the test frame (bare last expression -> notebook table output)
# to check the feature columns added to it.
testing_data

Unnamed: 0,id,date,time,year,month,day,dayofweek,weekofyear,quarter,dayofyear,is_public_holiday,is_weekend
0,0,20180101,2,2018,1,1,0,1,1,1,1,0
1,1,20180101,5,2018,1,1,0,1,1,1,1,0
2,2,20180101,7,2018,1,1,0,1,1,1,1,0
3,3,20180101,8,2018,1,1,0,1,1,1,1,0
4,4,20180101,10,2018,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3499,3499,20181231,17,2018,12,31,0,1,4,365,0,0
3500,3500,20181231,19,2018,12,31,0,1,4,365,0,0
3501,3501,20181231,21,2018,12,31,0,1,4,365,0,0
3502,3502,20181231,22,2018,12,31,0,1,4,365,0,0


In [9]:
# Target first, then the model inputs: 'date' and 'id' are left out of the
# features of both frames, and the training inputs also lose the target.
y_train = training_data["speed"]
x_train = training_data.drop(columns=["speed", "date", "id"])
x_test = testing_data.drop(columns=["date", "id"])

In [10]:
# Plain-text dump of the training inputs, the target and the test inputs,
# in that order.
for frame in (x_train, y_train, x_test):
    print(frame)

       time  year  month  day  dayofweek  weekofyear  quarter  dayofyear  \
0         0  2017      1    1          6          52        1          1   
1         1  2017      1    1          6          52        1          1   
2         2  2017      1    1          6          52        1          1   
3         3  2017      1    1          6          52        1          1   
4         4  2017      1    1          6          52        1          1   
...     ...   ...    ...  ...        ...         ...      ...        ...   
14001    12  2018     12   31          0           1        4        365   
14002    15  2018     12   31          0           1        4        365   
14003    16  2018     12   31          0           1        4        365   
14004    18  2018     12   31          0           1        4        365   
14005    20  2018     12   31          0           1        4        365   

       is_public_holiday  is_weekend  
0                      0           1  
1        

In [11]:
# Hold out 1% of the training rows; they are used as the validation set that
# drives early stopping. random_state pins the shuffle so the split, and
# therefore the reported validation score, is the same on every
# Restart & Run All.
xx_train, xx_test, yy_train, yy_test = train_test_split(
    x_train, y_train, test_size=0.01, random_state=42
)
dtrain = xgb.DMatrix(xx_train, label=yy_train)
dvalid = xgb.DMatrix(xx_test, label=yy_test)
# The test matrix has no label: its speeds are what the model has to predict.
dtest = xgb.DMatrix(x_test)

In [12]:
# Booster parameters for the squared-error regression.
xgb_pars = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.02,
    'min_child_weight': 0.7,
    'max_depth': 10,
    'subsample': 0.87,
    'colsample_bytree': 1,
    'colsample_bylevel': 0.68,
    'reg_alpha': 0.2,
    'gamma': 0.1,
    'reg_lambda': 0.4,
    'nthread': 4,
}

# Both sets are evaluated every round; the training log shows that the last
# entry ('valid') is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Up to 10000 rounds, stopping once the validation metric has not improved
# for 5 consecutive rounds.
model = xgb.train(
    xgb_pars,
    dtrain,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=5,
    maximize=False,
    verbose_eval=1,
)

print('The RMSE is %.5f' % model.best_score)

[0]	train-rmse:34.34394	valid-rmse:33.53677
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 5 rounds.
[1]	train-rmse:33.67528	valid-rmse:32.87363
[2]	train-rmse:33.02036	valid-rmse:32.22152
[3]	train-rmse:32.37791	valid-rmse:31.58157
[4]	train-rmse:31.74960	valid-rmse:30.95390
[5]	train-rmse:31.13421	valid-rmse:30.33897
[6]	train-rmse:30.53024	valid-rmse:29.73877
[7]	train-rmse:29.94003	valid-rmse:29.14782
[8]	train-rmse:29.36306	valid-rmse:28.57567
[9]	train-rmse:28.79662	valid-rmse:28.01127
[10]	train-rmse:28.24188	valid-rmse:27.45878
[11]	train-rmse:27.69867	valid-rmse:26.91768
[12]	train-rmse:27.16680	valid-rmse:26.38553
[13]	train-rmse:26.64505	valid-rmse:25.86325
[14]	train-rmse:26.13431	valid-rmse:25.35114
[15]	train-rmse:25.63303	valid-rmse:24.85022
[16]	train-rmse:25.14405	valid-rmse:24.36367
[17]	train-rmse:24.66407	valid-rmse:23.88411
[18]	train-rmse:24.19389	valid-rmse:23.41102
[19]	train-

In [13]:
# Score the held-out rows. This prints the mean squared error, whereas the
# training log above reports the root of it (rmse).
valid_pred = model.predict(dvalid)
valid_mse = mean_squared_error(y_true=yy_test, y_pred=valid_pred)
print('valid_mse', valid_mse)

valid_mse 10.959809873669908


In [14]:
# Predict the speed of every test row and pair it with the id that row carries
# in test.csv. Reading the id column (instead of numbering the predictions
# 0..n-1) keeps the pairs correct even if the ids are not a contiguous range
# in file order. dtest was built from testing_data, so both are in row order.
y_pred = model.predict(dtest)
assert len(y_pred) == len(testing_data), "one prediction per test row expected"
result = [
    [int(row_id), speed]
    for row_id, speed in zip(testing_data['id'], y_pred)
]

In [15]:
# Turn the [id, speed] pairs into a frame and write it to submit.csv,
# leaving the row index out of the file.
submission_columns = ['id', 'speed']
pd_data = pd.DataFrame(data=result, columns=submission_columns)
pd_data.to_csv('submit.csv', index=False)