In [1]:
import numpy as np
import pandas as pd
from datetime import date
from workalendar.asia import HongKong 

# Data preprocessing - train

In [2]:
# Load the raw training table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Preview the first five rows of the raw training data.
df.head(n=5)

Unnamed: 0,id,date,speed
0,0,1/1/2017 0:00,43.00293
1,1,1/1/2017 1:00,46.118696
2,2,1/1/2017 2:00,44.294158
3,3,1/1/2017 3:00,41.067468
4,4,1/1/2017 4:00,46.448653


In [4]:
# Split each timestamp string (e.g. "1/1/2017 0:00") at the space into its
# calendar part ('date') and its clock part ('h').
temp = pd.DataFrame(
    [stamp.split(" ") for stamp in df.date],
    columns=['date', 'h'],
)

In [5]:
# Keep only the hour: the text in front of the first ":" of the clock part.
temp.h = [clock_text.split(":")[0] for clock_text in temp.h]

In [6]:
# Break the calendar part at "/" into three fields, named d, m and y in that order.
temp_1 = pd.DataFrame(
    [day_text.split("/") for day_text in temp.date],
    columns=['d', 'm', 'y'],
)

In [7]:
# Swap the combined date string for its separate d/m/y columns, then
# release the helper frame.
temp = pd.concat([temp.drop(columns=['date']), temp_1], axis=1)
del temp_1

In [8]:
# The split pieces are still strings; cast every column to int.
temp = temp.astype(dtype=int)
temp.head(n=5)

Unnamed: 0,h,d,m,y
0,0,1,1,2017
1,1,1,1,2017
2,2,1,1,2017
3,3,1,1,2017
4,4,1,1,2017


In [9]:
# Replace the raw date string in df with the parsed h/d/m/y columns.
df = pd.concat([df.drop(columns=['date']), temp], axis=1)
del temp
df.head(n=5)

Unnamed: 0,id,speed,h,d,m,y
0,0,43.00293,0,1,1,2017
1,1,46.118696,1,1,1,2017
2,2,44.294158,2,1,1,2017
3,3,41.067468,3,1,1,2017
4,4,46.448653,4,1,1,2017


In [10]:
# Hong Kong calendar from workalendar; queried via cal.is_working_day(...)
# when the working-day feature is built.
cal = HongKong()

In [11]:
# Calendar features for the training rows.
#
# Build one datetime.date per row from the y/m/d columns. Iterating the
# columns positionally avoids label lookups such as df.y[idx], which only
# work while the frame still has its default 0..n-1 index.
row_dates = [date(int(y), int(m), int(d)) for y, m, d in zip(df.y, df.m, df.d)]

# Many rows share the same calendar date (the data is per-hour), so ask the
# holiday calendar once per distinct date instead of once per row.
working_by_date = {
    day: (1 if cal.is_working_day(day) else 0)
    for day in set(row_dates)
}

# 1 = working day, 0 = weekend or holiday according to `cal`.
is_working_day = [working_by_date[day] for day in row_dates]

# ISO weekday number: 1 = Monday ... 7 = Sunday (same value as isocalendar()[2]).
day_of_week = [day.isoweekday() for day in row_dates]

In [12]:
# Attach the two calendar features as new columns (day_of_week first).
df = df.assign(day_of_week=day_of_week, is_working_day=is_working_day)

In [13]:
# Preview the training frame with the calendar features attached.
df.head(n=5)

Unnamed: 0,id,speed,h,d,m,y,day_of_week,is_working_day
0,0,43.00293,0,1,1,2017,7,0
1,1,46.118696,1,1,1,2017,7,0
2,2,44.294158,2,1,1,2017,7,0
3,3,41.067468,3,1,1,2017,7,0
4,4,46.448653,4,1,1,2017,7,0


# Data preprocessing - test

In [14]:
# Load the rows to predict; the path is relative to the working directory.
df_test = pd.read_csv(filepath_or_buffer="test.csv")
df_test.head(n=5)

Unnamed: 0,id,date
0,0,1/1/2018 2:00
1,1,1/1/2018 5:00
2,2,1/1/2018 7:00
3,3,1/1/2018 8:00
4,4,1/1/2018 10:00


In [15]:
# Parse every test timestamp (e.g. "1/1/2018 2:00") into hour / d / m / y
# text fields: split at the space, then the calendar part at "/" and the
# clock part at ":" (keeping only the hour).
parsed_rows = []
for stamp in df_test.date:
    day_text, clock_text = stamp.split(" ")
    d_text, m_text, y_text = day_text.split("/")
    hour_text = clock_text.split(":")[0]
    parsed_rows.append([hour_text, d_text, m_text, y_text])

# Column order is h, d, m, y; the text fields are cast to int in one step.
temp = pd.DataFrame(parsed_rows, columns=['h', 'd', 'm', 'y']).astype(int)
temp.head()

Unnamed: 0,h,d,m,y
0,2,1,1,2018
1,5,1,1,2018
2,7,1,1,2018
3,8,1,1,2018
4,10,1,1,2018


In [16]:
# Swap the raw date string in df_test for the parsed h/d/m/y columns.
df_test = pd.concat([df_test.drop(columns=['date']), temp], axis=1)
del temp

df_test.head(n=5)

Unnamed: 0,id,h,d,m,y
0,0,2,1,1,2018
1,1,5,1,1,2018
2,2,7,1,1,2018
3,3,8,1,1,2018
4,4,10,1,1,2018


In [17]:
# Calendar features for the test rows.
#
# Build one datetime.date per row from the y/m/d columns. Iterating the
# columns positionally avoids label lookups such as df_test.y[idx], which
# only work while the frame still has its default 0..n-1 index.
test_dates = [
    date(int(y), int(m), int(d))
    for y, m, d in zip(df_test.y, df_test.m, df_test.d)
]

# Several rows can share a calendar date, so ask the holiday calendar once
# per distinct date instead of once per row.
working_by_date = {
    day: (1 if cal.is_working_day(day) else 0)
    for day in set(test_dates)
}

# 1 = working day, 0 = weekend or holiday according to `cal`.
is_working_day = [working_by_date[day] for day in test_dates]

# ISO weekday number: 1 = Monday ... 7 = Sunday (same value as isocalendar()[2]).
day_of_week = [day.isoweekday() for day in test_dates]

df_test['day_of_week'] = day_of_week
df_test["is_working_day"] = is_working_day

df_test.head()

Unnamed: 0,id,h,d,m,y,day_of_week,is_working_day
0,0,2,1,1,2018,1,0
1,1,5,1,1,2018,1,0
2,2,7,1,1,2018,1,0
3,3,8,1,1,2018,1,0
4,4,10,1,1,2018,1,0


# Model

In [18]:
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

In [19]:
# Features are every column except the target; the target is `speed`.
# NOTE(review): `id` stays in X as a feature. It looks like a row counter, and
# its values restart at 0 in test.csv, so it may not carry the same meaning at
# prediction time. Consider dropping it from X and from the frame passed to
# predict() together.
X = df.drop(columns=['speed'])
y = df['speed']

In [20]:
# Hold out a random 20% of the training rows for validation (fixed seed).
# X_test / y_test are this hold-out split of train.csv, not the rows in df_test.
# NOTE(review): the rows are hourly readings, so a random split puts neighbouring
# hours on both sides; a time-ordered split may give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
from sklearn.metrics import mean_squared_error as MSE

## XGBoost

In [22]:
import xgboost as xg
from scipy import stats

In [23]:
# Squared-error XGBoost regressor. Hyper-parameters are collected in one dict
# so they are easy to read and tweak.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.05,
    seed=123,
)
xgb_r = xg.XGBRegressor(**xgb_params)

In [24]:
# Fit on the training split, monitoring the hold-out split passed as eval_set;
# early_stopping_rounds=10 is the patience for that monitoring.
# NOTE(review): the same hold-out split is also used to compute the reported
# RMSE, so that score may be optimistic.
xgb_r.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=False,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [25]:
# Predict speeds for the hold-out rows produced by train_test_split.
pred = xgb_r.predict(X_test)

In [26]:
# Root-mean-squared error of the hold-out predictions.
holdout_mse = MSE(y_test, pred)
rmse = np.sqrt(holdout_mse)
print("RMSE : % f" % rmse)

RMSE :  3.070653


# Create submission CSV

In [27]:
# Predict on exactly the columns the model was fitted on, in the same order,
# rather than relying on df_test happening to have an identical layout.
# A missing feature column fails here with a clear KeyError.
test_features = df_test[X_train.columns]
test_pred = xgb_r.predict(test_features)

# One row per test id with its predicted speed.
sub_df = pd.DataFrame({"id": df_test.id.values, "speed": test_pred})

# index=False keeps the pandas row index out of the file.
# NOTE(review): any existing file with this name is overwritten.
sub_df.to_csv("sampleSubmission.csv", index=False)