In [1]:
import numpy as np
import pandas as pd
import os
import math
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn; seaborn.set()

# Data sets
TRAINING = "./data/train.csv"
TEST = "./data/test.csv"

In [2]:
data = pd.read_csv(TRAINING, dayfirst=True, parse_dates=['date'])
test_data = pd.read_csv(TEST, dayfirst=True, parse_dates=['date'])

In [3]:
data['y-m-d'] = data['date'].dt.date

In [None]:
data

In [4]:
test_data['y-m-d'] = test_data['date'].dt.date

### 增加特征0: 月/日/时

In [5]:
data['year'] = pd.Series([i.year for i in data['date']])

In [6]:
data['month'] = pd.Series([i.month for i in data['date']])

In [7]:
data['day'] = pd.Series([i.day for i in data['date']])

In [8]:
data['hour'] = pd.Series([i.hour for i in data['date']])

In [9]:
test_data['year'] = pd.Series([i.year for i in test_data['date']])
test_data['month'] = pd.Series([i.month for i in test_data['date']])
test_data['day'] = pd.Series([i.day for i in test_data['date']])
test_data['hour'] = pd.Series([i.hour for i in test_data['date']])

In [None]:
data

### 增加特征1： 星期

In [10]:
#指定date为index
data.set_index(['date'], inplace=True)
#train_data.head(2)

In [11]:
test_data.set_index(['date'], inplace=True)
#test_data.head(2)

In [12]:
def add_dayofweek(daily):
    days = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun']
    for i in range(7):
        daily[days[i]] = (daily.index.dayofweek == i).astype(float)

In [13]:
add_dayofweek(data)

In [14]:
add_dayofweek(test_data)

In [None]:
data

### 增加特征2: 天气

In [15]:
weather = pd.read_csv('./data/hongkong.csv', dayfirst=True, parse_dates=['date_time'])
weather.set_index(['date_time'], inplace=True)
# df_weather = pd.read_csv('hongkong.csv', parse_dates=['date_time'], index_col='date_time', date_parser=dateparse)
weather_fea = weather[['tempC','visibility','winddirDegree','windspeedKmph','humidity','cloudcover', 'WindChillC']]

In [None]:
weather.columns

In [18]:
data.set_index('y-m-d', inplace=True)
data = data.join(weather_fea)

In [19]:
test_data.set_index('y-m-d', inplace=True)
test_data = test_data.join(weather_fea)

In [None]:
data

### 增加特征3:  是否为节假日

In [20]:
# HKholidays
HKholidays_2017 = pd.to_datetime(['2017-01-02', '2017-1-28', '2017-1-30', '2017-1-31', \
                                  '2017-4-4', '2017-4-5', '2017-4-15', '2017-4-17',  \
                                  '2017-5-1', '2017-5-3', '2017-5-30', '2017-7-1',   \
                                  '2017-10-2', '2017-10-5', '2017-10-28', '2017-12-25', '2017-12-26',
                                  '2018-01-01', '2018-2-16', '2018-2-17', '2018-2-19', \
                                  '2018-3-30', '2018-3-31', '2018-4-2', '2018-4-5',  \
                                  '2018-5-1', '2018-5-22', '2018-6-18', '2018-7-2',   \
                                  '2018-9-25', '2018-10-1', '2018-10-17', '2018-12-25', '2018-12-26'])

In [21]:
HKholidays_2017 = pd.to_datetime(HKholidays_2017)

In [22]:
HKholidays_2017 = pd.Series(1,index=HKholidays_2017,name='holiday')

In [23]:
data = data.join(HKholidays_2017)

In [24]:
data['holiday'].fillna(0,inplace=True) #以0填充缺失值

In [25]:
#test_data.set_index('y-m-d', inplace=True)
test_data = test_data.join(HKholidays_2017)
test_data['holiday'].fillna(0,inplace=True) #以0填充缺失值

In [None]:
data

## 准备数据

In [31]:
column_names = ['year', 'Mon','Tue','Wed','Thurs','Fri','Sat','Sun','month', 'day', 'hour', 'holiday', 'tempC', 'visibility', 'winddirDegree', 'windspeedKmph','humidity','cloudcover', 'WindChillC']
# column_names = ['Sat','Sun', 'hour', 'holiday']
# column_names = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun','holiday','hour']
# column_names = column_names + hour
X = data[column_names]
X_sub = test_data[column_names]

y = data['speed']
#y_test = test['speed']

###  XGBoost

In [27]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# lt = np.arange(0.06,0.1,0.005)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=1)
# for i in lt:

#X_train, y_train= X, y
model = xgb.XGBRegressor(max_depth=7, learning_rate=0.09, n_estimators=500, objective='reg:squarederror') #MSE
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
print('-'*10)
# print(i)
print('回归树二乘偏差均值:', mean_squared_error(y_test, y_pre))

----------
mse: 2.5995119935658946
回归树二乘偏差均值: 8.402239290272808


In [None]:
print(model.feature_importances_)
from matplotlib import pyplot
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.show()

### 导出数据 

In [28]:
test_data['speed'] = model.predict(X_sub)

In [29]:
test_pre = test_data[['id', 'speed']].set_index('id')

In [None]:
test_pre

In [None]:
test_pre.to_csv('Submission_17th_newwea.csv')  