In [1]:
import feather
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
%%time
df = feather.read_dataframe('/Volumes/transcend/大檔案/Taxi/NYT/nyc_taxi_data_raw_clean_origin.feather')

CPU times: user 9.13 s, sys: 10.4 s, total: 19.5 s
Wall time: 1min 22s


In [3]:
# Encode weekday names as integers (Monday=0 ... Sunday=6), the same
# numbering that `Series.dt.weekday` produces for the test set later on.
weekday_codes = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6,
}
df['weekday'] = df['weekday'].replace(weekday_codes)

In [4]:
# Peek at the first two rows to check the loaded columns and the integer weekday encoding.
df.head(2)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,year,month,weekday,hour
0,4.5,2009-06-15 17:26:00+00:00,-73.844315,40.721317,-73.841614,40.712276,1,1.029579,2009,6,0,17
1,16.9,2010-01-05 16:52:00+00:00,-74.016045,40.711304,-73.979271,40.782005,1,8.443307,2010,1,1,16


In [25]:
# Draw a reproducible random subsample of up to 5,000,000 rows with a fresh 0..n-1 index.
# - `random_state` pins the draw so a kernel restart yields the same rows.
# - `min(...)` keeps `sample` from raising when `df` has fewer rows than requested.
# NOTE(review): the cells below build X/y from `df`, not `df2` — confirm whether
# the subsample was meant to be used for training.
df2 = df.sample(n=min(5000000, len(df)), random_state=42).reset_index(drop=True)

In [4]:
# Target: the fare to be predicted.
y = df['fare_amount']
# Features: every remaining column except the raw timestamp (its parts are
# already present as year/month/weekday/hour columns).
X = df.drop(columns=['fare_amount', 'pickup_datetime'])

### 資料標準化 (Min-Max scaling of `distance`)

In [5]:
from sklearn import preprocessing

# Rescale `distance` with a min-max scaler (default output range [0, 1]).
# The fitted scaler stays in `minmax` so the same mapping can be applied to
# other data later.
# NOTE(review): the scaler is fit on all of X before the train/test split in
# the next cell, so held-out rows influence the learned min/max.
minmax = preprocessing.MinMaxScaler()
X['distance'] = minmax.fit_transform(X[['distance']])[:, 0]

### Training & Testing

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The fixed seed makes the split —
# and therefore every metric computed from it — reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Random Forest Regression

In [10]:
%%time
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train.values, y_train.values)



CPU times: user 9min 9s, sys: 7.89 s, total: 9min 17s
Wall time: 9min 44s


### XGBoost Regression

In [10]:
%%time
from xgboost import XGBRegressor
# Fit a gradient-boosted tree regressor with library-default hyperparameters
# on the training split, passed as plain NumPy arrays (so feature order, not
# column names, is what the model sees).
xgb = XGBRegressor()
xgb.fit(X_train.values, y_train.values)

[23:24:48] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
CPU times: user 3h 29min 10s, sys: 31min 21s, total: 4h 31s
Wall time: 4h 22min 41s


In [11]:
%%time
# Predict fares for the held-out split with the fitted XGBoost model.
# Disabled alternative that would use the random forest instead:
# y_predict_rfr = rfr.predict(X_test)
y_predict_xgb = xgb.predict(X_test.values)

CPU times: user 47.7 s, sys: 7.9 s, total: 55.6 s
Wall time: 1min 1s


In [12]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the XGBoost predictions on the held-out split.
# (To score the random forest instead, pass y_predict_rfr as y_pred.)
mse = mean_squared_error(y_true=y_test.values, y_pred=y_predict_xgb)
print("MSE : ", mse)

MSE :  18.442097


In [13]:
from math import sqrt

# Root mean squared error on the held-out split: the square root of the MSE,
# recomputed here so the cell does not depend on the `mse` variable.
# (To score the random forest instead, pass y_predict_rfr as y_pred.)
holdout_mse = mean_squared_error(y_true=y_test.values, y_pred=y_predict_xgb)
rms = sqrt(holdout_mse)
print("RMSE : ", rms)

RMSE :  4.294426237602071


In [14]:
# Adjusted R-squared of the XGBoost model:
#   adj = R2 - (1 - R2) * p / (n - p - 1)
# with n rows and p feature columns.
# NOTE(review): R2 is scored on the *training* split here, whereas MSE/RMSE
# are computed on the held-out split — confirm this is intentional.
# (Random-forest variant: rfr.score(X_train, y_train).)
n_rows, n_features = X_train.shape
R_2 = xgb.score(X_train.values, y_train.values)
adj_R_2 = R_2 - (1 - R_2) * (n_features / (n_rows - n_features - 1))
print("Adjusted R-squared : ", adj_R_2)

Adjusted R-squared :  0.8035955453841719


### Testing data

In [15]:
# Load the rows to be predicted for the submission file from CSV.
df_test = pd.read_csv('/Volumes/transcend/大檔案/Taxi/NYT/test.csv')

In [16]:
# Number of rows in the test file.
len(df_test)

9914

In [17]:
import geopy.distance

# Parse the pickup timestamp. Removing the literal "UTC" suffix leaves a
# trailing space, so strip it before applying the explicit format.
df_test['pickup_datetime'] = pd.to_datetime(
    df_test['pickup_datetime'].str.replace('UTC', '').str.strip(),
    format='%Y-%m-%d %H:%M:%S',
)

# `vincenty` was removed in geopy 2.0 in favour of `geodesic`; prefer the
# replacement and fall back to `vincenty` only on old installs that lack it.
trip_distance = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty

# Trip distance in km between pickup and dropoff, one value per row.
# Iterating the columns directly avoids a chained `df[col][i]` lookup per row.
# A row whose distance cannot be computed gets NaN and is reported afterwards,
# instead of aborting the loop and leaving every later row without a distance.
dis = []
failed_rows = []
coords = zip(
    df_test['pickup_latitude'], df_test['pickup_longitude'],
    df_test['dropoff_latitude'], df_test['dropoff_longitude'],
)
for i, (p_lat, p_lon, d_lat, d_lon) in enumerate(tqdm_notebook(coords, total=len(df_test))):
    try:
        dis.append(trip_distance((p_lat, p_lon), (d_lat, d_lon)).km)
    except ValueError:
        # geopy signals invalid coordinates (and vincenty non-convergence)
        # with ValueError; anything else is unexpected and should surface.
        dis.append(np.nan)
        failed_rows.append(i)
if failed_rows:
    print("distance could not be computed for row positions:", failed_rows)

# `dis` now has exactly one entry per row, so a plain list assignment is safe
# (pandas raises on a length mismatch rather than silently filling NaN).
df_test['distance'] = dis

# Calendar features; `dt.weekday` uses Monday=0 ... Sunday=6.
df_test['year'] = df_test['pickup_datetime'].dt.year
df_test['month'] = df_test['pickup_datetime'].dt.month
df_test['weekday'] = df_test['pickup_datetime'].dt.weekday
df_test['hour'] = df_test['pickup_datetime'].dt.hour






In [18]:
# Inspect the first two rows to check the added distance/year/month/weekday/hour columns.
df_test.head(2)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,year,month,weekday,hour
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1,2.320991,2015,1,1,13
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1,2.423802,2015,1,1,13


In [18]:
# Keep only the model features. The model is fed `.values`, so the remaining
# columns must be in the same order as the training features.
X_test_real = df_test.drop(['key','pickup_datetime'], axis=1)
# Apply the scaler that was fitted on the training `distance` column.
# Refitting here (`fit_transform`) would learn the test set's own min/max, so
# the same physical distance would map to a different feature value than the
# one the model was trained on — and it would also overwrite the fitted scaler.
X_test_real[['distance']] = minmax.transform(X_test_real[['distance']])

In [19]:
%%time
# Predict fares for the submission rows with the fitted XGBoost model.
# Disabled alternative that would use the random forest instead:
# y_final = rfr.predict(X_test_real)
y_final = xgb.predict(X_test_real.values)

CPU times: user 31.1 ms, sys: 3.42 ms, total: 34.5 ms
Wall time: 35.8 ms


In [20]:
# Two-column submission table: the row key from the test file followed by the
# predicted fare, in that column order.
df_submission = df_test[['key']].assign(fare_amount=y_final)

In [21]:
# Write the submission to CSV in the working directory, omitting the DataFrame index.
df_submission.to_csv('submission4.csv', index = False)