# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 使用並觀察特徵組合, 在計程車費率預測競賽的影響

# [作業重點]
- 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[9], Out[9])
- 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[10], Out[10])

In [1]:
# 程式區塊 A
# 將需要的都import進來
import os
import copy
import time
import math
import numpy             as np
import pandas            as pd
import seaborn           as sns
import datetime          as dt
import warnings
import matplotlib.pyplot as plt
from scipy                   import stats
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.linear_model    import LogisticRegression,LinearRegression
from sklearn.preprocessing   import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score

# 將較長的函式改名一下
MME  = MinMaxScaler()
LE   = LabelEncoder()
LR   = LogisticRegression()
LIR  = LinearRegression()
GBR  = GradientBoostingRegressor()
PDDF = pd.DataFrame()
# 一些必要的設定
warnings.filterwarnings('ignore')
%matplotlib inline

# 設定【data的資料夾路徑】，命名為【data_folder】
data_folder = 'C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data'

In [2]:
# t001_train: full path of the CSV inside data_folder.
# t002_train: the DataFrame read from that path.
t001_train = os.path.join(data_folder, 'taxi_data1.csv')
t002_train = pd.read_csv(filepath_or_buffer=t001_train)
print('Path of read in data: %s' % (t001_train,))
print(t002_train.shape)
t002_train.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\taxi_data1.csv
(5000, 7)


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,12.0,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,6.5,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,6.5,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,11.0,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [3]:
# Separate the target from the features.
# t003_train holds everything except the target column; it is an
# intermediate step towards train_X, not the final feature matrix yet.
t003_train = t002_train.drop(columns=['fare_amount'])
# train_Y is the target column itself.
train_Y = t002_train['fare_amount']
t003_train.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [4]:
# 1. Baseline: score using only the already-numeric columns (5 variables).
# Compare linear regression against gradient boosting.
t004_train = t003_train.drop(columns=['pickup_datetime'])
train_X = MME.fit_transform(t004_train)
print(train_X.shape)
# Mean 5-fold cross-validation score for each estimator, printed in turn.
lir_score = cross_val_score(LIR, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {lir_score}')
gbr_score = cross_val_score(GBR, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbr_score}')

(5000, 5)
Linear Reg Score : 0.009613341943386654
Gradient Boosting Reg Score : 0.6976478043764092


In [5]:
# Decompose the pickup timestamp into separate numeric time features.
# Parse the whole column in one vectorized call rather than a per-row
# strptime; the trailing ' UTC' is matched as literal text by the format.
t003_train['pickup_datetime'] = pd.to_datetime(t003_train['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
# Read each component through the .dt accessor instead of formatting every
# row to a string and parsing it back; the int64 cast keeps the column dtype
# the same as the original strftime -> astype('int64') approach.
t003_train['pickup_year']     = t003_train['pickup_datetime'].dt.year.astype('int64')
t003_train['pickup_month']    = t003_train['pickup_datetime'].dt.month.astype('int64')
t003_train['pickup_day']      = t003_train['pickup_datetime'].dt.day.astype('int64')
t003_train['pickup_hour']     = t003_train['pickup_datetime'].dt.hour.astype('int64')
t003_train['pickup_minute']   = t003_train['pickup_datetime'].dt.minute.astype('int64')
t003_train['pickup_second']   = t003_train['pickup_datetime'].dt.second.astype('int64')
# Optional extra features (day of week, week of year, daily cycle), left disabled:
# t003_train['pickup_dow']      = t003_train['pickup_datetime'].apply(lambda x: dt.datetime.strftime(x, '%w')).astype('int64')
# t003_train['pickup_woy']      = t003_train['pickup_datetime'].apply(lambda x: dt.datetime.strftime(x, '%W')).astype('int64')
# t003_train['day_cycle']       = t003_train['pickup_hour']/12 + t003_train['pickup_minute']/720 + t003_train['pickup_second']/43200
# t003_train['day_cycle']       = t003_train['day_cycle'].map(lambda x:math.sin(x*math.pi))
t003_train.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [6]:
# 2. Score again after adding the 6 time columns to t003 (11 variables total).
# Compare linear regression against gradient boosting.
t004_train = t003_train.drop(columns=['pickup_datetime'])
train_X = MME.fit_transform(t004_train)
print(train_X.shape)
# Mean 5-fold cross-validation score for each estimator, printed in turn.
lir_score = cross_val_score(LIR, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {lir_score}')
gbr_score = cross_val_score(GBR, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbr_score}')

(5000, 11)
Linear Reg Score : 0.026876871475636888
Gradient Boosting Reg Score : 0.711451706345192


In [7]:
# Add three features: longitude difference, latitude difference, and the
# straight-line distance between pickup and dropoff in coordinate space.
lon_delta = t004_train['dropoff_longitude'] - t004_train['pickup_longitude']
lat_delta = t004_train['dropoff_latitude'] - t004_train['pickup_latitude']
t004_train['longitude_diff'] = lon_delta
t004_train['latitude_diff'] = lat_delta
t004_train['distance_2D'] = (lon_delta**2 + lat_delta**2)**0.5
# Preview the new columns next to the coordinates they were derived from.
preview_columns = [
    'distance_2D', 'longitude_diff', 'latitude_diff',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
]
t004_train[preview_columns].head()

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [8]:
# 3. Score again with the 3 extra columns included (14 variables total).
# Result: the score went up.
train_X = MME.fit_transform(t004_train)
print(train_X.shape)
# Mean 5-fold cross-validation score for each estimator, printed in turn.
lir_score = cross_val_score(LIR, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {lir_score}')
gbr_score = cross_val_score(GBR, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbr_score}')

(5000, 14)
Linear Reg Score : 0.027525445787303093
Gradient Boosting Reg Score : 0.8038225106415243


# 作業1
* 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?

In [9]:
# Homework 1: build a distance that accounts for the longitude/latitude
# circle-length ratio. The longitude difference is scaled by the cosine of
# the mean pickup latitude (converted from degrees to radians).
latitude_average = t004_train['pickup_latitude'].mean()
latitude_factor = math.cos(latitude_average/180*math.pi)
scaled_longitude_diff = t004_train['longitude_diff']*latitude_factor
t004_train['distance_real'] = (scaled_longitude_diff**2 + t004_train['latitude_diff']**2)**0.5

# 4. Score again with this 1 extra column included (15 variables total).
# Observed result: the score dropped.
train_X = MME.fit_transform(t004_train)
print(train_X.shape)
# Mean 5-fold cross-validation score for each estimator, printed in turn.
lir_score = cross_val_score(LIR, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {lir_score}')
gbr_score = cross_val_score(GBR, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbr_score}')

(5000, 15)
Linear Reg Score : 0.025381272659436283
Gradient Boosting Reg Score : 0.8022479566546277


# 作業2
* 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?

In [10]:
# 5. Score using only the single engineered column.
only_distance_real = t004_train[['distance_real']]
train_X = MME.fit_transform(only_distance_real)
print(train_X.shape)
# Mean 5-fold cross-validation score for each estimator, printed in turn.
lir_score = cross_val_score(LIR, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {lir_score}')
gbr_score = cross_val_score(GBR, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbr_score}')

(5000, 1)
Linear Reg Score : 0.0014462469864103156
Gradient Boosting Reg Score : 0.7236450666734193


### Day26教材方向和目標
選用【train_X】的理解  
[分數實際的意思](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values)  
[Gradient Boosting的概念](https://www.ycc.idv.tw/confusion-matrix.html)

### Day26忽略部分


### Day26其他補充
這裡面有蠻多重要的統計觀念在裡面，這教材是非戰之罪，  
因為教材不應該還得教大家統計學，只是一個不懂統計學的人來學這個，  
我實在不知道他該如何從完全不會到真的弄得懂。  

今天的課程不斷在嘗試製作【train_X】：
1. 直接將數值化的欄位(5個變數)拿來跑分數
2. 將t003新增的6個欄位(共11個變數)來跑
3. 再新增的3個欄位(共14個變數)來跑
4. 再新增的1個欄位(共15個變數)來跑
5. 只選1個欄位(特徵化之後的欄位)(共1個變數)來跑

分數大小順序分別是：  
A. 2相較於1上升：因為變數一口氣多了6個，所以解釋度變高很合理，但以增加幅度來看，仍不太有意義。  
B. 3相較於2上升：變數多了3個，直接將0.711拉到0.804，可以看出這三個變數加得很不錯  
C. 4相較於3下降：只加了1個欄位，而且這個欄位是由最原始的5個變數其中4個(上下車的經緯度)計算而來，和既有的 distance_2D 幾乎重複，那當然增加這個變數(從14個變成15個)是無效的。  
D. 5相較於4下降：當我們只拿4的這一個新增欄位來看，可以發現只放1個變數時，當然會比放了15個變數的解釋度還低。  
E. 5相較於1上升：只是只用1個欄位(1個變數)時，卻已經比最原始的5個欄位的解釋度還來得高，可見這1個變數仍是有很大的解釋力。  

從上述可發現一些統計學該要有的基礎知識：  
從C，增加變數不一定會增加解釋度，必須看這個變數是不是新的，如果根本是拿舊的欄位來做的，那可以想像成已經解釋過了，沒辦法增加解釋，這是【共線性(線性重合)】的議題。  
從E，可知道妥當處理過的變數可以增加解釋度，縱使概念上是將4個變數合併成1個，但只要處理得夠好，1個變數即能超過4個變數的解釋度。