In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
try:
	os.chdir(os.path.join(os.getcwd(), '..'))
	print(os.getcwd())
except:
	pass


 # [範例重點]
 - 增加精度差與緯度差兩個特徵, 觀察線性迴歸與梯度提升樹的預測結果有什麼影響 (In[4], Out[4], In[5], Out[5])
 - 再增加座標距離特徵, 觀察線性迴歸與梯度提升樹的預測結果有什麼影響 (In[6], Out[6], In[7], Out[7])

 # [教學目標]
 - 使用並觀察特徵組合, 在計程車費率預測競賽的影響

In [28]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [29]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [30]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475640173


Gradient Boosting Reg Score : 0.7113004047116129


In [31]:
# 增加緯度差, 經度差兩個特徵
df['longitude_diff'] = df['dropoff_longitude'] - df['pickup_longitude']
df['latitude_diff'] = df['dropoff_latitude'] - df['pickup_latitude']
df[['longitude_diff', 'latitude_diff', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].head()



Unnamed: 0,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [32]:
# 結果 : 光是用經緯度差, 準確度就有巨幅上升
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.026777745780496964


Gradient Boosting Reg Score : 0.7979911220465687


In [33]:
# 增加座標距離特徵
df['distance_2D'] = (df['longitude_diff']**2 + df['latitude_diff']**2)**0.5
df[['distance_2D', 'longitude_diff', 'latitude_diff']].head()



Unnamed: 0,distance_2D,longitude_diff,latitude_diff
0,0.009761,0.009452,-0.002437
1,0.018307,-0.001244,0.018265
2,0.00814,0.003756,-0.007222
3,0.021056,0.019292,-0.008437
4,0.032964,0.007193,0.03217


In [34]:
# 結果 : 加上座標距離後, 準確度再度上升(包含線性迴歸)
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.027473522287347694


Gradient Boosting Reg Score : 0.8052892864840526


In [35]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
df.apply(lambda x: print(x))
"""
Your Code Here, set new character at df['distance_real']
"""

# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


0      -73.990580
1      -73.988403
2      -74.015785
3      -73.977322
4      -73.989683
5      -73.997631
6      -73.954743
7      -73.982279
8      -73.965039
9      -73.986570
10     -73.953742
11     -73.982057
12     -73.983368
13     -74.005307
14     -73.970473
15     -73.951788
16     -73.999353
17     -74.001532
18     -73.966994
19     -73.967032
20     -73.988503
21     -73.958500
22     -73.966323
23     -73.989962
24     -73.958430
25     -73.979752
26     -73.994117
27     -73.980308
28     -73.980436
29     -73.951540
          ...    
4970   -73.982535
4971   -73.994720
4972   -74.007568
4973   -73.988347
4974   -74.003000
4975   -73.982075
4976   -73.853805
4977   -73.991290
4978   -74.007162
4979   -73.977730
4980   -73.967935
4981   -73.961487
4982   -73.863480
4983   -73.957769
4984   -73.991764
4985   -74.007777
4986   -73.996112
4987   -73.975698
4988   -73.990323
4989   -73.965067
4990   -73.975028
4991   -73.991309
4992   -73.957620
4993   -73.973512
4994   -73

  return self.partial_fit(X, y)


In [36]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
df.apply(lambda x: print(x),axis = 0)
"""
Your Code Here, set new character at df['distance_real']
"""

# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


0      -73.990580
1      -73.988403
2      -74.015785
3      -73.977322
4      -73.989683
5      -73.997631
6      -73.954743
7      -73.982279
8      -73.965039
9      -73.986570
10     -73.953742
11     -73.982057
12     -73.983368
13     -74.005307
14     -73.970473
15     -73.951788
16     -73.999353
17     -74.001532
18     -73.966994
19     -73.967032
20     -73.988503
21     -73.958500
22     -73.966323
23     -73.989962
24     -73.958430
25     -73.979752
26     -73.994117
27     -73.980308
28     -73.980436
29     -73.951540
          ...    
4970   -73.982535
4971   -73.994720
4972   -74.007568
4973   -73.988347
4974   -74.003000
4975   -73.982075
4976   -73.853805
4977   -73.991290
4978   -74.007162
4979   -73.977730
4980   -73.967935
4981   -73.961487
4982   -73.863480
4983   -73.957769
4984   -73.991764
4985   -74.007777
4986   -73.996112
4987   -73.975698
4988   -73.990323
4989   -73.965067
4990   -73.975028
4991   -73.991309
4992   -73.957620
4993   -73.973512
4994   -73

  return self.partial_fit(X, y)


In [37]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
df.apply(lambda x: print(x),axis = 1)
"""
Your Code Here, set new character at df['distance_real']
"""

# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


pickup_longitude      -73.990580
pickup_latitude        40.761071
dropoff_longitude     -73.981128
dropoff_latitude       40.758634
passenger_count         2.000000
pickup_year          2011.000000
pickup_month           10.000000
pickup_day             21.000000
pickup_hour            23.000000
pickup_minute          54.000000
pickup_second          10.000000
longitude_diff          0.009452
latitude_diff          -0.002437
distance_2D             0.009761
Name: 0, dtype: float64
pickup_longitude      -73.988403
pickup_latitude        40.723431
dropoff_longitude     -73.989647
dropoff_latitude       40.741695
passenger_count         1.000000
pickup_year          2015.000000
pickup_month            2.000000
pickup_day              3.000000
pickup_hour            10.000000
pickup_minute          42.000000
pickup_second           3.000000
longitude_diff         -0.001244
latitude_diff           0.018265
distance_2D             0.018307
Name: 1, dtype: float64
pickup_longitude      -74.01

  return self.partial_fit(X, y)


In [38]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


KeyError: ('distance_temp', 'occurred at index pickup_longitude')

In [39]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
# df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
# df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
# """
# Your Code Here, set new character at df['distance_real']
# """
# df.drop(columns=['distance_temp'],inplace=True)
# # 觀察結果 
# train_X = scaler.fit_transform(df)
# print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
# print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


40.761071
40.72343063354492
40.715109999999996
40.787275
40.729717
40.721804999999996
40.789303000000004
40.764668
40.769971999999996
40.734727
40.77404
40.76879
40.727033
40.728124
40.783983
40.714289
40.761494
40.730875
40.757627
40.803832
40.731349
40.810140000000004
40.761511999999996
40.767047999999996
40.781613
40.766
40.75905
40.780442
40.770263
40.769698
40.778872
40.721392
40.728907
40.782242
40.769675
40.719572
40.778162
40.74077
40.740137
40.745531
40.780597
40.780448
40.759996
40.759965
40.785712
40.756921999999996
40.732062
40.77287000000001
40.786277
40.715532
40.708110999999995
40.739512
40.761565000000004
40.730245000000004
40.731742
40.769835
40.779154999999996
40.765926361083984
40.731012
40.750232000000004
40.764818
40.741875
40.731
40.782029
40.781555
40.757095
40.740671
40.747095
40.74295
40.727173
40.752052307128906
40.77235200000001
40.727756
40.773645
40.769977000000004
40.716275
40.71718
40.75722885131836
40.772242
40.723017
40.762029
40.748307000000004
40.7644

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
4970    None
4971    None
4972    None
4973    None
4974    None
4975    None
4976    None
4977    None
4978    None
4979    None
4980    None
4981    None
4982    None
4983    None
4984    None
4985    None
4986    None
4987    None
4988    None
4989    None
4990    None
4991    None
4992    None
4993    None
4994    None
4995    None
4996    None
4997    None
4998    None
4999    None
Length: 5000, dtype: object

 # 範例 : 計程車費率預測
 https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

 # [作業目標]
 - 使用並觀察特徵組合, 在計程車費率預測競賽的影響

 # [作業重點]
 - 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[6], Out[6])
 - 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[7], Out[7])

In [40]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [41]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [42]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475640173


Gradient Boosting Reg Score : 0.7105095260656622


In [43]:

# 增加緯度差, 經度差, 座標距離等三個特徵
df['longitude_diff'] = df['dropoff_longitude'] - df['pickup_longitude']
df['latitude_diff'] = df['dropoff_latitude'] - df['pickup_latitude']
df['distance_2D'] = (df['longitude_diff']**2 + df['latitude_diff']**2)**0.5
df[['distance_2D', 'longitude_diff', 'latitude_diff', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].head()



Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [44]:
# 結果 : 準確度上升
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


Linear Reg Score : 0.027473522287347694
Gradient Boosting Reg Score : 0.8051287162405014


  return self.partial_fit(X, y)


 # 作業1
 * 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?

In [45]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
# df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


KeyError: ('distance_temp', 'occurred at index pickup_longitude')

 # 作業2
 * 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?

In [46]:
train_X = scaler.fit_transform(df[['distance_real']])
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



KeyError: "None of [Index(['distance_real'], dtype='object')] are in the [columns]"

In [47]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
# df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
# df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.027473522287347694


Gradient Boosting Reg Score : 0.8050845398433102


In [48]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
# df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


KeyError: ('distance_temp', 'occurred at index pickup_longitude')

In [49]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
# df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)
df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])))
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


KeyError: ('distance_temp', 'occurred at index pickup_longitude')

In [50]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0
# df.apply(lambda x : print(x['pickup_latitude']),axis = 1)
df['distance_temp']=df.apply(lambda x: sin(radians(x['latitude_diff']) / 2)**2 + cos(radians(x['pickup_latitude'])) * cos(radians(x['dropoff_latitude'])) * sin(radians(x['longitude_diff']) / 2)**2,axis = 1)

df['distance_real']=df.apply(lambda x : R * 2 * atan2(sqrt(x['distance_temp']), sqrt(1 - x['distance_temp'])),axis = 1)
"""
Your Code Here, set new character at df['distance_real']
"""
df.drop(columns=['distance_temp'],inplace=True)
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.3673261412185111


Gradient Boosting Reg Score : 0.8045840029878848


 # 作業2
 * 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?

In [51]:
df.drop(columns=['distance_2D','latitude_diff','longitude_diff','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'])

Unnamed: 0,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,distance_real
0,2,2011,10,21,23,54,10,0.841214
1,1,2015,2,3,10,42,3,2.034290
2,2,2014,3,16,18,58,58,0.863469
3,3,2009,6,13,16,10,54,1.876349
4,3,2014,6,12,3,25,56,3.629241
5,1,2011,7,16,1,19,59,1.669293
6,1,2009,6,27,18,15,0,1.965126
7,1,2009,1,23,20,38,16,1.139969
8,1,2010,8,9,14,46,3,1.331393
9,1,2011,9,29,21,56,45,1.656440


In [52]:
train_X = scaler.fit_transform(df[['distance_real']])
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



Linear Reg Score : 0.00115360961423967
Gradient Boosting Reg Score : 0.715704780543987


 # [教學目標]
 - 以下用房價預測資料, 觀察群聚編碼的效果

 # [範例重點]
 - 了解群聚編碼的寫作方式(In[3], Out[3])
 - 觀察群聚編碼, 搭配線性迴歸以及隨機森林分別有什麼影響 (In[6]~In[9], Out[6]~Out[9])

In [53]:
# 請先確認您的 sklearn 版本是否相同, 如果版本不是 0.21.1 版, 有可能跑出與本範例不同的結果
import sklearn
sklearn.__version__



'0.20.3'

In [54]:
# 請先確認您的 sklearn 版本是否相同, 如果版本不是 0.21.1 版, 有可能跑出與本範例不同的結果
import sklearn
sklearn.__version__



'0.20.3'

In [55]:
# 請先確認您的 sklearn 版本是否相同, 如果版本不是 0.21.1 版, 有可能跑出與本範例不同的結果
import sklearn
sklearn.__version__



'0.20.3'

In [1]:
# 請先確認您的 sklearn 版本是否相同, 如果版本不是 0.21.1 版, 有可能跑出與本範例不同的結果
import sklearn
sklearn.__version__



'0.21.2'

In [2]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

data_path = 'data/'
df = pd.read_csv(data_path + 'house_train.csv.gz')

train_Y = np.log1p(df['SalePrice'])
df = df.drop(['Id', 'SalePrice'] , axis=1)
df.head()



Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [3]:
# 生活總面積(GrLivArea)對販售條件(SaleCondition)做群聚編碼
# 寫法類似均值編碼, 只是對另一個特徵, 而非目標值
df['SaleCondition'] = df['SaleCondition'].fillna('None')
mean_df = df.groupby(['SaleCondition'])['GrLivArea'].mean().reset_index()
mode_df = df.groupby(['SaleCondition'])['GrLivArea'].apply(lambda x: x.mode()[0]).reset_index()
median_df = df.groupby(['SaleCondition'])['GrLivArea'].median().reset_index()
max_df = df.groupby(['SaleCondition'])['GrLivArea'].max().reset_index()
temp = pd.merge(mean_df, mode_df, how='left', on=['SaleCondition'])
temp = pd.merge(temp, median_df, how='left', on=['SaleCondition'])
temp = pd.merge(temp, max_df, how='left', on=['SaleCondition'])
temp.columns = ['SaleCondition', 'Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max']
temp



Unnamed: 0,SaleCondition,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,Abnorml,1436.128713,864,1302.0,4476
1,AdjLand,1112.5,980,1143.0,1184
2,Alloca,1701.75,1535,1439.5,3194
3,Family,1480.95,948,1390.5,2526
4,Normal,1492.96828,864,1456.0,4316
5,Partial,1795.696,1456,1646.0,5642


In [4]:
df = pd.merge(df, temp, how='left', on=['SaleCondition'])
df = df.drop(['SaleCondition'] , axis=1)
df.head()



Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,1492.96828,864,1456.0,4316
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,1492.96828,864,1456.0,4316
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,1492.96828,864,1456.0,4316
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,1436.128713,864,1302.0,4476
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,1492.96828,864,1456.0,4316


In [5]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# 削減文字型欄位, 只剩數值型欄位
df = df[num_features]
df = df.fillna(-1)
MMEncoder = MinMaxScaler()
df.head()



40 Numeric Features : ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max']



Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,2,2008,1492.96828,864,1456.0,4316
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,5,2007,1492.96828,864,1456.0,4316
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,9,2008,1492.96828,864,1456.0,4316
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,2,2006,1436.128713,864,1302.0,4476
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,12,2008,1492.96828,864,1456.0,4316


In [6]:
# 沒有這四個新特徵的 dataframe 稱為 df_minus
df_minus = df.drop(['Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max'] , axis=1)

# 原始特徵 + 線性迴歸
train_X = MMEncoder.fit_transform(df_minus)
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.8466510874327978

In [7]:
# 新特徵 + 線性迴歸 : 有些微改善
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.8492735339540589

In [8]:
# 原始特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df_minus)
estimator = RandomForestRegressor()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.8477258616251987

In [9]:
# 新特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()


0.8431978472246943

 # 作業1
 * 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

 # 作業2
 * 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

In [10]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')

train_Y = df['Survived']
df = df.drop(['PassengerId', 'Survived'] , axis=1)
df.head()



Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# 取一個類別型欄位, 與一個數值型欄位, 做群聚編碼
"""
Your Code Here
"""
mean = df.groupby('Ticket')['Fare'].mean().reset_index(drop =True)
mean



0       86.5000
1       79.6500
2       52.0000
3       26.5500
4       75.2500
5       33.5000
6       38.5000
7       57.9792
8       30.0000
9       26.5500
10      26.5500
11      26.5500
12       0.0000
13       0.0000
14      30.0000
15       0.0000
16       0.0000
17      31.0000
18      39.6000
19      26.5500
20      28.5000
21      26.5500
22      27.7500
23      26.5500
24      26.0000
25      47.1000
26      30.0000
27     211.5000
28      55.0000
29      61.9792
         ...   
651      8.0500
652      8.0500
653      8.0500
654      8.0500
655      7.9250
656      7.1250
657      7.1250
658      7.1250
659      7.9250
660      7.9250
661      7.9250
662      7.9250
663      7.9250
664      7.9250
665      7.9250
666      7.9250
667      7.9250
668     15.8500
669      7.9250
670      7.9250
671      7.9250
672     10.5000
673     10.5000
674     10.5000
675     23.4500
676     34.3750
677      7.5500
678     61.1750
679     10.5000
680     71.0000
Name: Fare, Length: 681,

In [12]:
# 取一個類別型欄位, 與一個數值型欄位, 做群聚編碼
"""
Your Code Here
"""
mean = df.groupby('Ticket')['Fare'].agg({'Fare_Mean':'mean'}).reset_index()
mean



Unnamed: 0,Ticket,Fare_Mean
0,110152,86.5000
1,110413,79.6500
2,110465,52.0000
3,110564,26.5500
4,110813,75.2500
5,111240,33.5000
6,111320,38.5000
7,111361,57.9792
8,111369,30.0000
9,111426,26.5500


In [13]:
# 取一個類別型欄位, 與一個數值型欄位, 做群聚編碼
"""
Your Code Here
"""
mean = df.groupby('Ticket')['Fare'].agg({'Fare_Mean':'mean'}).reset_index()
pd.merge(df,mean,how='left',on='Ticket')



Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_Mean
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,7.25000
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,71.28330
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,7.92500
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,53.10000
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,8.05000
5,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,8.45830
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,51.86250
7,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,21.07500
8,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,11.13330
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,30.07080


In [14]:
# 取一個類別型欄位, 與一個數值型欄位, 做群聚編碼
"""
Your Code Here
"""
mean = df.groupby('Ticket')['Fare'].agg({'Fare_Mean':'mean'}).reset_index()
df = pd.merge(df,mean,how='left',on='Ticket')



In [15]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# 削減文字型欄位, 只剩數值型欄位
df = df[num_features]
df = df.fillna(-1)
MMEncoder = MinMaxScaler()
df.head()


6 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Fare_Mean']



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Fare_Mean
0,3,22.0,1,0,7.25,7.25
1,1,38.0,1,0,71.2833,71.2833
2,3,26.0,0,0,7.925,7.925
3,1,35.0,1,0,53.1,53.1
4,3,35.0,0,0,8.05,8.05


In [16]:
# 原始特徵 + 邏輯斯迴歸
"""
Your Code Here
"""
x_train = df[['Pclass','Age','SibSp','Parch','Fare','Fare_Mean']]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



ValueError: Found input variables with inconsistent numbers of samples: [1460, 891]

In [17]:
x_train.shape

(891, 6)

In [18]:
train_Y.shape

(891,)

In [19]:
# 原始特徵 + 邏輯斯迴歸
"""
Your Code Here
"""
train_X = df[['Pclass','Age','SibSp','Parch','Fare','Fare_Mean']]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.11434482469447602

In [20]:
# 新特徵 + 邏輯斯迴歸
"""
Your Code Here
"""
train_X = df[['Fare_Mean']]
estimator = LogisticRegression()
cross_val_score(estimator,train_X,train_X)


ValueError: Unknown label type: 'continuous'

In [21]:
# 新特徵 + 邏輯斯迴歸
"""
Your Code Here
"""

train_X = df[['Fare_Mean']]
train_X
# estimator = LogisticRegression()
# cross_val_score(estimator,train_X,train_X)


Unnamed: 0,Fare_Mean
0,7.25000
1,71.28330
2,7.92500
3,53.10000
4,8.05000
5,8.45830
6,51.86250
7,21.07500
8,11.13330
9,30.07080


In [22]:
# 新特徵 + 邏輯斯迴歸
"""
Your Code Here
"""

train_X = df[['Fare_Mean']]
train_X
estimator = LogisticRegression()
cross_val_score(estimator,train_X,train_Y,cv=5).mean()


0.6644676707850629

In [23]:
# 新特徵 + 邏輯斯迴歸
"""
Your Code Here
"""

train_X = df[['Fare_Mean']]
train_X
estimator = LogisticRegression()
cross_val_score(estimator,train_X,train_Y,cv=5).mean()


0.6644676707850629