In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
try:
	os.chdir(os.path.join(os.getcwd(), '..'))
	print(os.getcwd())
except:
	pass


 # 範例 : (Kaggle)鐵達尼生存預測

 # [教學目標]
 - 以下用鐵達尼生存預測資料, 觀察計數編碼與特徵雜湊的效果

 # [範例重點]
 - 了解計數編碼的寫作方式(In[5], Out[5]), 以及計數編碼搭配邏輯斯迴歸對於測結果有什麼影響 (In[7], Out[7])
 - 觀察 雜湊編碼, 以及 計數編碼+雜湊編碼 分別搭配邏輯斯迴歸對於測結果有什麼影響 (In[8], Out[8], In[9], Out[9])

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

data_path = 'data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()



Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
#只取類別值 (object) 型欄位, 存於 object_features 中
object_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'object':
        object_features.append(feature)
print(f'{len(object_features)} Object Features : {object_features}\n')

# 只留類別型欄位
df = df[object_features]
df = df.fillna('None')
train_num = train_Y.shape[0]
df.head()



5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [3]:
# 觀察欄位相異值數量
df.select_dtypes(include=["object"]).apply(pd.Series.nunique)



Name        1307
Sex            2
Ticket       929
Cabin        187
Embarked       4
dtype: int64

In [4]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [5]:
# 加上 'Ticket' 欄位的計數編碼
# 第一行 : df.groupby(['Ticket']) 會輸出 df 以 'Ticket' 群聚後的結果, 但因為群聚一類只會有一個值, 因此必須要定義運算
# 例如 df.groupby(['Ticket']).size(), 但欄位名稱會變成 size, 要取別名就需要用語法 df.groupby(['Ticket']).agg({'Ticket_Count':'size'})
# 這樣出來的計數欄位名稱會叫做 'Ticket_Count', 因為這樣群聚起來的 'Ticket' 是 index, 所以需要 reset_index() 轉成一欄
# 因此第一行的欄位, 在第三行按照 'Ticket_Count' 排序後, 最後的 DataFrame 輸出如 Out[5]
count_df = df.groupby(['Ticket'])['Name'].agg({'Ticket_Count':'size'}).reset_index()
# 但是上面資料表結果只是 'Ticket' 名稱對應的次數, 要做計數編碼還需要第二行 : 將上表結果與原表格 merge, 合併於 'Ticket' 欄位
# 使用 how='left' 是完全保留原資料表的所有 index 與順序
df = pd.merge(df, count_df, on=['Ticket'], how='left')
count_df.sort_values(by=['Ticket_Count'], ascending=False).head(10)



is deprecated and will be removed in a future version
  


Unnamed: 0,Ticket,Ticket_Count
778,CA. 2343,11
104,1601,8
775,CA 2144,8
335,3101295,7
454,347077,7
459,347082,7
847,S.O.C. 14879,7
824,PC 17608,7
123,19950,6
49,113781,6


In [6]:
# 印出來看看, 加了計數編碼的資料表 df 有何不同
df.head()



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Count
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,2
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,2
4,"Allen, Mr. William Henry",male,373450,,S,1


In [7]:
# 'Ticket'計數編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Ticket_Count'] = df['Ticket_Count']
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()





0.7811221556805532




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Count
0,155,1,720,185,3,1
1,286,0,816,106,0,2
2,523,0,914,185,3,1
3,422,0,65,70,3,2
4,22,1,649,185,3,1


In [8]:
# 'Ticket'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
# 這邊的雜湊編碼, 是直接將 'Ticket' 的名稱放入雜湊函數的輸出數值, 為了要確定是緊密(dense)特徵, 因此除以10後看餘數
# 這邊的 10 是隨機選擇, 不一定要用 10, 同學可以自由選擇購小的數字試看看. 基本上效果都不會太好
df_temp['Ticket_Hash'] = df['Ticket'].map(lambda x:hash(x) % 10)
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.7799985601749351




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash
0,155,1,720,185,3,6
1,286,0,816,106,0,5
2,523,0,914,185,3,8
3,422,0,65,70,3,0
4,22,1,649,185,3,3


In [9]:
# 'Ticket'計數編碼 + 'Ticket'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Ticket_Hash'] = df['Ticket'].map(lambda x:hash(x) % 10)
df_temp['Ticket_Count'] = df['Ticket_Count']
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()




0.7811158786106893


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count
0,155,1,720,185,3,6,1
1,286,0,816,106,0,5,2
2,523,0,914,185,3,8,1
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1


 # 作業1
 * 參考範例，將鐵達尼的艙位代碼( 'Cabin' )欄位使用特徵雜湊 / 標籤編碼 / 目標均值編碼三種轉換後，
 與其他類別型欄位一起預估生存機率

 # 作業2
 * 承上題，三者比較效果何者最好?

In [10]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

data_path = 'data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()



Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
#只取類別值 (object) 型欄位, 存於 object_features 中
object_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'object':
        object_features.append(feature)
print(f'{len(object_features)} Object Features : {object_features}\n')

# 只留類別型欄位
df = df[object_features]
df = df.fillna('None')
train_num = train_Y.shape[0]
df.head()


5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


 # 作業2
 * 承上題，三者比較效果何者最好?

In [12]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].aggregate({'Cabin_Count':'size'})
df_group


is deprecated and will be removed in a future version
  


Unnamed: 0_level_0,Cabin_Count
Cabin,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [13]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'})
df_group


is deprecated and will be removed in a future version
  


Unnamed: 0_level_0,Cabin_Count
Cabin,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [14]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index()
df_group


is deprecated and will be removed in a future version
  


Unnamed: 0,Cabin,Cabin_Count
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


In [15]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index(drop=True)
df_group


is deprecated and will be removed in a future version
  


Unnamed: 0,Cabin_Count
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [16]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index(drop=True)
df_temp = df_temp.merge(df_group,how='inner',on='Cabin')
df_temp



is deprecated and will be removed in a future version
  


KeyError: 'Cabin'

In [17]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index(drop=True)
df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp



is deprecated and will be removed in a future version
  


KeyError: 'Cabin'

In [18]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index(drop=True)

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group



is deprecated and will be removed in a future version
  


Unnamed: 0,Cabin_Count
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [19]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin':'Cabin','Cabin_Count':'size'}).reset_index(drop=True)

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group



is deprecated and will be removed in a future version
  


AttributeError: 'SeriesGroupBy' object has no attribute 'Cabin'

In [20]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin':'Cabin','Cabin_Count':'size'}).reset_index(drop=True)

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.describe



is deprecated and will be removed in a future version
  


AttributeError: 'SeriesGroupBy' object has no attribute 'Cabin'

In [21]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Cabin'].aggregate({'Cabin_Count':'size'}).reset_index(drop=True)

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.describe



is deprecated and will be removed in a future version
  


<bound method NDFrame.describe of      Cabin_Count
0              1
1              1
2              1
3              1
4              1
5              1
6              1
7              1
8              1
9              1
10             1
11             1
12             1
13             1
14             3
15             1
16             1
17             1
18             1
19             1
20             1
21             1
22             1
23             1
24             2
25             1
26             2
27             2
28             1
29             1
..           ...
157            1
158            1
159            1
160            1
161            2
162            1
163            2
164            1
165            2
166            1
167            1
168            1
169            1
170            2
171            1
172            1
173            2
174            1
175            1
176            1
177            1
178            2
179            2
180            4
181           

In [22]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).reset_index(drop=True)

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.describe



AttributeError: Cannot access callable attribute 'reset_index' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [23]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020BB70293C8>

In [24]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.describe()



Unnamed: 0_level_0,Name,Name,Name,Name,Name,Name,Name,Name,Sex,Sex,...,Ticket_Hash,Ticket_Hash,Ticket_Count,Ticket_Count,Ticket_Count,Ticket_Count,Ticket_Count,Ticket_Count,Ticket_Count,Ticket_Count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Cabin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1.0,1024.000000,,1024.0,1024.00,1024.0,1024.00,1024.0,1.0,1.000000,...,0.00,0.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
1,1.0,1022.000000,,1022.0,1022.00,1022.0,1022.00,1022.0,1.0,0.000000,...,8.00,8.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
2,1.0,243.000000,,243.0,243.00,243.0,243.00,243.0,1.0,1.000000,...,4.00,4.0,1.0,2.000000,,2.0,2.0,2.0,2.0,2.0
3,1.0,352.000000,,352.0,352.00,352.0,352.00,352.0,1.0,0.000000,...,5.00,5.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
4,1.0,354.000000,,354.0,354.00,354.0,354.00,354.0,1.0,1.000000,...,1.00,1.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
5,1.0,1117.000000,,1117.0,1117.00,1117.0,1117.00,1117.0,1.0,1.000000,...,5.00,5.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
6,1.0,353.000000,,353.0,353.00,353.0,353.00,353.0,1.0,1.000000,...,0.00,0.0,1.0,2.000000,,2.0,2.0,2.0,2.0,2.0
7,1.0,151.000000,,151.0,151.00,151.0,151.00,151.0,1.0,1.000000,...,4.00,4.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
8,1.0,93.000000,,93.0,93.00,93.0,93.00,93.0,1.0,1.000000,...,4.00,4.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
9,1.0,1013.000000,,1013.0,1013.00,1013.0,1013.00,1013.0,1.0,1.000000,...,4.00,4.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0


In [25]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head()



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count
0,155,1,720,185,3,6,1
1,286,0,816,106,0,5,2
2,523,0,914,185,3,8,1
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1
5,818,1,373,185,2,8,1
6,767,1,109,163,3,1,2
7,914,1,541,185,3,9,5
10,1067,0,839,184,3,7,3
11,133,0,50,62,3,0,1


In [26]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head(5)



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count
0,155,1,720,185,3,6,1
1,286,0,816,106,0,5,2
2,523,0,914,185,3,8,1
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1
5,818,1,373,185,2,8,1
6,767,1,109,163,3,1,2
7,914,1,541,185,3,9,5
10,1067,0,839,184,3,7,3
11,133,0,50,62,3,0,1


In [27]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).reset_index()

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head(5)



AttributeError: Cannot access callable attribute 'reset_index' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [28]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head(5)



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count
0,155,1,720,185,3,6,1
1,286,0,816,106,0,5,2
2,523,0,914,185,3,8,1
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1
5,818,1,373,185,2,8,1
6,767,1,109,163,3,1,2
7,914,1,541,185,3,9,5
10,1067,0,839,184,3,7,3
11,133,0,50,62,3,0,1


In [29]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].size()

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head(5)



Cabin
0    1
1    1
2    1
3    1
4    1
Name: Name, dtype: int64

In [30]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].size()

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head()



Cabin
0    1
1    1
2    1
3    1
4    1
Name: Name, dtype: int64

In [31]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head()



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count
0,155,1,720,185,3,6,1
1,286,0,816,106,0,5,2
2,523,0,914,185,3,8,1
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1
5,818,1,373,185,2,8,1
6,767,1,109,163,3,1,2
7,914,1,541,185,3,9,5
10,1067,0,839,184,3,7,3
11,133,0,50,62,3,0,1


In [32]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).resample()

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.head()



TypeError: resample() missing 1 required positional argument: 'rule'

In [33]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.columns



AttributeError: Cannot access attribute 'columns' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [34]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])

# df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_group.size()



Cabin
0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        3
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        2
25        1
26        2
27        2
28        1
29        1
       ... 
157       1
158       1
159       1
160       1
161       2
162       1
163       2
164       1
165       2
166       1
167       1
168       1
169       1
170       2
171       1
172       1
173       2
174       1
175       1
176       1
177       1
178       2
179       2
180       4
181       4
182       1
183       4
184       5
185    1014
186       1
Length: 187, dtype: int64

In [35]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).size()

df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp



ValueError: Cannot merge a Series without a name

In [36]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).aggregate({'Cabin_Cnt':'size'}).reset_index()

df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp



  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


KeyError: 'Cabin_Cnt'

In [37]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).agg({'Cabin_Cnt':'size'}).reset_index()

df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp



KeyError: 'Cabin_Cnt'

In [38]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin']).agg({'Cabin_Cnt':'size'})
df_group
#df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])




KeyError: 'Cabin_Cnt'

In [39]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].agg({'Cabin_Cnt':'size'})
df_group
#df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])




is deprecated and will be removed in a future version
  


Unnamed: 0_level_0,Cabin_Cnt
Cabin,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [40]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].agg({'Cabin_Cnt':'size'}).reset_index()
df_group
#df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])




is deprecated and will be removed in a future version
  


Unnamed: 0,Cabin,Cabin_Cnt
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


In [41]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].agg({'Cabin_Cnt':'size'}).reset_index()
#df_group
df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp




is deprecated and will be removed in a future version
  


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count,Cabin_Cnt
0,155,1,720,185,3,6,1,1014
1,286,0,816,106,0,5,2,2
2,523,0,914,185,3,8,1,1014
3,422,0,65,70,3,0,2,2
4,22,1,649,185,3,3,1,1014
5,818,1,373,185,2,8,1,1014
6,767,1,109,163,3,1,2,2
7,914,1,541,185,3,9,5,1014
8,605,0,477,185,3,6,3,1014
9,847,0,174,185,0,6,2,1014


In [42]:
# 'Cabin'計數編碼 + 邏輯斯迴歸
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()


0.7856167504850543




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Ticket_Hash,Ticket_Count,Cabin_Cnt
0,155,1,720,185,3,6,1,1014
1,286,0,816,106,0,5,2,2
2,523,0,914,185,3,8,1,1014
3,422,0,65,70,3,0,2,2
4,22,1,649,185,3,3,1,1014


In [43]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [44]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp['Cabin_Hash'] = df['Cabin'].map(lambda x :hash(x))
df_temp



ValueError: cannot reindex from a duplicate axis

In [45]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'].map(lambda x :hash(x))




0      6097522628410843938
1     -2835752437946489239
2      6097522628410843938
3      4905423006018131875
4      6097522628410843938
5      6097522628410843938
6      2677398239773026556
7      6097522628410843938
8      6097522628410843938
9      6097522628410843938
10    -3069913196726041193
11    -3193709815818869201
12     6097522628410843938
13     6097522628410843938
14     6097522628410843938
15     6097522628410843938
16     6097522628410843938
17     6097522628410843938
18     6097522628410843938
19     6097522628410843938
20     6097522628410843938
21    -8784841843237084442
22     6097522628410843938
23    -7706805886701669038
24     6097522628410843938
25     6097522628410843938
26     6097522628410843938
27     8459639832252913839
28     6097522628410843938
29     6097522628410843938
              ...         
388    6097522628410843938
389    6097522628410843938
390   -1921438572292735713
391   -7456340541540082510
392    6097522628410843938
393    6097522628410843938
3

In [46]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'].map(lambda x :hash(x)%100)




0      38
1      61
2      38
3      75
4      38
5      38
6      56
7      38
8      38
9      38
10      7
11     99
12     38
13     38
14     38
15     38
16     38
17     38
18     38
19     38
20     38
21     58
22     38
23     62
24     38
25     38
26     38
27     39
28     38
29     38
       ..
388    38
389    38
390    87
391    90
392    38
393    38
394    38
395    52
396    38
397    41
398    38
399    38
400    77
401    38
402    38
403    38
404    25
405    95
406    38
407    16
408    38
409    38
410    38
411    45
412    38
413    38
414    30
415    38
416    38
417    38
Name: Cabin, Length: 1309, dtype: int64

In [47]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3
5,818,1,373,185,2
6,767,1,109,163,3
7,914,1,541,185,3
8,605,0,477,185,3
9,847,0,174,185,0


In [48]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [49]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].agg({'Cabin_Cnt':'size'}).reset_index()
#df_group
df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp





is deprecated and will be removed in a future version
  


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_Cnt
0,155,1,720,185,3,1014
1,286,0,816,106,0,2
2,523,0,914,185,3,1014
3,422,0,65,70,3,2
4,22,1,649,185,3,1014
5,818,1,373,185,2,1014
6,767,1,109,163,3,2
7,914,1,541,185,3,1014
8,605,0,477,185,3,1014
9,847,0,174,185,0,1014


In [50]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_Cnt
0,155,1,720,185,3,1014
1,286,0,816,106,0,2
2,523,0,914,185,3,1014
3,422,0,65,70,3,2
4,22,1,649,185,3,1014
5,818,1,373,185,2,1014
6,767,1,109,163,3,2
7,914,1,541,185,3,1014
8,605,0,477,185,3,1014
9,847,0,174,185,0,1014


In [51]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'] .map(lambda x:hash(x)%5)




0      3
1      1
2      3
3      0
4      3
5      3
6      1
7      3
8      3
9      3
10     2
11     4
12     3
13     3
14     3
15     3
16     3
17     3
18     3
19     3
20     3
21     3
22     3
23     2
24     3
25     3
26     3
27     4
28     3
29     3
      ..
388    3
389    3
390    2
391    0
392    3
393    3
394    3
395    2
396    3
397    1
398    3
399    3
400    2
401    3
402    3
403    3
404    0
405    0
406    3
407    1
408    3
409    3
410    3
411    0
412    3
413    3
414    0
415    3
416    3
417    3
Name: Cabin, Length: 1309, dtype: int64

In [52]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp['Cabin_Hash'] = df['Cabin'] .map(lambda x:hash(x)%5)




ValueError: cannot reindex from a duplicate axis

In [53]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
pd.concat(df_temp,tmpCabin_Hash)
df_temp





TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [54]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
pd.concat([df_temp,tmpCabin_Hash]"")






SyntaxError: invalid syntax (<ipython-input-54-faf99a820c24>, line 3)

In [55]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
pd.concat([df_temp,tmpCabin_Hash])






Unnamed: 0,0,Cabin,Cabin_Cnt,Embarked,Name,Sex,Ticket
0,,185.0,1014.0,3.0,155.0,1.0,720.0
1,,106.0,2.0,0.0,286.0,0.0,816.0
2,,185.0,1014.0,3.0,523.0,0.0,914.0
3,,70.0,2.0,3.0,422.0,0.0,65.0
4,,185.0,1014.0,3.0,22.0,1.0,649.0
5,,185.0,1014.0,2.0,818.0,1.0,373.0
6,,163.0,2.0,3.0,767.0,1.0,109.0
7,,185.0,1014.0,3.0,914.0,1.0,541.0
8,,185.0,1014.0,3.0,605.0,0.0,477.0
9,,185.0,1014.0,0.0,847.0,0.0,174.0


In [56]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
pd.concat([df_temp,tmpCabin_Hash],axis=1)






ValueError: Shape of passed values is (1727, 7), indices imply (1309, 7)

In [57]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
tmpCabin_Hash.shape





(1309,)

In [58]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmpCabin_Hash = df['Cabin'] .map(lambda x:hash(x)%5)
df_temp.shape





(1309, 6)

In [59]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[f'{c}_LabelEncoder'] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name_LabelEncoder,Sex_LabelEncoder,Ticket_LabelEncoder,Cabin_LabelEncoder,Embarked_LabelEncoder
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [60]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin'])['Name'].agg({'Cabin_Cnt':'size'}).reset_index()
#df_group
df_temp = df_temp.merge(df_group,how='left',on=['Cabin'])
df_temp





KeyError: 'Cabin'

In [61]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[f'{c}_LabelEncoder'] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name_LabelEncoder,Sex_LabelEncoder,Ticket_LabelEncoder,Cabin_LabelEncoder,Embarked_LabelEncoder
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [62]:
# 加上 'Cabin' 欄位的計數編碼
df_group  = df_temp.groupby(['Cabin_LabelEncoder'])['Name'].agg({'Cabin_Cnt':'size'}).reset_index()
#df_group
df_temp = df_temp.merge(df_group,how='left',on=['Cabin_LabelEncoder'])
df_temp





KeyError: 'Column not found: Name'

In [63]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[f'{c}_LabelEncoder'] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name_LabelEncoder,Sex_LabelEncoder,Ticket_LabelEncoder,Cabin_LabelEncoder,Embarked_LabelEncoder
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [64]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Cabin_Hash'] = df['Cabin'].map(lambda x : hash(x)%3)
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()




ValueError: cannot reindex from a duplicate axis

In [65]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Cabin_Hash'] = df['Cabin'].map(lambda x : hash(x)%3).reset_index
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()






TypeError: float() argument must be a string or a number, not 'method'

In [66]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Cabin_Hash'] = df['Cabin'].map(lambda x : hash(x)%3).reset_index()
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()




ValueError: Wrong number of items passed 2, placement implies 1

In [67]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Cabin_Hash'] = df['Cabin'].map(lambda x : hash(x)%3)
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()




ValueError: cannot reindex from a duplicate axis

In [68]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'].map(lambda x : hash(x)%3)





0      0
1      2
2      0
3      1
4      0
5      0
6      1
7      0
8      0
9      0
10     1
11     1
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     1
22     0
23     1
24     0
25     0
26     0
27     0
28     0
29     0
      ..
388    0
389    0
390    0
391    2
392    0
393    0
394    0
395    2
396    0
397    2
398    0
399    0
400    1
401    0
402    0
403    0
404    2
405    0
406    0
407    1
408    0
409    0
410    0
411    1
412    0
413    0
414    2
415    0
416    0
417    0
Name: Cabin, Length: 1309, dtype: int64

In [69]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'].map(lambda x : hash(x)%3).reset_index()





Unnamed: 0,index,Cabin
0,0,0
1,1,2
2,2,0
3,3,1
4,4,0
5,5,0
6,6,1
7,7,0
8,8,0
9,9,0


In [70]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmp = df['Cabin'].map(lambda x : hash(x)%3).reset_index()
tmp.groupby('index').size()





index
0      2
1      2
2      2
3      2
4      2
5      2
6      2
7      2
8      2
9      2
10     2
11     2
12     2
13     2
14     2
15     2
16     2
17     2
18     2
19     2
20     2
21     2
22     2
23     2
24     2
25     2
26     2
27     2
28     2
29     2
      ..
861    1
862    1
863    1
864    1
865    1
866    1
867    1
868    1
869    1
870    1
871    1
872    1
873    1
874    1
875    1
876    1
877    1
878    1
879    1
880    1
881    1
882    1
883    1
884    1
885    1
886    1
887    1
888    1
889    1
890    1
Length: 891, dtype: int64

In [71]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmp = df['Cabin'].map(lambda x : hash(x)%3).reset_index()
tmp.nunique()






index    891
Cabin      3
dtype: int64

In [72]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
tmp = df['Cabin'].map(lambda x : hash(x)%3).reset_index()
tmp.unique()






AttributeError: 'DataFrame' object has no attribute 'unique'

In [73]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp['Cabin_Orgin'] = copy.deepcopy(df['Cabin'])







ValueError: cannot reindex from a duplicate axis

In [74]:
#只取類別值 (object) 型欄位, 存於 object_features 中
object_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'object':
        object_features.append(feature)
print(f'{len(object_features)} Object Features : {object_features}\n')

# 只留類別型欄位
df = df[object_features]
df = df.fillna('None')
train_num = train_Y.shape[0]
df.head()


5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [75]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [76]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()



0.780004837244799




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [77]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df







Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
5,"Moran, Mr. James",male,330877,,Q
6,"McCarthy, Mr. Timothy J",male,17463,E46,S
7,"Palsson, Master. Gosta Leonard",male,349909,,S
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,347742,,S
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,237736,,C


In [78]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df.shape







(1309, 5)

In [79]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df.shape
df_temp.shape







(1309, 5)

In [80]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin']







0             None
1              C85
2             None
3             C123
4             None
5             None
6              E46
7             None
8             None
9             None
10              G6
11            C103
12            None
13            None
14            None
15            None
16            None
17            None
18            None
19            None
20            None
21             D56
22            None
23              A6
24            None
25            None
26            None
27     C23 C25 C27
28            None
29            None
          ...     
388           None
389           None
390            B24
391            D28
392           None
393           None
394           None
395            C31
396           None
397            B41
398           None
399           None
400             C7
401           None
402           None
403           None
404            D40
405            D38
406           None
407            C80
408           None
409         

In [81]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df['Cabin'][2]







2    None
2    None
Name: Cabin, dtype: object

In [82]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df[('Cabin' == 'C85')]






KeyError: False

In [83]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df[['Cabin'] == 'C85']






KeyError: False

In [84]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df[['Cabin = C85']]






KeyError: "None of [Index(['Cabin = C85'], dtype='object')] are in the [columns]"

In [85]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df[['Cabin']=='C85']






KeyError: False

In [86]:
df.query('Cabin == C85
')

SyntaxError: EOL while scanning string literal (<ipython-input-86-8a7bf435bb8f>, line 1)

In [87]:
df.query('Cabin == C85')

UndefinedVariableError: name 'C85' is not defined

In [88]:
df.query('Cabin == "C85"')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
234,"Cumings, Mr. John Bradley",male,PC 17599,C85,C


In [89]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df.query('Cabin == "C85" && Sex == "male" ')





SyntaxError: Python keyword not valid identifier in numexpr query (<unknown>, line 1)

In [90]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df.query('Cabin == "C85" and Sex == "male" ')





Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
234,"Cumings, Mr. John Bradley",male,PC 17599,C85,C


In [91]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df.query('Cabin == "C85" & Sex == "male" ')





Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
234,"Cumings, Mr. John Bradley",male,PC 17599,C85,C


In [92]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df.query('Cabin == "C85" & Sex == "female" ')





Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C


In [93]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df.query('Cabin == "C85" ')





Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
234,"Cumings, Mr. John Bradley",male,PC 17599,C85,C


In [94]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸

df['Cabin'].apply(lambda x:hash(x))





0      6097522628410843938
1     -2835752437946489239
2      6097522628410843938
3      4905423006018131875
4      6097522628410843938
5      6097522628410843938
6      2677398239773026556
7      6097522628410843938
8      6097522628410843938
9      6097522628410843938
10    -3069913196726041193
11    -3193709815818869201
12     6097522628410843938
13     6097522628410843938
14     6097522628410843938
15     6097522628410843938
16     6097522628410843938
17     6097522628410843938
18     6097522628410843938
19     6097522628410843938
20     6097522628410843938
21    -8784841843237084442
22     6097522628410843938
23    -7706805886701669038
24     6097522628410843938
25     6097522628410843938
26     6097522628410843938
27     8459639832252913839
28     6097522628410843938
29     6097522628410843938
              ...         
388    6097522628410843938
389    6097522628410843938
390   -1921438572292735713
391   -7456340541540082510
392    6097522628410843938
393    6097522628410843938
3

 # [範例重點]
 - 觀察時間特徵分解, 在線性迴歸分數 / 梯度提升樹分數上, 分別有什麼影響 (In[2], Out[2], In[3], Out[3])
 - 觀察加入週期循環特徵, 在線性迴歸分數 / 梯度提升樹分數上, 分別有什麼影響 (In[4], Out[4], In[5], Out[5])

In [95]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [96]:
# 時間轉換方式 : 使用 datetime.strptime 解析(parse)時間字串 / 使用 datetime.strftime 匯出時間格式(format)
# 參考官網 https://docs.python.org/3/library/datetime.html
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [97]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475640173


Gradient Boosting Reg Score : 0.7120563497833997


In [98]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.782427


In [99]:
# 結果 : 預測力反而下降
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.026412252675043525


Gradient Boosting Reg Score : 0.7092714368156806


In [100]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [101]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [102]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475640173


Gradient Boosting Reg Score : 0.7107181899059468


In [103]:
# 加入星期幾與第幾周兩個特徵
df['week_day'] = df['pickup_datetime'].apply(lambda x: x.weekday())

"""
Your Code Here
"""
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3


In [104]:
# 加入星期幾與第幾周兩個特徵
df['week_day'] = df['pickup_datetime'].apply(lambda x: x.weekday())
df['weekOfYear'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%U')).astype('int64')


df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,weekOfYear
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,5
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,23
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,23


In [105]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.026467123376537716


Gradient Boosting Reg Score : 0.7167549614324216


In [106]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,weekOfYear,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,5,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,23,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,23,0.782427


In [107]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')


  return self.partial_fit(X, y)


Linear Reg Score : 0.026048223699888774


Gradient Boosting Reg Score : 0.7100703872224134


In [108]:
# 加上"年週期"與"周週期"特徵
df['year_cycle'] = df['pickup_month']/6 + df['pickup_day']/180
df['year_cycle'] = math.cos(df['year_cycle']*math.pi)


"""
Your Code Here
"""
df.head()



TypeError: cannot convert the series to <class 'float'>

In [109]:
# 加上"年週期"與"周週期"特徵
df['year_cycle'] = df['pickup_month']/6 + df['pickup_day']/180
df['year_cycle'] = df['year_cycle'].map(lambda x : math.cos(x*math.pi))


"""
Your Code Here
"""
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,weekOfYear,day_cycle,year_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545,0.777146
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,5,0.333601,0.45399
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083,-0.275637
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,23,-0.888817,-0.97437
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,23,0.782427,-0.978148


In [110]:
# 加上"年週期"與"周週期"特徵
df['year_cycle'] = df['pickup_month']/6 + df['pickup_day']/180
df['year_cycle'] = df['year_cycle'].map(lambda x : math.cos(x*math.pi))
df['week_cycle'] = df['week_day']/3.5 + df['pickup_hour']/84
df['week_cycle'] = df['week_cycle'].map(lambda x : math.cos(x*math.pi))

"""
Your Code Here
"""
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,week_day,weekOfYear,day_cycle,year_cycle,week_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545,0.777146,-0.258819
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,5,0.333601,0.45399,0.294755
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083,-0.275637,0.974928
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,23,-0.888817,-0.97437,0.365341
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,23,0.782427,-0.978148,-0.943883


In [111]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)

train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



  return self.partial_fit(X, y)


Linear Reg Score : 0.025808309784423056


Gradient Boosting Reg Score : 0.7065494492606705
