# 作業 : (Kaggle)鐵達尼生存預測
***
https://www.kaggle.com/c/titanic

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察群聚編碼的效果

# [作業重點]
- 仿造範例, 完成自己挑選特徵的群聚編碼 (In[5], Out[5])
- 觀察群聚編碼, 搭配邏輯斯回歸, 看看有什麼影響 (In[9], Out[9], In[10], Out[10]) 

# 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [1]:
# 程式區塊 A
# 將需要的都import進來
import os
import copy
import time
import math
import numpy             as np
import pandas            as pd
import seaborn           as sns
import datetime          as dt
import warnings
import matplotlib.pyplot as plt
from scipy                   import stats
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.linear_model    import LogisticRegression,LinearRegression
from sklearn.preprocessing   import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score

# Short names for objects used throughout the notebook. Note that each name is
# bound to an INSTANCE built with default arguments (a scaler, an encoder,
# three estimators and an empty DataFrame), not to the class itself.
MME  = MinMaxScaler()
LE   = LabelEncoder()
LR   = LogisticRegression()
LIR  = LinearRegression()
GBR  = GradientBoostingRegressor()
PDDF = pd.DataFrame()
# Notebook-wide settings: suppress every warning and draw matplotlib figures
# inline (the `%matplotlib` line is an IPython magic, not plain Python).
warnings.filterwarnings('ignore')
%matplotlib inline

# Folder that contains the data files; later cells join file names onto it.
data_folder = 'C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data'

In [2]:
# t001_train: full path of the training CSV inside data_folder.
# t002_train: the DataFrame loaded from that CSV.
# The path and the frame's shape are printed, then the first rows are shown.
t001_train = os.path.join(data_folder, 'titanic_train.csv')
t002_train = pd.read_csv(t001_train)
print('Path of read in data: %s' %t001_train)
print(t002_train.shape)
t002_train.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\titanic_train.csv
(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# t001_test: full path of the test CSV inside data_folder.
# t002_test: the DataFrame loaded from that CSV.
# The path and the frame's shape are printed, then the first rows are shown.
t001_test  = os.path.join(data_folder,  'titanic_test.csv')
t002_test  = pd.read_csv(t001_test)
print('Path of read in data: %s' %t001_test)
print(t002_test.shape)
t002_test.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\titanic_test.csv
(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Block B-1: keep the label column of the training set as the target, and the
# PassengerId column of the test set as the key for matching results later.
train_Y = t002_train['Survived']
test_unique = t002_test['PassengerId']

# Block B-2: build the feature frames by removing the id column (and, for the
# training set, the label column as well).
t003_train = t002_train.drop(columns=['PassengerId', 'Survived'])
t003_test = t002_test.drop(columns=['PassengerId'])
print(t003_train.shape, t003_test.shape, sep='\n')

(891, 10)
(418, 10)


In [5]:
# Group-by encoding: passengers sharing a ticket number (Ticket) form a group,
# and each group is described by statistics of its members' Age.

# Missing ticket numbers become the literal string 'None' so those rows still
# belong to a group.
t003_train['Ticket'] = t003_train['Ticket'].fillna('None')
# Missing ages are replaced by the mean of the known ages.
t003_train['Age'] = t003_train['Age'].fillna(t003_train['Age'].mean())

# Compute mean / mode / median / max / min of Age per Ticket in ONE aggregation.
# The previous version built five separate frames and left-merged them; every
# frame carried a column called 'Age', so the third merge produced duplicate
# 'Age_x' / 'Age_y' labels. pandas 1.2+ deprecates that and pandas 2.x raises
# MergeError, so the chained merges are replaced by a single agg() call.
# (SQL analogy from the original note: this is what a window function /
# OVER (PARTITION BY ...) would give without temp tables.)
temp = (
    t003_train.groupby('Ticket')['Age']
    # mode() can return several values on ties; [0] keeps the first one.
    .agg(['mean', lambda ages: ages.mode()[0], 'median', 'max', 'min'])
    .reset_index()
)
# Column order matches the order of the statistics requested above.
temp.columns = ['Ticket', 'Age_Mean', 'Age_Mode', 'Age_Median', 'Age_Max', 'Age_Min']
print(temp.shape)
temp.head()

(681, 6)


Unnamed: 0,Ticket,Age_Mean,Age_Mode,Age_Median,Age_Max,Age_Min
0,110152,26.333333,16.0,30.0,33.0,16.0
1,110413,36.333333,18.0,39.0,52.0,18.0
2,110465,38.349559,29.699118,38.349559,47.0,29.699118
3,110564,28.0,28.0,28.0,28.0,28.0
4,110813,60.0,60.0,60.0,60.0,60.0


In [6]:
# Attach the per-ticket Age statistics to every passenger row; Ticket is the
# join key and the left join keeps all rows of t003_train.
t004_train = t003_train.merge(temp, how='left', on='Ticket')
print(t004_train.shape)

# Remove the raw Ticket column now that it has served as the join key.
# (The original author noted the reason was unclear at the time; a later cell
# keeps only int64/float64 columns in any case.)
t005_train = t004_train.drop(columns=['Ticket'])
print(t005_train.shape)
t005_train.head()

(891, 15)
(891, 14)


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Mean,Age_Mode,Age_Median,Age_Max,Age_Min
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,22.0,22.0,22.0,22.0,22.0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,38.0,38.0,38.0,38.0,38.0
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,26.0,26.0,26.0,26.0,26.0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,36.0,35.0,36.0,37.0,35.0
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,35.0,35.0,35.0,35.0,35.0


In [7]:
# Collect the names of the columns whose dtype is exactly int64 or float64
# into num_features, preserving the column order of t005_train.
num_features = [
    column
    for column, dtype in zip(t005_train.columns, t005_train.dtypes)
    if dtype == 'float64' or dtype == 'int64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

10 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_Mean', 'Age_Mode', 'Age_Median', 'Age_Max', 'Age_Min']



In [8]:
# Restrict the frame to the numeric columns listed in num_features.
t006_train = t005_train.loc[:, num_features]
# Replace every remaining missing value with the sentinel -1.
t007_train = t006_train.fillna(value=-1)
print(t007_train.shape)
t007_train.head()

(891, 10)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Age_Mean,Age_Mode,Age_Median,Age_Max,Age_Min
0,3,22.0,1,0,7.25,22.0,22.0,22.0,22.0,22.0
1,1,38.0,1,0,71.2833,38.0,38.0,38.0,38.0,38.0
2,3,26.0,0,0,7.925,26.0,26.0,26.0,26.0,26.0
3,1,35.0,1,0,53.1,36.0,35.0,36.0,37.0,35.0
4,3,35.0,0,0,8.05,35.0,35.0,35.0,35.0,35.0


# 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?
> 結果可以發現 : 不論是例題的線性迴歸或者梯度提升樹, 以及作業的邏輯斯迴歸  
聚類編碼都在正確率上有穩定提升, 這就是我們所說的:均值編碼容易overfitting/聚類編碼不容易overfitting的效果  
不過助教這邊的數值型特徵與類別型特徵, 是有特別用特徵重要性挑選過的, 因此同學自行挑選的特徵可能未必提升  
至於特徵重要性如何使用, 請同學參考 Day29 內容

In [9]:
# Baseline frame: the numeric columns WITHOUT the five Age_* group-encoding
# columns added earlier.
encoded_columns = ['Age_Mean', 'Age_Mode', 'Age_Median', 'Age_Max', 'Age_Min']
t008_train = t007_train.drop(columns=encoded_columns)

# Original features only: min-max scale them, then run 5-fold cross-validation
# with each of the three models and report the mean score.
# NOTE(review): cross_val_score is called without `scoring`, so each estimator's
# own default score is used -- presumably accuracy for the classifier and R^2
# for the two regressors, which would make the three numbers not directly
# comparable; confirm against the scikit-learn docs.
train_X = MME.fit_transform(t008_train)
print(train_X.shape)
logistic_cv = cross_val_score(LR, train_X, train_Y, cv=5)
print(f'Logistic Reg Score : {logistic_cv.mean()}')
linear_cv = cross_val_score(LIR, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')
boosting_cv = cross_val_score(GBR, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {boosting_cv.mean()}')

(891, 5)
Logistic Reg Score : 0.6982266036406296
Linear Reg Score : 0.1379185137403691
Gradient Boosting Reg Score : 0.16813348870869646


In [10]:
# With the new features: the same evaluation on t007_train, which still holds
# the Age_* group-encoding columns next to the original numeric columns.
# Features are min-max scaled, then each model is scored with 5-fold
# cross-validation and the mean score is printed.
train_X = MME.fit_transform(t007_train)
print(train_X.shape)
logistic_cv = cross_val_score(LR, train_X, train_Y, cv=5)
print(f'Logistic Reg Score : {logistic_cv.mean()}')
linear_cv = cross_val_score(LIR, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')
boosting_cv = cross_val_score(GBR, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {boosting_cv.mean()}')

(891, 10)
Logistic Reg Score : 0.7049239534759185
Linear Reg Score : 0.1577189512589498
Gradient Boosting Reg Score : 0.18967315323579184


### Day27教材方向和目標
延續昨天教材，將所學套用到其他資料上

### Day27忽略部分
無

### Day27其他補充
教材速度有放緩，也解釋得比較清楚，還不錯。