In [2]:
import pickle 
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

with open("titanic_step3_feature_engineering.pickle", "rb") as pickle_file:
    onehot_df = pd.read_pickle(pickle_file)

with open("titanic_step3_feature_engineering_y.pickle", "rb") as pickle_y_file:
    y_train = pd.read_pickle(pickle_y_file)


ntrain = 891
X_train, X_test = onehot_df[:ntrain], onehot_df[ntrain:]
X_train.head()



Unnamed: 0,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1,Age_0,Age_1,Age_2,Age_3,Age_4,...,HighSurvival_0,HighSurvival_1,HighSurvival_2,HighSurvival_3,HighSurvival_4,HighSurvival_5,HighSurvival_6,LowSurvival_0,LowSurvival_1,LowSurvival_2
0,False,False,True,True,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
1,True,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,False
2,False,False,True,False,True,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
3,True,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,False
4,False,False,True,True,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,True


#### feature engineering의 과정
- EDA: 데이터의 특징 파악
- 결측치 제거 
- 범주형 데이터 숫치형 데이터로 변환
- 연속형(수치형) 데이터를 경향성이 보이는 범위의 범주형 숫자 데이터로 변환
- 경향성이 강한 데이터 생성
- onehot encoding / label encoding 등등

#### 현재 feature engineering의 문제 
- one-hot encoding 과정을 통해 feature의 수가 너무 많아짐
- 계산 속도 및 성능에 영향을 줄 가능성
- 따라서 결과에 유의미한 영향을 주는 feature들의 중요도를 파악하여 해당 feature를 모델에 적용

In [3]:
!pip install lightgbm
!pip install xgboost

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 16.8 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   -- ------------------------------------- 3.9/56.8 MB 19.8 MB/s eta 0:00:03
   ---- ----------------------------------- 6.6/56.8 MB 17.2 MB/s eta 0:00:03
   ------- -------------------------------- 10.2/56.8 MB 15.9 MB/s eta 0:00:03
   --------- ------------------------------ 13.9/56.8 MB 16.4 MB/s eta 0:00:03
   ------------ --------------------------- 17.3/56.8 MB 16.3 MB/s eta 0:00:03
   -------------- 

In [4]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

### Machine Learning Model 대표 메서드 

1. fit() 
- 모델을 데이터로 학습
- 모델의 파라미터 및 예측에 사용될 최종 모델 생성
- feature importance 및 학습 모델의 속성 파악 가능 

2. predict()
- 학습된 모델을 기반으로 테스트 데이터로 예측을 할 때 사용 
- 모델의 성능 평가를 위한 예측값을 위해 사용

실제 ML 파이프라인
1. 탐색: cross_val_score() 를 사용하여 모델 및 파라미터 평가
2. 구현: 최종 선택한 모델로 fit()을 사용하여 학습
3. 예측: 학습 모델을 predict()를 사용하여 새로운 데이터에 투입
4. 평가: score() 등의 함수로 성능 확인


In [7]:
logreg = LogisticRegression()
svc = SVC()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
extra_tree = ExtraTreesClassifier()
gbm = GradientBoostingClassifier()
nb = GaussianNB()
xgb = XGBClassifier(eval_metric="logloss")
lgbm = LGBMClassifier()

result = {}


In [10]:
alg = logreg
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = svc
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = decision_tree
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = random_forest
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = extra_tree
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = gbm
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy


alg = nb
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

alg = xgb
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy


alg = lgbm
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result[alg.__class__.__name__] = accuracy

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [11]:
result

{'LogisticRegression': 0.8552188552188552,
 'SVC': 0.8664421997755332,
 'DecisionTreeClassifier': 0.9461279461279462,
 'RandomForestClassifier': 0.9461279461279462,
 'ExtraTreesClassifier': 0.9461279461279462,
 'GradientBoostingClassifier': 0.8866442199775533,
 'GaussianNB': 0.6992143658810326,
 'XGBClassifier': 0.9337822671156004,
 'LGBMClassifier': 0.9270482603815937}

In [13]:
sorted(result.items(), key=lambda x: x[1], reverse=True)

[('DecisionTreeClassifier', 0.9461279461279462),
 ('RandomForestClassifier', 0.9461279461279462),
 ('ExtraTreesClassifier', 0.9461279461279462),
 ('XGBClassifier', 0.9337822671156004),
 ('LGBMClassifier', 0.9270482603815937),
 ('GradientBoostingClassifier', 0.8866442199775533),
 ('SVC', 0.8664421997755332),
 ('LogisticRegression', 0.8552188552188552),
 ('GaussianNB', 0.6992143658810326)]

#### 트리 관련 모델은 feature importance를 측정
- 트리 결정 과정에서 각 feature의 중요성을 수치화하고, feature_importances_에 저장
- 해당 값을 기준으로 내림차순으로 정리하면 중요성이 낮은 feature를 필터링 할 수 있음

In [14]:
tree_models = [
    decision_tree,
    random_forest,
    extra_tree,
    xgb,
    gbm
]

In [15]:
for model in tree_models:
    try:
        print(model.__class__.__name__)
        print(model.feature_importances_)
    except:
        print(alg.__class__.__name__, "X")
              

DecisionTreeClassifier
[3.80974578e-03 8.94020343e-05 8.43866645e-02 0.00000000e+00
 4.99110926e-03 0.00000000e+00 9.38721361e-04 1.69653691e-02
 2.46322544e-02 1.40196029e-02 1.58376115e-02 5.43426827e-03
 0.00000000e+00 3.76452019e-02 1.02492640e-02 6.22108008e-03
 4.82770985e-03 0.00000000e+00 3.50892756e-03 4.08859836e-02
 5.08139489e-03 2.48761161e-03 0.00000000e+00 5.28949080e-03
 7.81984627e-03 5.63232816e-03 0.00000000e+00 0.00000000e+00
 1.70665854e-02 5.97574156e-03 8.07074112e-04 3.57987997e-01
 1.48021537e-02 5.98221522e-03 3.94262971e-03 0.00000000e+00
 0.00000000e+00 9.86559426e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 5.01278843e-03 0.00000000e+00
 0.00000000e+00 7.20591350e-03 2.19701109e-02 2.01840953e-02
 1.20361191e-02 0.00000000e+00 5.06909535e-03 3.26647824e-03
 0.00000000e+00 0.00000000e+00 4.69651993e-03 1.59221585e-03
 0.00000000e+00 6.59685256e-03 8.36544309e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.84279113e-03
 

In [17]:
decision_tree_importance = pd.DataFrame({"Feature": X_train.columns, "decision_tree": decision_tree.feature_importances_})
random_forest_importance = pd.DataFrame({"Feature": X_train.columns, "random_forest": random_forest.feature_importances_})
extra_tree_importance = pd.DataFrame({"Feature": X_train.columns, "extra_tree": extra_tree.feature_importances_})
xgb_importance = pd.DataFrame({"Feature": X_train.columns, "xgb": xgb.feature_importances_})
gbm_importance = pd.DataFrame({"Feature": X_train.columns, "gbm": gbm.feature_importances_})

In [18]:
from functools import reduce

dfs = [
    decision_tree_importance,
    random_forest_importance,
    extra_tree_importance,
    xgb_importance,
    gbm_importance
]

importances = reduce(lambda left, right: pd.merge(left, right, on='Feature'), dfs)

In [20]:
importances.head()

Unnamed: 0,Feature,decision_tree,random_forest,extra_tree,xgb,gbm
0,Pclass_0,0.00381,0.017198,0.019164,0.008982,0.014844
1,Pclass_1,8.9e-05,0.012318,0.01223,0.004788,0.003293
2,Pclass_2,0.084387,0.030327,0.028387,0.059855,0.106215
3,Sex_0,0.0,0.073598,0.078742,0.016426,0.005721
4,Sex_1,0.004991,0.062893,0.067258,0.0,0.093971


#### 각 feature 별 평균 중요도 도출 

In [22]:
columns = importances.select_dtypes(include=[np.number])

importances["avg_importance"] = columns.mean(axis=1)

importances = importances[['Feature', 'avg_importance']]
importances = importances.sort_values(['avg_importance'], ascending=False)
importances.head()

Unnamed: 0,Feature,avg_importance
31,Initial_0,0.246801
2,Pclass_2,0.061834
4,Sex_1,0.045823
84,HighSurvival_0,0.039118
3,Sex_0,0.034897


#### 상위 50개의 feature로만 학습 진행

In [23]:
importances = importances[:50]


In [24]:
train_importances = X_train[importances["Feature"].tolist()]
test_importances = X_test[importances["Feature"].tolist()]

In [25]:
train_importances.head()

Unnamed: 0,Initial_0,Pclass_2,Sex_1,HighSurvival_0,Sex_0,Cabin_0,Fare_0,LowSurvival_0,Age_3,Ticket_Num_Cut_1,...,Embarked_1,Ticket_Num_Cut_6,Family_4,Ticket_Num_Cut_9,Ticket_Initial2_15,Cabin_4,Family_3,Pclass_1,Cabin_1,Ticket_Initial2_9
0,True,True,False,True,True,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,True,False
2,False,True,True,True,False,True,True,True,True,False,...,False,False,False,True,False,False,False,False,False,False
3,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
4,True,True,False,True,True,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [27]:
result2 = {}

In [28]:
alg = logreg
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = svc
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = decision_tree
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = random_forest
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = extra_tree
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = gbm
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy


alg = nb
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

alg = xgb
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy


alg = lgbm
alg.fit(X_train, y_train)
accuracy = alg.score(X_train, y_train)
result2[alg.__class__.__name__] = accuracy

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [30]:
sorted(result2.items(), key=lambda x: x[1], reverse=True)

[('DecisionTreeClassifier', 0.9461279461279462),
 ('ExtraTreesClassifier', 0.9461279461279462),
 ('RandomForestClassifier', 0.9450056116722784),
 ('XGBClassifier', 0.9337822671156004),
 ('LGBMClassifier', 0.9270482603815937),
 ('GradientBoostingClassifier', 0.8866442199775533),
 ('SVC', 0.8664421997755332),
 ('LogisticRegression', 0.8552188552188552),
 ('GaussianNB', 0.6992143658810326)]

In [None]:
import pickle

with open("titanc")