<a href="https://colab.research.google.com/github/btg1631/study_AIs/blob/main/docs/quests/MLs/SpineSurgeryList_FeatureEngin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
df_SSL = pd.read_csv('SpineSurgeryList.csv')
df_SSL[:2]

Unnamed: 0.1,Unnamed: 0,환자ID,Large Lymphocyte,Location of herniation,ODI,가족력,간질성폐질환,고혈압여부,과거수술횟수,당뇨여부,...,Modic change,PI,PT,Seg Angle(raw),Vaccum disc,골밀도,디스크단면적,디스크위치,척추이동척도,척추전방위증
0,0,1PT,22.8,3,51.0,0.0,0,0,0,0,...,3,51.6,36.6,14.4,0,-1.01,2048.5,4,Down,0
1,1,2PT,44.9,4,26.0,0.0,0,0,0,0,...,0,40.8,7.2,17.8,0,-1.14,1753.1,4,Up,0


In [2]:
df_SSL.columns

Index(['Unnamed: 0', '환자ID', 'Large Lymphocyte', 'Location of herniation',
       'ODI', '가족력', '간질성폐질환', '고혈압여부', '과거수술횟수', '당뇨여부', '말초동맥질환여부', '빈혈여부',
       '성별', '스테로이드치료', '신부전여부', '신장', '심혈관질환', '암발병여부', '연령', '우울증여부', '입원기간',
       '입원일자', '종양진행여부', '직업', '체중', '퇴원일자', '헤모글로빈수치', '혈전합병증여부', '환자통증정도',
       '흡연여부', '통증기간(월)', '수술기법', '수술시간', '수술실패여부', '수술일자', '재발여부', '혈액형',
       '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'MF + ES',
       'Modic change', 'PI', 'PT', 'Seg Angle(raw)', 'Vaccum disc', '골밀도',
       '디스크단면적', '디스크위치', '척추이동척도', '척추전방위증'],
      dtype='object')

## 데이터
- 목표변수(target) : 재발여부
- 설명변수(features) : 수치형 5개, 범주형 2개
- 수치형 : 신장, 체중, 전방디스크높이(mm), 후방디스크높이(mm), 지방축적도
- 범주형 : Instability, Vaccum disc

In [3]:
df_SSL['재발여부'].unique()

array([0, 1])

In [4]:
df_SSL[['재발여부', '연령', '체중', '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'Vaccum disc']][:2]

Unnamed: 0,재발여부,연령,체중,전방디스크높이(mm),후방디스크높이(mm),지방축적도,Instability,Vaccum disc
0,0,66,60.3,16.1,12.3,282.3,0,0
1,0,47,71.7,13.7,6.4,177.3,0,0


In [5]:
df_SSL_extract = df_SSL.loc[:, ['재발여부', '연령', '체중', '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'Vaccum disc']].dropna()
df_SSL_extract.isnull().sum()

재발여부           0
연령             0
체중             0
전방디스크높이(mm)    0
후방디스크높이(mm)    0
지방축적도          0
Instability    0
Vaccum disc    0
dtype: int64

In [6]:
# 이상치 처리


## 특성공학(Feature Engineering)
- 수치 평준화 : 수치형과 범주형 각각 적용
- 개수 균형화 : target 수량

### Scaling 수치형에 적용
- Standard Scaling : 평균 0, 표준편차 1
- Min-Max Scaling : 0 ~ 1 사이
- Robust Scaling : 이상치가 많은 데이터 셋(중앙값 기준)

In [7]:
from sklearn.preprocessing import MinMaxScaler

def apply_min_max_scaler(data, feature):
    scaler = MinMaxScaler()
    scaler.fit(data[feature].values.reshape(-1,1))
    data[feature+'_scaled'] = scaler.transform(data[feature].values.reshape(-1,1))
    return data

In [8]:
apply_min_max_scaler(df_SSL_extract, '연령')
apply_min_max_scaler(df_SSL_extract, '체중')
apply_min_max_scaler(df_SSL_extract, '전방디스크높이(mm)')
apply_min_max_scaler(df_SSL_extract, '후방디스크높이(mm)')
apply_min_max_scaler(df_SSL_extract, '지방축적도')

Unnamed: 0,재발여부,연령,체중,전방디스크높이(mm),후방디스크높이(mm),지방축적도,Instability,Vaccum disc,연령_scaled,체중_scaled,전방디스크높이(mm)_scaled,후방디스크높이(mm)_scaled,지방축적도_scaled
0,0,66,60.3,16.1,12.3,282.3,0,0,0.718310,0.250247,0.698540,0.113,0.021206
1,0,47,71.7,13.7,6.4,177.3,0,0,0.450704,0.363007,0.585492,0.054,0.012520
2,0,39,77.1,13.6,7.4,256.8,0,0,0.338028,0.416419,0.580782,0.064,0.019097
3,0,40,74.2,10.6,7.3,250.1,0,0,0.352113,0.387735,0.439472,0.063,0.018542
4,0,42,80.7,17.1,8.1,232.2,0,0,0.380282,0.452028,0.745643,0.071,0.017061
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,0,59,64.0,17.0,10.7,237.5,0,0,0.619718,0.286845,0.740933,0.097,0.017500
1890,0,42,59.0,9.4,8.2,288.0,0,0,0.380282,0.237389,0.382949,0.072,0.021678
1891,0,61,70.0,13.5,5.5,148.5,0,1,0.647887,0.346192,0.576072,0.045,0.010137
1892,0,29,77.0,14.0,10.0,89.0,0,0,0.197183,0.415430,0.599623,0.090,0.005214


### Onehot encoding은 범주형에 적용
- Instability, Vaccum disc

In [9]:
df_SSL_extract['Instability'].unique()  # array([0, 1])
df_SSL_extract['Vaccum disc'].unique()  # array([0, 1])

array([0, 1])

In [10]:
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder_Instability = OneHotEncoder()
oneHotEncoder_Vaccum = OneHotEncoder()
oneHotEncoder_Instability.fit(df_SSL_extract[['Instability']])
oneHotEncoder_Vaccum.fit(df_SSL_extract[['Vaccum disc']])

In [11]:
oneHotEncoder_Instability.get_feature_names_out()

array(['Instability_0', 'Instability_1'], dtype=object)

In [12]:
oneHotEncoder_Vaccum.get_feature_names_out()

array(['Vaccum disc_0', 'Vaccum disc_1'], dtype=object)

In [13]:
encoder_Instability = oneHotEncoder_Instability.transform(df_SSL_extract[['Instability']]).toarray()
encoder_Instability.shape

(1891, 2)

In [14]:
encoder_Vaccum = oneHotEncoder_Vaccum.transform(df_SSL_extract[['Vaccum disc']]).toarray()
encoder_Vaccum.shape

(1891, 2)

In [15]:
# dataframe 전환
df_encoder_Instability = pd.DataFrame(data=encoder_Instability, columns=oneHotEncoder_Instability.get_feature_names_out())
df_encoder_Instability[:2]

Unnamed: 0,Instability_0,Instability_1
0,1.0,0.0
1,1.0,0.0


In [16]:
# dataframe 전환
df_encoder_Vaccum = pd.DataFrame(data=encoder_Vaccum, columns=oneHotEncoder_Vaccum.get_feature_names_out())
df_encoder_Vaccum[:2]

Unnamed: 0,Vaccum disc_0,Vaccum disc_1
0,1.0,0.0
1,1.0,0.0


In [28]:
# 행으로 concat
df_SSL_extract = pd.concat((df_SSL_extract.reset_index(drop=True), df_encoder_Instability.reset_index(drop=True)), axis=1)
df_SSL_extract = pd.concat((df_SSL_extract.reset_index(drop=True), df_encoder_Vaccum.reset_index(drop=True)), axis=1)
df_SSL_extract[:2]

Unnamed: 0,재발여부,연령,체중,전방디스크높이(mm),후방디스크높이(mm),지방축적도,Instability,Vaccum disc,연령_scaled,체중_scaled,...,Vaccum disc_0,Vaccum disc_1,Instability_0,Instability_1,Vaccum disc_0.1,Vaccum disc_1.1,Instability_0.1,Instability_1.1,Vaccum disc_0.2,Vaccum disc_1.2
0,0,66,60.3,16.1,12.3,282.3,0,0,0.71831,0.250247,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0,47,71.7,13.7,6.4,177.3,0,0,0.450704,0.363007,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


## 정형화 단계 - target과 feature 분리

In [18]:
target_train = df_SSL_extract['재발여부']  # 목표변수
features_train = df_SSL_extract.drop(columns=['재발여부', '연령', '체중', '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'Vaccum disc'])  # 설명변수
target_train.shape, features_train.shape

((1891,), (1891, 9))

## 모델

In [19]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model

In [20]:
model.fit(features_train, target_train)

## 평가

In [21]:
df_SSL_extract[25:30]

Unnamed: 0,재발여부,연령,체중,전방디스크높이(mm),후방디스크높이(mm),지방축적도,Instability,Vaccum disc,연령_scaled,체중_scaled,전방디스크높이(mm)_scaled,후방디스크높이(mm)_scaled,지방축적도_scaled,Instability_0,Instability_1,Vaccum disc_0,Vaccum disc_1
25,0,52,83.2,12.4,7.1,122.0,0,0,0.521127,0.476756,0.524258,0.061,0.007945,1.0,0.0,1.0,0.0
26,0,63,82.0,13.2,8.3,161.6,0,0,0.676056,0.464886,0.561941,0.073,0.011221,1.0,0.0,1.0,0.0
27,0,37,78.0,11.3,7.4,163.8,1,0,0.309859,0.425321,0.472445,0.064,0.011403,0.0,1.0,1.0,0.0
28,1,64,77.8,8.5,7.5,111.5,0,0,0.690141,0.423343,0.340556,0.065,0.007076,1.0,0.0,1.0,0.0
29,0,58,68.2,8.5,7.5,111.5,0,0,0.605634,0.328388,0.340556,0.065,0.007076,1.0,0.0,1.0,0.0


In [22]:
model.predict(features_train[25:30])
# 실제값 : 0, 0, 0, 1, 0 / 결과값 : 0, 0, 0, 1, 0

array([0, 0, 0, 1, 0])

In [23]:
model.predict_proba(features_train[25:30])   # 확률값

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

### 평가 수치

In [24]:
# 정확도
from sklearn.metrics import accuracy_score
target_train_predict = model.predict(features_train)
target_train_predict.shape

(1891,)

In [25]:
accuracy_score(target_train, target_train_predict)

0.9989423585404548

### F1 score
- 정밀도, 재현율 : F1

In [26]:
from sklearn.metrics import classification_report
print(classification_report(target_train, target_train_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1664
           1       1.00      0.99      1.00       227

    accuracy                           1.00      1891
   macro avg       1.00      1.00      1.00      1891
weighted avg       1.00      1.00      1.00      1891

