### 파킨슨병 데이터
- 환자들의 뇌를 촬영한 사진의 상태를 기록한 자료에 각 환자의 상태 status(1: 파킨슨병 진단, 0: 파킨슨병 아님)를 추가한 테이블
1. 파킨슨병을 예측하는 모델을 로지스틱 회귀모형으로 생성
2. 파킨슨병을 예측하는 데 영향을 미치는 변수를 중요한 순서대로 3개 선정
3. 파킨슨병을 진단하는 기준(threshold, cutoff)을 0.5로 했을 때와 0.8로 했을 때의 F1-스코어를 비교
    - 분석 조건
        - 필요 없는 컬럼 name을 삭제
        - 데이터의 정규화는 min-max 스케일러 사용
        - 로지스틱 회귀를 위한 상수항 추가
        - status는 카테고리 타입으로 변환
        - 트레이닝셋과 테스트셋 비율은 9:1
        - 모델은 로지스틱 회귀분석 사용
        - 모델의 최적화 방법론은 "bfgs" 사용

In [52]:
import pandas as pd
import numpy as np

In [53]:
## Load the Parkinson's dataset from a path relative to the working
## directory, then display it as the cell output.
df = pd.read_csv("./data/parkinsons.csv")
df

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,...,0.08270,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.10470,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,phon_R01_S50_2,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,...,0.07008,0.02764,19.517,0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,...,0.04812,0.01810,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,phon_R01_S50_4,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,phon_R01_S50_5,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,...,0.03794,0.07223,19.020,0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306


In [54]:
## Remove the identifier column 'name' in place; it is not a predictor.
df.drop(columns=['name'], inplace=True)

In [55]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import f1_score

In [56]:
## Normalize every column to the [0, 1] range with min-max scaling.
## minmax_scale returns a plain array, so the original column names are
## re-attached while wrapping the result back into a DataFrame.
## NOTE(review): the scaling uses all rows before the train/test split, so
## test-set minima/maxima influence the training features.
df_s = minmax_scale(df)
df1 = pd.DataFrame(df_s, columns=df.columns)
df1

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,0.184308,0.112592,0.054815,0.195680,0.249012,0.145472,0.247588,0.145288,0.312215,0.280197,...,0.332584,0.068307,0.511745,1.0,0.369155,0.960148,0.569875,0.585765,0.390661,0.497310
1,0.198327,0.094930,0.278323,0.254130,0.288538,0.191233,0.323687,0.191042,0.472887,0.444536,...,0.516048,0.059331,0.432577,1.0,0.470830,0.977024,0.703277,0.741337,0.473145,0.671326
2,0.165039,0.059128,0.265288,0.280178,0.328063,0.229287,0.369239,0.229411,0.390634,0.326212,...,0.443317,0.039596,0.496220,1.0,0.404416,1.000000,0.636745,0.686371,0.408819,0.596682
3,0.165004,0.072927,0.264200,0.263342,0.328063,0.209056,0.324759,0.208862,0.414278,0.354971,...,0.475478,0.040997,0.495936,1.0,0.416255,0.975885,0.695627,0.738089,0.436977,0.671949
4,0.161150,0.080909,0.260107,0.354511,0.407115,0.282755,0.437299,0.282870,0.499452,0.410025,...,0.584542,0.054174,0.455499,1.0,0.375159,0.992813,0.762472,0.513798,0.404336,0.757611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.499820,0.262986,0.165722,0.092440,0.090909,0.093931,0.089496,0.094076,0.286014,0.262942,...,0.362306,0.085909,0.450134,0.0,0.447684,0.333127,0.257894,0.260408,0.549049,0.183318
191,0.705488,0.307974,0.138243,0.125794,0.090909,0.126686,0.107181,0.126826,0.164050,0.146261,...,0.221338,0.055543,0.435097,0.0,0.408567,0.434101,0.319956,0.276956,0.605474,0.257558
192,0.502730,0.281413,0.050727,0.378653,0.288538,0.267823,0.252947,0.267940,0.123608,0.140509,...,0.156631,0.338988,0.383728,0.0,0.352318,0.324299,0.212945,0.342577,0.558967,0.180580
193,0.642893,0.601807,0.054279,0.181703,0.130435,0.145472,0.159700,0.145288,0.122512,0.128184,...,0.155989,0.227838,0.429936,0.0,0.454176,0.277579,0.220650,0.452885,0.318222,0.163137


In [57]:
## Add the constant (intercept) term.
## Uses add_constant() from statsmodels.api; the result gains a 'const'
## column of 1.0 values, as shown in the output below.
df1 = sm.add_constant(df1, has_constant='add')
df1.head()

Unnamed: 0,const,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,1.0,0.184308,0.112592,0.054815,0.19568,0.249012,0.145472,0.247588,0.145288,0.312215,...,0.332584,0.068307,0.511745,1.0,0.369155,0.960148,0.569875,0.585765,0.390661,0.49731
1,1.0,0.198327,0.09493,0.278323,0.25413,0.288538,0.191233,0.323687,0.191042,0.472887,...,0.516048,0.059331,0.432577,1.0,0.47083,0.977024,0.703277,0.741337,0.473145,0.671326
2,1.0,0.165039,0.059128,0.265288,0.280178,0.328063,0.229287,0.369239,0.229411,0.390634,...,0.443317,0.039596,0.49622,1.0,0.404416,1.0,0.636745,0.686371,0.408819,0.596682
3,1.0,0.165004,0.072927,0.2642,0.263342,0.328063,0.209056,0.324759,0.208862,0.414278,...,0.475478,0.040997,0.495936,1.0,0.416255,0.975885,0.695627,0.738089,0.436977,0.671949
4,1.0,0.16115,0.080909,0.260107,0.354511,0.407115,0.282755,0.437299,0.28287,0.499452,...,0.584542,0.054174,0.455499,1.0,0.375159,0.992813,0.762472,0.513798,0.404336,0.757611


In [58]:
## Separate the dependent variable from the independent variables and
## convert the dependent variable to a categorical type.
## Index.difference yields the remaining labels in sorted order, which is
## the column order the fitted model reports.
feature_columns = df1.columns.difference(['status']).tolist()

x = df1.loc[:, feature_columns]
y = df1['status'].astype('category')

In [59]:
## 9:1 train/test split, stratified on the target so both parts keep the
## same class ratio; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=2023,
)
## Print the four shapes on one line, separated by spaces.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(175, 23) (20, 23) (175,) (20,)


In [60]:
## Build the logistic regression model from statsmodels.api
## (y_train is the response, x_train the design matrix incl. 'const').
model = sm.Logit(y_train, x_train)

In [61]:
## Fit the model with the BFGS optimizer, allowing up to 1000 iterations.
result = model.fit(method='bfgs', maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.220513
         Iterations: 287
         Function evaluations: 288
         Gradient evaluations: 288


In [62]:
## Inspect which variables influence the prediction of the dependent
## variable (coefficients, standard errors and p-values per feature).
result.summary()

0,1,2,3
Dep. Variable:,status,No. Observations:,175.0
Model:,Logit,Df Residuals:,152.0
Method:,MLE,Df Model:,22.0
Date:,"Fri, 27 Jan 2023",Pseudo R-squ.:,0.6045
Time:,15:47:38,Log-Likelihood:,-38.59
converged:,True,LL-Null:,-97.576
Covariance Type:,nonrobust,LLR p-value:,4.067e-15

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
D2,-0.7828,3.823,-0.205,0.838,-8.277,6.711
DFA,1.8955,2.571,0.737,0.461,-3.143,6.934
HNR,-0.0579,5.961,-0.010,0.992,-11.742,11.626
Jitter:DDP,39.4157,3114.702,0.013,0.990,-6065.288,6144.120
MDVP:APQ,36.9851,51.949,0.712,0.476,-64.834,138.804
MDVP:Fhi(Hz),-0.5726,2.231,-0.257,0.797,-4.945,3.800
MDVP:Flo(Hz),-0.3627,2.190,-0.166,0.868,-4.655,3.930
MDVP:Fo(Hz),-1.6751,3.702,-0.452,0.651,-8.931,5.581
MDVP:Jitter(%),-44.0046,39.914,-1.102,0.270,-122.234,34.225


- result.summary()의 P>|z|(p-value)가 작은 순서를 기준으로, 종속 변수에 영향을 미치는 상위 변수 3개는 spread2, PPE, MDVP:Shimmer(dB)로 선정
- p-value가 작을수록 해당 변수가 종속 변수 예측에 통계적으로 유의한 영향을 미친다고 해석한다.

In [63]:
## Compare model performance at different classification thresholds.
## Helper that turns predicted probabilities into hard 0/1 labels.
def cut_off(y, thresbold):
    """Binarize scores against a threshold without modifying the input.

    Args:
        y: Numeric scores in a container that supports ``.copy()``,
            boolean-mask assignment and ``.astype`` (for example a pandas
            Series or a NumPy array).
        thresbold: Cutoff value. Scores strictly greater than it become 1;
            scores less than or equal to it become 0. (The parameter name
            is kept as-is for backward compatibility.)

    Returns:
        A copy of ``y`` with the replacements applied, cast to ``int``.
        Values for which both comparisons are false (e.g. NaN) are left
        untouched before the cast.
    """
    Y = y.copy()
    # Evaluate both masks against the untouched scores first. Assigning 1
    # and only then testing ``Y <= thresbold`` would flip those fresh 1s
    # back to 0 whenever thresbold >= 1.
    above = Y > thresbold
    at_or_below = Y <= thresbold
    Y[above] = 1
    Y[at_or_below] = 0
    return Y.astype(int)

In [64]:
## Predict on the test set, then binarize the predictions at each cutoff.
y_test_pred_prob = result.predict(x_test)
pred = cut_off(y_test_pred_prob, 0.8)  # labels at cutoff 0.8
pred2 = cut_off(y_test_pred_prob, 0.5)  # labels at cutoff 0.5

In [65]:
## F1-score on the test set for the labels produced at cutoff 0.8.
f1_score(y_test, pred)

0.7200000000000001

In [66]:
## F1-score on the test set for the labels produced at cutoff 0.5.
f1_score(y_test, pred2)

0.896551724137931

- cut_off 0.5인 경우 F1-스코어는 0.8966
- cut_off 0.8인 경우 F1-스코어는 0.72로, 0.5일 때의 F1-스코어가 더 높다.