### 파킨슨병 데이터
- 환자들의 뇌를 촬영한 사진의 상태를 기록한 자료에 각 환자의 상태 status(1: 파킨슨병 진단, 0: 파킨슨병 아님)로 추가한 테이블
- (data/parkinsons.csv)
1. 파킨슨 병을 예측하는 모델로 로지스틱 회귀모형을 적용하여 생성
2. 파킨슨병을 예측하는데 영향을 미치는 변수를 중요한 순서대로 3개 선정
3. 파킨슨 병을 진단하는 기준를 함수로 생성하여(매개변수명 = threshold, 함수명 = cutoff)을 0.5로 했을 때와 0.8로 했을 때 F1-스코어를 비교
    - 분석 조건
        - 필요 없는 컬럼 name을 삭제
        - 데이터의 정규화는 min-max 스케일러 사용
        - 로지스틱 회귀를 위한 상수항 추가
        - status는 카테고리 타입으로 변환
        - 트레이닝셋과 테스트셋 비율은 9:1
        - 모델은 로지스틱 회귀분석 사용
        - 모델의 최적화 방법론은 "bfgs" 사용

In [2]:
import pandas as pd
import numpy as np

In [33]:
df = pd.read_csv('./csv_assignment/parkinsons.csv')

In [34]:
# EDA-1
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [35]:
# EDA-2
df.head(2)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674


In [36]:
# EDA-3
df['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [37]:
# Preprocessing

# pre - 1. 'name' column 삭제 
df.drop('name', axis=1, inplace=True)
 

In [39]:
# pre - 2. Status 컬럼 dtype 변경
df['status'] = df['status'].astype('category')

In [41]:
df.info() # 'status' dtype 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   const             195 non-null    float64 
 1   MDVP:Fo(Hz)       195 non-null    float64 
 2   MDVP:Fhi(Hz)      195 non-null    float64 
 3   MDVP:Flo(Hz)      195 non-null    float64 
 4   MDVP:Jitter(%)    195 non-null    float64 
 5   MDVP:Jitter(Abs)  195 non-null    float64 
 6   MDVP:RAP          195 non-null    float64 
 7   MDVP:PPQ          195 non-null    float64 
 8   Jitter:DDP        195 non-null    float64 
 9   MDVP:Shimmer      195 non-null    float64 
 10  MDVP:Shimmer(dB)  195 non-null    float64 
 11  Shimmer:APQ3      195 non-null    float64 
 12  Shimmer:APQ5      195 non-null    float64 
 13  MDVP:APQ          195 non-null    float64 
 14  Shimmer:DDA       195 non-null    float64 
 15  NHR               195 non-null    float64 
 16  HNR               195 non-

> LogisticRegression 시작
 

In [40]:
# 1. 상수항 추가
import statsmodels.api as sm

df = sm.add_constant(df,has_constant = "add")
df.head(1)

Unnamed: 0,const,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,1.0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654


In [43]:
# 2. 데이터 split
from sklearn.model_selection import train_test_split

# x, y 분리
feature_cols = list(df.columns.difference(['status']))

x = df[feature_cols]
y = df['status']

x_train, x_test, y_train, y_test = train_test_split(x,y, stratify = y ,train_size=0.9, random_state= 1)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)


(175, 23) (20, 23) (175,) (20,)


In [44]:
# 3. minMax scaler

from sklearn.preprocessing import MinMaxScaler
Mmscaler = MinMaxScaler()

x_train_mm = Mmscaler.fit_transform(x_train)
x_test_mm = Mmscaler.fit_transform(x_test)


In [50]:
# 4. feature 선정 
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

    # 학습
log_rg = lr.fit(x_train_mm, y_train)
log_rg



In [51]:
    # 예측
lr_preds = lr.predict(x_test_mm)
lr_preds


array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1],
      dtype=int64)

In [52]:
from sklearn.model_selection import cross_val_score

cross_val_score(log_rg, x_train_mm, y_train, cv=5)
# lr.score(x_test_mm, y_test)

0.75

In [46]:
# 4-1. 중요도 상위 3개 col 출력
feature_importance_lr = pd.DataFrame(zip(x.columns.values, lr.coef_.ravel()))
feature_importance_lr.columns = ['feature', 'coef']
feature_importance_lr.sort_values("coef", ascending=False, inplace=True)

feature_importance_lr.head(3)

Unnamed: 0,feature,coef
21,spread1,1.816588
15,PPE,1.537989
0,D2,1.443381


In [47]:
# 5. cutoff함수 생성

def cutoff(y, threshold) :
    Y = y.copy()
    Y[Y>threshold] = 1
    Y[Y<=threshold] = 0
    return Y.astype(int)



In [None]:
pred_y

In [54]:
# 5-1. f1 score 비교
from sklearn.metrics import f1_score

pred_Y_5 = cutoff(lr_preds, 0.5)
pred_Y_8 = cutoff(lr_preds, 0.8)

print(f'threshold = 0.5 일때 : {f1_score(y_test, pred_Y_5):.10f}')
print(f'threshold = 0.8 일때 : {f1_score(y_test, pred_Y_8):.10f}')

threshold = 0.5 일때 : 0.8387096774
threshold = 0.8 일때 : 0.8387096774


### 파킨슨 RE - Try

1. preprocessing 
    - drop 'name'
    - 'status' dtype -> 'category'
    - 상수항 추가

2. Scaling -> X 
    - split -> 9 : 1
    - minMax

3. Logistic Regression
    - fit -> transform 
    - Feature 선정 by 'cross_val_score'
    
4. Logistic Regression w/ Top3 Features
    - split -> 9 : 1
    - fit & trasform 
    - predict
    
5. def cutoff
    - 임계치 0.5 & 0.8 의 F1_score 비교

6. 최적화 bfgs

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('./csv_assignment/parkinsons.csv')

In [4]:
# 1. Pre

# drop
df.drop('name', axis=1, inplace=True)

# dtype 
df['status'] = df['status'].astype('category')

# 상수항 컬럼
import statsmodels.api as sm
df = sm.add_constant(df,has_constant = "add")

In [6]:
# pre 결과 확인인

# df.info()
df['const'].value_counts()

1.0    195
Name: const, dtype: int64

In [16]:
# 2. scaling 위한 X, Y 분리 
from sklearn.model_selection import train_test_split


# x, y 분리 
feature_cols = list(df.columns.difference(['status']))
x = df[feature_cols]
y = df['status']

# train & test split
x_train, x_test, y_train, y_test = train_test_split(
    x,y, stratify = y ,train_size=0.9, random_state= 1)

# minMax 
from sklearn.preprocessing import MinMaxScaler
Mmscaler = MinMaxScaler()

x_train_mm = Mmscaler.fit_transform(x_train)
x_test_mm = Mmscaler.fit_transform(x_test)

In [15]:
#  3. Logistic Regression

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

    # 학습
lr.fit(x_train_mm, x_test_mm)
lr.


Unnamed: 0,D2,DFA,HNR,Jitter:DDP,MDVP:APQ,MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Fo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),...,MDVP:Shimmer(dB),NHR,PPE,RPDE,Shimmer:APQ3,Shimmer:APQ5,Shimmer:DDA,const,spread1,spread2
133,2.034827,0.723096,25.619,0.00504,0.00903,123.723,109.836,118.747,0.00331,3e-05,...,0.099,0.00504,0.141422,0.482296,0.0049,0.00621,0.01471,1.0,-6.448134,0.178713
95,2.344348,0.705658,21.693,0.00602,0.02571,163.267,149.605,157.447,0.00369,2e-05,...,0.283,0.01018,0.164916,0.447285,0.01813,0.01909,0.05439,1.0,-6.247076,0.180528
15,2.205546,0.658245,25.175,0.00471,0.01359,217.455,83.159,142.167,0.00369,3e-05,...,0.126,0.00839,0.234589,0.565924,0.00772,0.00888,0.02316,1.0,-5.340115,0.210185
59,1.831691,0.817396,21.66,0.0112,0.02519,271.314,104.68,114.847,0.00867,8e-05,...,0.35,0.01143,0.316395,0.547975,0.01805,0.01974,0.05414,1.0,-4.609161,0.221711
3,2.405554,0.819235,20.644,0.01505,0.03772,137.871,111.366,116.676,0.00997,9e-05,...,0.517,0.01353,0.368975,0.434969,0.02924,0.04005,0.08771,1.0,-4.117501,0.334147
48,2.079922,0.733659,23.162,0.00507,0.01433,128.611,115.765,122.188,0.00524,4e-05,...,0.143,0.00839,0.133867,0.579597,0.00855,0.00776,0.02566,1.0,-6.439398,0.266392
139,2.445646,0.696049,18.801,0.00542,0.0277,131.731,109.815,116.15,0.00381,3e-05,...,0.267,0.01827,0.184985,0.624811,0.01514,0.01812,0.04543,1.0,-5.866357,0.23307
44,2.330716,0.635285,25.368,0.00327,0.01033,250.912,232.435,243.439,0.0021,9e-06,...,0.126,0.00454,0.09147,0.438296,0.00777,0.00898,0.0233,1.0,-7.057869,0.091608
81,2.232576,0.778747,19.659,0.01283,0.0219,107.715,87.549,95.605,0.00702,7e-05,...,0.296,0.01825,0.260633,0.576084,0.01792,0.01841,0.05377,1.0,-5.132032,0.210458
24,2.692176,0.732479,23.831,0.0114,0.01497,200.841,76.779,163.656,0.00742,5e-05,...,0.164,0.01778,0.215961,0.397937,0.00738,0.00948,0.02214,1.0,-5.557447,0.22089
