In [1]:
# Module Name: pima_indian_diabetes.py

import pandas as pd
import numpy as np

#EDA-chart / 수치화,결측처리-> corr()
import matplotlib.pyplot as plt
import seaborn as sns

#전처리 : outlier, scale(정규/표준), encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Binarizer #pd.get_dummpy()

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import Xg.. li...
# from sklearn.metrics import mean_squared_error ... mse rmse mas rmsle
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def myscore(y_test, pred, proba=None):
    """Print binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels to score against ``y_test``.
    proba : array-like, optional
        Class probabilities (e.g. the output of ``predict_proba``); column 1
        is used as the score for AUC. When omitted, AUC is printed as ``nan``
        instead of failing on ``None``.

    Returns
    -------
    None
        The metrics and the confusion matrix are printed, not returned.
    """
    accuracy = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    confusion = confusion_matrix(y_test, pred)
    # AUC is computed from scores rather than hard labels, so it is only
    # available when probabilities were supplied.
    auc = float("nan") if proba is None else roc_auc_score(y_test, proba[:, 1])
    print("정확도:{:.4f},재현율:{:.4f},정밀도:{:.4f},f1:{:.4f},auc:{:.4f}".format(accuracy,recall,precision,f1,auc))
    print("confusion:", confusion)

In [1]:
df = pd.read_csv("./dataset/diabetes.csv")
# The last column is used as the label, every column before it as a feature.
# .copy() detaches X and y from df, so the later column assignment on X works
# on its own data instead of on a slice of df.
y = df.iloc[:, -1].copy()
X = df.iloc[:, :-1].copy()
print(X.shape, y.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df.info()

NameError: name 'pd' is not defined

In [None]:
#============= Missing-value handling / numeric columns --> EDA
# One histogram per column of df, drawn on a 6x6 inch figure.
df.hist(figsize=(6,6))
plt.show()

In [None]:
# ---- EDA: correlation analysis
# Pairwise correlations of every column in df, annotated to two decimals.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='0.2f', cmap='Blues')
plt.show()
print(df.columns)

* 타겟과 상관도가 높은 피쳐<br>
'Outcome' : 'Glucose','BMI','Age'<br>
* 다중공선성 의심, 주의 깊게 살펴야 할 피쳐<br>
'Age' : 'Pregnancies'<br>
'Insulin' : 'SkinThickness', 'Glucose'   


### 데이터 전처리 전 우선 점수부터 확인
* 수치형으로만 이루어진 데이터
* 결측이 없는 데이터

In [None]:
# Baseline: score the raw features first so later preprocessing steps have a
# reference to compare against.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=160)
# Seed the forest as well as the split; an unseeded forest gives a different
# score on every run, which makes the before/after comparison meaningless.
rf = RandomForestClassifier(random_state=11)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
proba = rf.predict_proba(X_test)
print("전처리 전 우선 점수부터 확인------\n")
myscore(y_test, pred, proba)

In [None]:
# ---- Preprocessing (part of outlier handling): find the zero values
# For each feature print: name, number of zeros, zeros as a percentage of rows.
features = X.columns
for col in features:
    n_zero = (X[col] == 0).sum()
    print(col, n_zero, n_zero / df.shape[0] * 100)


In [None]:
# Summary statistics of df; as the cell's last expression it is shown as output.
df.describe()

In [None]:
# Columns whose zeros this notebook treats as missing values. They are named
# explicitly instead of sliced by position, so a change in column order cannot
# silently alter the selection.
zero_column = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print(zero_column)

# Average only the non-zero entries: the zeros are exactly the values being
# replaced, so including them would pull the replacement value toward 0.
zero_column_mean = X[zero_column].replace(0, np.nan).mean().round(0)
# A Series as the replacement value is applied column by column.
X[zero_column] = X[zero_column].replace(0, zero_column_mean)

In [None]:
# ---- Zeros replaced by the mean: verify the change
# Same report as before the replacement: name, zero count, zero percentage.
for col in X.columns:
    n_zero = (X[col] == 0).sum()
    print(col, n_zero, n_zero / X.shape[0] * 100)

In [None]:
# Re-score with the same split seed after the zero replacement.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=160)
# Seeded so that a score change reflects the preprocessing, not the forest's
# own randomness.
rf = RandomForestClassifier(random_state=11)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
proba = rf.predict_proba(X_test)
print("평균값 채운 후 점수 확인------\n")
myscore(y_test, pred, proba)

In [None]:
# ---- Standardization: rescale every feature to mean 0 and variance 1
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split - confirm this is acceptable for the evaluation.
scaler = StandardScaler()
X_scaler = scaler.fit(X).transform(X)
X_scaler_df = pd.DataFrame(data=X_scaler, columns=X.columns)
X_scaler_df.hist()
plt.show()

In [None]:
# Re-score on the standardized features with the same split seed.
X_train, X_test, y_train, y_test = train_test_split(X_scaler_df, y, test_size=0.2, random_state=160)
dt = DecisionTreeClassifier()  # created but not fitted in this cell
# Seeded so the score is reproducible and comparable with the earlier runs.
rf = RandomForestClassifier(random_state=11)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
proba = rf.predict_proba(X_test)
print(pred[:10])   # first ten predicted labels (diabetic or not)
print(proba[:10])  # matching class probabilities

print("정규화 후 점수 확인------\n")
myscore(y_test, pred, proba)

#### 정밀도_재현율 커브 곡선 ( 임계치 확인용 )

In [None]:
# Precision and recall for every candidate threshold on the last proba column.
pscore, rscore, th = precision_recall_curve(y_test, proba[:, -1])
# The score arrays hold one more entry than th (the author observed 58 58 57).
print(len(pscore), len(rscore), len(th))

# Trim both score arrays to len(th) so they can share the threshold axis.
n_th = len(th)
fig, ax = plt.subplots()
ax.plot(th, pscore[:n_th], label="precision")
ax.plot(th, rscore[:n_th], label="recall")
ax.set_xlabel("thredholds")
ax.set_ylabel("precision recall value")
ax.legend()
ax.grid()
plt.show()

In [None]:
thresholds = [0.3,0.4,0.5,0.55, 0.6]
# Last proba column as a column vector, built once and reused per threshold.
score_column = proba[:, -1].reshape(-1, 1)
for th in thresholds:
    # Re-label the probabilities with this cut-off instead of the model default.
    binarizer = Binarizer(threshold=th)
    pred = binarizer.fit_transform(score_column)

    print("Negative:Positive", th, 1 - th)
    # proba is passed through unchanged on every iteration.
    myscore(y_test, pred, proba)
    

#### roc_curve() 곡선 ( 임계치 확인용 )

In [None]:

# One (fpr, tpr) point per threshold returned by roc_curve.
fpr, tpr, th = roc_curve(y_test, proba[:, -1])
plt.plot(fpr, tpr, label="roc")
# The diagonal is the chance-level reference (AUC 0.5), not the 0.5 decision
# threshold, so it is labelled accordingly.
plt.plot([0, 1], [0, 1], label="random (AUC=0.5)")
# FPR equals 1 - specificity; "1 - sensitivity" would be the false negative rate.
plt.xlabel("FPR(1-specificity)")
plt.ylabel("TPR(recall)")
plt.legend()
plt.grid()
plt.show()


In [None]:
# Area under the ROC curve, scored on the last proba column.
positive_score = proba[:, -1]
auc = roc_auc_score(y_test, positive_score)
print("AUC Score:", auc)

### 단일 모델 
* 평가  / 검증 / 튜닝


* 5.2. KNN<br>
n_neighbors: Number of neighbors to use by default for k_neighbors queries
model_KNN = KNeighborsClassifier()
neighbors = [1,2,3,4]
param_grid = dict(n_neighbors=neighbors)
<br>
* 5.3. SVC<br>
C: The Penalty parameter C of the error term.
Kernel: Kernel type could be linear, poly, rbf or sigmoid.
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = [ 'linear' , 'poly' , 'rbf' , 'sigmoid' ]
param_grid = dict(C=c_values, kernel=kernel_values)
model_SVC = SVC()
<br>
* 5.4. Decision Tree<br>
param_grid = dict(max_depth = [4], min_samples_leaf = [6],
                  criterion = ["gini", "entropy"])
model_CART = DecisionTreeClassifier()
 <br>
* 5.5 AdaBoostClassifier <br>
learning_rate: Learning rate shrinks the contribution of each classifier by learning_rate.
n_estimators: Number of trees to build. <br>

learning_rate_value = [.01,.05,.1,.5, 1]
n_estimators_value = [50,100,150,200,250,300]

model_Ad = AdaBoostClassifier()

* 5.1.Logistic Regression<br>
C(규제 강도의 역수) 값이 작을수록 규제강도가 커진다(오버피팅 피하기) <br>
L2 - 릿지 - 왜곡,편중,이상치 데이터가 많은 경우 <br>
L1 - 라쏘 - 불필요한 피쳐수를 줄이는 경우 <br>

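C : Inverse of regularization strength (positive float); the smaller the value, the stronger the regularization.<br>
penalty: Regularization type, e.g. "l2" or "l1" ("l1" needs a solver that supports it, such as `liblinear` or `saga`). Default is "l2".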

In [None]:
def fit_predict(model, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``model``, predict on the evaluation data and print its metrics.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.
    X_fit, y_fit : optional
        Training features and labels. Each falls back to the notebook-level
        ``X_train`` / ``y_train`` when left as ``None``.
    X_eval, y_eval : optional
        Evaluation features and labels. Each falls back to the notebook-level
        ``X_test`` / ``y_test`` when left as ``None``.

    Returns
    -------
    None
        The metrics are printed by ``myscore``.
    """
    # The globals are looked up at call time, so calling with just ``model``
    # keeps using whatever split the notebook produced most recently.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    model.fit(X_fit, y_fit)
    pred = model.predict(X_eval)
    proba = model.predict_proba(X_eval)
    myscore(y_eval, pred, proba)

from sklearn.linear_model import LogisticRegression

# Only 'l2' is active; the author's note listed ['l1', 'l2'] as the candidates.
penalty = ['l2']
C = [0.5, 1.0, 3.0]

# Every (penalty, C) combination, penalty varying slowest.
lr_grid = [(pen, strength) for pen in penalty for strength in C]
for p, c in lr_grid:
    print("규제, 강도", p, c)
    model = LogisticRegression(random_state=11, penalty=p, C=c)
    fit_predict(model)

* 5.6 GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# An outer cv=5 loop was sketched here but left disabled; the author's note
# says that, unlike GridSearchCV, it would train on the same data five times.
learning_rate_value = [1.0]
n_estimators_value = [100, 300, 500, 600]

# Every (learning rate, tree count) combination, learning rate varying slowest.
gb_grid = [(rate, trees) for rate in learning_rate_value for trees in n_estimators_value]
for lr, est in gb_grid:
    print("lr, 트리갯수", lr, est)
    model = GradientBoostingClassifier(random_state=11, learning_rate=lr, n_estimators=est)
    fit_predict(model)


* 5.7 XGBoost, LightGBM, Neural Network 

In [None]:
from xgboost import XGBClassifier

n_estimators_value = [100, 300, 500]
max_depth = [3, 5, 7]
# Author's overfitting notes: min_child_weight (smaller) / gamma (node split:
# larger) / eta (learning rate: smaller).
# NOTE(review): confirm the direction given for min_child_weight.
# early_stopping_rounds=10 was tried and is left disabled.
xgb_grid = [(trees, levels) for trees in n_estimators_value for levels in max_depth]
for est, depth in xgb_grid:
    print("트리갯수, 트리깊이", est, depth)
    xgb_params = dict(
        objective="binary:logistic",
        eval_metric="auc",
        n_estimators=est,
        max_depth=depth,
    )
    model = XGBClassifier(**xgb_params)
    fit_predict(model)


### Ensemble : Voting 

In [None]:
from sklearn.ensemble import VotingClassifier

# LogisticRegression and RandomForestClassifier were considered as further
# members but are not part of the ensemble.

# Member 1: gradient boosting.
model_1 = GradientBoostingClassifier(random_state=11, learning_rate=1.0, n_estimators=600)
# Member 2: XGBoost (early_stopping_rounds=10 left disabled).
model_2 = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    n_estimators=100,
    max_depth=3,
)

# voting='soft' is selected, so predict_proba is available for the AUC below.
model_list = [('GB', model_1), ('XGB', model_2)]
ensemble_model = VotingClassifier(estimators=model_list, voting='soft')

ensemble_model.fit(X_train, y_train)
proba = ensemble_model.predict_proba(X_test)
pred = ensemble_model.predict(X_test)
myscore(y_test, pred, proba)


# score5 = cross_val_score(ensemble_model, X, y, cv=5)  #, scoring='f1')
# print('평균정확도: ',score5.mean())


### 하이퍼파라미터 튜닝

In [None]:
# Search space for the grid search. The separating comma now sits before the
# trailing comment; previously the "#" swallowed it, which left two dict
# entries without a separator (a SyntaxError).
hyper_param = {
    "n_estimators": [5, 10, 50, 100],  # larger candidates left out: 200, 300, 400
    "max_depth": [3, 4, 5, 6, 7, 8],
}
# eval_metric, n_estimators and max_depth are deliberately not fixed here;
# the last two are supplied by the grid.
model = XGBClassifier(objective="binary:logistic")
# 5-fold search scored by ROC AUC; refit=True retrains the best combination
# on all of X_train when the search ends.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=hyper_param,
    scoring='roc_auc',
    refit=True,
    cv=5,
    verbose=0,
)
grid_model.fit(X_train, y_train)
print(grid_model.best_params_)
print(grid_model.best_score_)



### Stacking

In [None]:
##