### Columns Info
- Pregnancies: 임신 횟수
- Glucose: 포도당 부하 검사 수치
- BloodPressure: 혈압(mm Hg)
- SkinThickness: 팔 삼두근 뒤쪽의 피하지방 측정값(mm)
- Insulin: 혈청 인슐린(mu U/ml)
- BMI: 체질량지수(체중(kg)/(키(m)^2))
- DiabetesPedigreeFunction: 당뇨 내력 가중치 값
- Age: 나이
- Outcome: 클래스 결정 값(0 or 1)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Location of the diabetes CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
diabetes_data = pd.read_csv(DATA_PATH)

# Print how many rows fall into each 'Outcome' class, then preview the
# first five rows as the cell's displayed result.
outcome_counts = diabetes_data['Outcome'].value_counts()
print(outcome_counts)
diabetes_data.head(5)

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes).
diabetes_data.info()

In [None]:
# Helper that prints the evaluation metrics for a binary classifier
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and common binary-classification metrics.

    Args:
        y_test: True class labels.
        pred: Predicted class labels. Required; the ``None`` default is kept
            only so the signature stays backward-compatible.
        pred_proba: Scores for the positive class, forwarded to
            ``roc_auc_score``. When ``None`` the AUC line is not printed.

    Raises:
        ValueError: If ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError('pred (predicted class labels) must be provided')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('Confusion matrix:\n', confusion)
    print('\nAccuracy: {:.4f}'.format(accuracy))
    print('Precision: {:.4f}'.format(precision))
    print('Recall: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(f1))

    # AUC needs scores rather than hard labels, so it is only reported when
    # the caller actually passed them.
    if pred_proba is not None:
        auc = roc_auc_score(y_test, pred_proba)
        print('AUC: {:.4f}'.format(auc))

In [None]:
# Precision-Recall Curve Plot
def precision_recall_curve_plot(y_test=None, pred_proba=None):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: True class labels.
        pred_proba: Scores for the positive class, forwarded to
            ``precision_recall_curve``.
    """
    # Extract the threshold array and the precision/recall arrays that
    # correspond to those thresholds.
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)

    # x axis = threshold, y axis = precision / recall.
    plt.figure(figsize=(8, 6))
    # Trim precision/recall to the number of thresholds so that the x and y
    # arrays passed to plot() have matching lengths.
    n_thresholds = thresholds.shape[0]
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], linestyle=':', label='recall')

    # Place x ticks every 0.1 across the automatically chosen x range.
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))

    plt.xlabel('thresholds')
    plt.ylabel('precision and recall value')
    plt.legend()
    plt.grid()
    plt.show()

### Training and Prediction with Logistic Regression

In [None]:
# Features are every column except the last; the last column is the label.
x = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

# Hold out 20% of the rows for testing, stratified on the label.
split_options = dict(test_size=0.2, random_state=156, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Train the logistic regression model (fit() returns the estimator itself).
lr_clf = LogisticRegression(solver='liblinear').fit(x_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC computation.
pred_proba = lr_clf.predict_proba(x_test)[:, 1]
pred = lr_clf.predict(x_test)

get_clf_eval(y_test, pred, pred_proba)

### Precision-Recall Curve Plot

In [None]:
# Take column 1 of the predicted probabilities for the test set.
pred_proba = lr_clf.predict_proba(x_test)[:, 1]
# Plot precision and recall against the decision threshold.
precision_recall_curve_plot(y_test, pred_proba)

### Quartile Distribution of Each Feature

In [None]:
# Show pandas' summary statistics (including quartiles) for each column.
diabetes_data.describe()

In [None]:
# Histogram of the 'Glucose' column using 100 bins.
plt.hist(diabetes_data['Glucose'], bins=100)
plt.show()

### Check and replace features that contain 0 values

In [None]:
# Columns that are inspected for 0 entries.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Number of non-null 'Glucose' entries, used as the percentage denominator.
total_count = diabetes_data['Glucose'].count()

# Report, per column, how many entries equal 0 and what share of the data that is.
for feature in zero_features:
    is_zero = diabetes_data[feature] == 0
    zero_count = is_zero.sum()
    print(f'{feature} 건수는 {zero_count}, 퍼센트는 {100*zero_count/total_count:.2f}')

In [None]:
# Replace 0 values with the mean of the *other* (non-zero) values in each column.
# The zeros are first turned into NaN so that they do not take part in the
# mean; otherwise the fill value would be pulled toward 0 by the very entries
# that are being replaced.
zero_as_missing = diabetes_data[zero_features].replace(0, np.nan)
diabetes_data[zero_features] = zero_as_missing.fillna(zero_as_missing.mean())

In [None]:
# Re-count the 0 entries per column to confirm the result of the replacement.
for feature in zero_features:
    is_zero = diabetes_data[feature] == 0
    zero_count = is_zero.sum()
    print(f'{feature} 건수는 {zero_count}, 퍼센트는 {100*zero_count/total_count:.2f}')

### Scaling with StandardScaler and training on the scaled data

In [None]:
# Features are every column except the last; the last column is the label.
x = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

# Standardize the features with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset before the split, so
# test-set statistics influence the scaling; consider fitting on the
# training split only.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Split the *scaled* features. Passing the unscaled `x` here would silently
# discard the scaling step above.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=156, stratify=y)

# Train logistic regression on the scaled training data and evaluate it.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(x_train, y_train)
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]

get_clf_eval(y_test, pred, pred_proba)

### Evaluation metrics while changing the classification threshold

In [None]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test, pred_proba, threshold):
    """Print the evaluation metrics for each candidate decision threshold.

    Args:
        y_test: True class labels.
        pred_proba: Positive-class scores in the 2-D layout that ``Binarizer``
            accepts (the visible caller passes ``reshape(-1, 1)``).
        threshold: Iterable of thresholds to evaluate; a single scalar is
            also accepted and treated as a one-element list.
    """
    # Use the argument that was passed in rather than a module-level global,
    # so the function works regardless of what the caller named its list.
    try:
        candidate_thresholds = iter(threshold)
    except TypeError:
        candidate_thresholds = [threshold]

    # Binarize the scores at each threshold and evaluate the resulting labels.
    for custom_threshold in candidate_thresholds:
        binarizer = Binarizer(threshold=custom_threshold)
        custom_predict = binarizer.fit_transform(pred_proba)
        print(f'Threshold: {custom_threshold}')
        get_clf_eval(y_test, custom_predict, pred_proba)

In [None]:
# Candidate decision thresholds to compare.
thresholds = [
    0.3, 0.33, 0.36, 0.39,
    0.42, 0.45, 0.48, 0.50,
]

# Full probability matrix for the test set.
pred_proba = lr_clf.predict_proba(x_test)

# Take the last probability column and shape it as a single-column 2-D array.
positive_proba = pred_proba[:, -1].reshape(-1, 1)
get_eval_by_threshold(y_test, positive_proba, thresholds)

In [None]:
# Column 1 of the predicted probabilities, as a flat array (for AUC) and as
# a single-column 2-D array (for the Binarizer).
proba_flat = pred_proba[:, 1]
proba_column = proba_flat.reshape(-1, 1)

# Binarize the probabilities at threshold 0.48.
binarizer1 = Binarizer(threshold=0.48)
pred_th_048 = binarizer1.fit_transform(proba_column)

# Binarize the probabilities at threshold 0.39.
binarizer2 = Binarizer(threshold=0.39)
pred_th_039 = binarizer2.fit_transform(proba_column)

# Print the metrics for each thresholded prediction, 0.48 first.
for thresholded_pred in (pred_th_048, pred_th_039):
    get_clf_eval(y_test, thresholded_pred, proba_flat)