# PROJECT: PREDICTING DIABETES

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from warnings import filterwarnings
filterwarnings('ignore')

# First Look at Data

In [None]:
# Read the raw CSV once and keep it untouched in `diabetes`; all further
# work happens on `df`, a copy with rows containing missing values removed.
diabetes = pd.read_csv("diabetes.csv")
df = diabetes.copy().dropna()
df.head()

In [None]:
# Column names, dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Number of rows per value of the "Outcome" target column.
df["Outcome"].value_counts()

In [None]:
# Same counts as a horizontal bar chart; the trailing ";" suppresses the
# textual repr of the returned axes object in the notebook output.
df["Outcome"].value_counts().plot.barh();

In [None]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().T

In [None]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.30, 
                                                    random_state = 42)


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression (liblinear solver) on the training split.
# `fit` is called on `loj` and its return value is bound to `loj_model`.
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)
loj_model

In [None]:
# Fitted intercept term.
loj_model.intercept_

In [None]:
# Fitted coefficients of the predictors.
loj_model.coef_

In [None]:
y_pred = loj_model.predict(X)

In [None]:
confusion_matrix(y, y_pred)

In [None]:
accuracy_score(y, y_pred)

In [None]:
#roc curve

In [None]:
# Positive-class probabilities drive both the ROC curve and the AUC.
# Computing the AUC from hard 0/1 predictions would collapse the curve to a
# single operating point and report an area that does not match the plot.
y_score = loj_model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)

fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line for comparison with the curve.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC')
# Without a legend the `label=` above (which carries the AUC) is never shown.
plt.legend(loc="lower right")
plt.show()

In [None]:
# Accuracy of the fitted logistic regression on the held-out test split.
accuracy_score(y_test, loj_model.predict(X_test))

In [None]:
# Mean score over 10 cross-validation folds.
# NOTE(review): the folds are drawn from X_test / y_test rather than the
# training split — confirm this is intended.
cross_val_score(loj_model, X_test, y_test, cv = 10).mean()

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with default hyperparameters, fitted on the
# training split.
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
# Display the fitted estimator.
rf_model

In [None]:
# Test-set accuracy of the baseline forest.
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Display the estimator again (same object as above).
rf_model

In [None]:
# Hyperparameter grid searched for the random forest:
# 4 * 3 * 3 * 3 = 108 candidate combinations.
rf_params = {
    "max_depth": [2, 5, 8, 10],
    "max_features": [2, 5, 8],
    "n_estimators": [10, 500, 1000],
    "min_samples_split": [2, 5, 10],
}

In [None]:
rf_model = RandomForestClassifier()

rf_cv_model = GridSearchCV(rf_model, 
                           rf_params, 
                           cv = 10, 
                           n_jobs = -1, 
                           verbose = 2) 

In [None]:
rf_cv_model.fit(X_train, y_train)

In [None]:
rf_tuned = RandomForestClassifier(max_depth = 5, 
                                  max_features = 5, 
                                  min_samples_split = 10,
                                  n_estimators = 10)

rf_tuned.fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
Importance = pd.DataFrame({"Importance": rf_tuned.feature_importances_*100},
                         index = X_train.columns)

In [None]:
Importance.sort_values(by = "Importance", 
                       axis = 0, 
                       ascending = True).plot(kind ="barh", color = "r")

plt.xlabel("Değişken Önem Düzeyleri")

# Gradient Boosting Machines

In [None]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Rebuild the working frame and the train/test split from the raw data,
# using the same test size and seed as the earlier split.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Baseline gradient boosting classifier with default hyperparameters,
# fitted on the training split.
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
# Test-set accuracy of the baseline model.
y_pred = gbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Display the fitted estimator.
gbm_model

In [None]:
# Hyperparameter grid searched for gradient boosting.
# The n_estimators list previously read [100, 500, 100]: the duplicate 100
# made the search refit identical candidates and left no larger ensemble
# size in the grid, so the last entry is corrected to 1000.
gbm_params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.05],
    "n_estimators": [100, 500, 1000],
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 5, 10],
}

In [None]:
gbm = GradientBoostingClassifier()

gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
gbm_cv.fit(X_train, y_train)

In [None]:
gbm = GradientBoostingClassifier(learning_rate = 0.01, 
                                 max_depth = 3,
                                min_samples_split = 5,
                                n_estimators = 500)

In [None]:
gbm_tuned =  gbm.fit(X_train,y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# XGBoost

In [None]:
#!pip install xgboost
from xgboost import XGBClassifier

In [None]:
# Baseline XGBoost classifier with default hyperparameters, fitted on the
# training split.
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
# Display the fitted estimator.
xgb_model

In [None]:
# Test-set accuracy of the baseline model.
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Display the estimator again (same object as above).
xgb_model

In [None]:
# Hyperparameter grid searched for XGBoost.
# "min_samples_split" was removed: it is a scikit-learn tree parameter that
# XGBoost does not recognise, so its three values only tripled the number
# of fits without changing any model.
xgb_params = {
    'n_estimators': [100, 500, 1000, 2000],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.02, 0.05],
}

In [None]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
xgb_cv_model.fit(X_train, y_train)

In [None]:
xgb_cv_model.best_params_

In [None]:
xgb = XGBClassifier(learning_rate = 0.01, 
                    max_depth = 6,
                    min_samples_split = 2,
                    n_estimators = 100,
                    subsample = 0.8)

In [None]:
xgb_tuned =  xgb.fit(X_train,y_train)

In [None]:
y_pred = xgb_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# LightGBM

In [None]:
#!conda install -c conda-forge lightgbm
from lightgbm import LGBMClassifier

In [None]:
# Baseline LightGBM classifier with default hyperparameters, fitted on the
# training split.
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [None]:
# Test-set accuracy of the baseline model.
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Display the fitted estimator.
lgbm_model

In [None]:
# Hyperparameter grid searched for LightGBM:
# 4 * 3 * 4 * 4 * 3 = 576 candidate combinations.
# NOTE(review): LightGBM's documentation ties row subsampling to a non-zero
# `subsample_freq`, which is not set here — confirm that varying
# 'subsample' actually changes the fitted models.
lgbm_params = {
    'n_estimators': [100, 500, 1000, 2000],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.02, 0.05],
    "min_child_samples": [5, 10, 20],
}

In [None]:
lgbm = LGBMClassifier()

lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose = 2)



In [None]:
lgbm_cv_model.fit(X_train, y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
lgbm = LGBMClassifier(learning_rate = 0.01, 
                       max_depth = 3,
                       subsample = 0.6,
                       n_estimators = 500,
                       min_child_samples = 20)

In [None]:
lgbm_tuned = lgbm.fit(X_train,y_train)

In [None]:
y_pred = lgbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# Comparison of All Models

In [None]:
# Fitted estimators to compare, in the order they are reported.
models = [loj_model, rf_tuned, gbm_tuned, lgbm_tuned, xgb_tuned]

# Print each model's class name and its accuracy on the test split.
for model in models:
    names = type(model).__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(
        "-"*28,
        names + ":",
        "Accuracy: {:.4%}".format(accuracy),
        sep="\n",
    )

In [None]:
# Collect one [model name, accuracy in percent] row per fitted model and
# build the frame in a single step. `DataFrame.append` no longer exists in
# pandas 2.x, and appending row-by-row also gave every row the index 0.
rows = []

for model in models:
    names = model.__class__.__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    rows.append([names, accuracy * 100])

results = pd.DataFrame(rows, columns=["Models", "Accuracy"])

# Horizontal bars: one per model, bar length = test accuracy in percent.
sns.barplot(x='Accuracy', y='Models', data=results, color="r")
plt.xlabel('Accuracy %')
plt.title('Accuracy Rates of Models');