In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("diabetes.csv")


In [11]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
X = df.drop(['diabetes'], axis=1)
y = df['diabetes']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### LogisticRegression for classification problems

In [4]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000) # WHY?
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [6]:
# here all 0s are the negative class and all the 1s are the positive class
# this makes sense in case of binary classification
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

#### Predict proba to get the exact predicted values

In [9]:
# using predict proba to get the exact predicted values for some of the examples
y_pred_proba = lr.predict_proba(X_test)
y_pred_proba[:5]

array([[0.7241515 , 0.2758485 ],
       [0.81156335, 0.18843665],
       [0.8855143 , 0.1144857 ],
       [0.83643907, 0.16356093],
       [0.52847335, 0.47152665]])

#### Using GridSearch to find best parameters

In [12]:
# same as linear regression, using grid search to find out best parameters for the model
from sklearn.model_selection import GridSearchCV
c_space = np.logspace(-5, 8, 15) # first param, WHY THE NUMBERS -5 and 8
param_grid = {"C": c_space, "penalty": ["l1", "l2"]}
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid, cv=5) # cv = no of folds
logreg_cv.fit(X_train, y_train)


In [14]:
logreg_cv.best_params_

{'C': 31.622776601683793, 'penalty': 'l2'}

In [16]:
logreg_cv.best_score_

0.7703985072637611

In [18]:
# this gives us the best model instance with maximum score and predetermined params
logreg_best = logreg_cv.best_estimator_

In [19]:
y_pred_best = logreg_best.predict(X_test)

In [20]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

### Confusion Matrix for classification problems

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [23]:
confusion_matrix(y_test, y_pred_best)

array([[80, 19],
       [18, 37]])

In [27]:
# the higher the precision the lower the recall and vice versa
print(classification_report(y_test, y_pred_best))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81        99
           1       0.66      0.67      0.67        55

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154



An alternative to confusion matrix and classification report is **AUC Curve**