## Import Dataset


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Read the cleaned diabetes data into a DataFrame and preview its first rows.
diabetes_df = pd.read_csv(filepath_or_buffer="datasets/diabetes_clean.csv")
print(diabetes_df.head(n=5))

## Train/Test Split


In [None]:
# Features are every column except the target; the target is the "diabetes" column.
X = diabetes_df.drop(columns="diabetes").values
y = diabetes_df["diabetes"].values

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the metrics computed further down are
# reproducible from run to run; stratify=y keeps the class proportions of
# the target the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Fit the model


In [None]:
# Build a logistic regression classifier with default settings and train it
# on the training split; `fit` returns the fitted estimator itself.
logreg = LogisticRegression().fit(X_train, y_train)

## Predict probabilities for the test features


In [None]:
# predict_proba returns one column of probabilities per class; keep only
# column index 1 for each test sample.
class_probabilities = logreg.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
print(y_pred_prob)

## Make Predictions

You can see the predictions made by this model by calling the `predict` function.


In [None]:
# Predicted class labels for the test features.
y_pred = logreg.predict(X_test)
print(y_pred)

# Put each sample's predicted probability next to its predicted label
# (two columns: probability, label) so they can be compared row by row.
combined = np.column_stack([y_pred_prob, y_pred])
print(combined)

## Calculate accuracy

You can calculate the accuracy of this model by calling the `score` method.


In [None]:
# The score is computed on the held-out test split (X_test, y_test), so the
# printed label must say "test set", not "training set".
print("Accuracy on test set:", logreg.score(X_test, y_test))

## ROC (receiver operating characteristic)

In [None]:
# roc_curve turns true labels and predicted scores into ROC points
from sklearn.metrics import roc_curve

# False positive rates, true positive rates and the thresholds they occur at
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot((0, 1), (0, 1), 'k--')

# The model's ROC curve: true positive rate as a function of false positive rate
plt.plot(fpr, tpr)
plt.title('ROC Curve for Diabetes Prediction')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

## AUC (area under the curve)

In [None]:
# Metrics used below: ROC AUC, confusion matrix and classification report
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# Area under the ROC curve, computed from the predicted probabilities
auc_value = roc_auc_score(y_test, y_pred_prob)
print(auc_value)

# Confusion matrix of the true labels against the predicted labels
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# Text classification report for the same true and predicted labels
report = classification_report(y_test, y_pred)
print(report)