In [None]:
'''
Adam Forestier
Last Updated: May 7, 2023
'''

# Imports
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns 

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier) 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, recall_score, confusion_matrix, classification_report, ConfusionMatrixDisplay,RocCurveDisplay, roc_curve, auc)
from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import os
# Print the full path of every file found under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

### Initial Data Investigation and Tuning

In [None]:
# Load the diabetes prediction dataset and preview the first rows
DATA_PATH = '/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Info: summary of the frame (columns, dtypes, non-null counts) used to check for missing values
df.info()

In [None]:
# The info() output shows no null values; next, remove any fully duplicated rows,
# keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# The dataset only records age in years (no month or day data), so store it as an integer.
# astype(int) is the vectorized pandas equivalent of calling int() on every value
# (fractional parts, if any, are truncated toward zero exactly as int() would).
# assign() builds a new frame rather than writing a column into the result of
# drop_duplicates(), which avoids pandas' chained-assignment (SettingWithCopy) warning.
df = df.assign(age=df['age'].astype(int))
df.info()

In [None]:
# General summary statistics, one row per numeric column
df.describe().T

In [None]:
# gender and smoking_history are string (object) columns.
# One-hot encode them into indicator columns so the models can consume them,
# dropping the first level of each category (drop_first=True).
# The original frame (df) is kept untouched for more readable visualizations.
final_df = pd.get_dummies(data=df, drop_first=True)
final_df.head()

In [None]:
# Investigate the balance of the data: number of rows for each value of the diabetes label
df['diabetes'].value_counts()

**The data is highly imbalanced. We will likely need to handle this imbalance in order to have high-performing models.**

In [None]:
# Correlation of every feature with the label we are trying to predict: diabetes
corr_with_label = final_df.corr()['diabetes']
# Sort ascending and drop the last entry (the label's correlation with itself)
diabetes_correlation = corr_with_label.sort_values().iloc[:-1]
diabetes_correlation

**It appears blood_glucose_level and HbA1c_level are the two highest correlated features to diabetes**

### Data Exploration through Visualization

In [None]:
# View the relationship between the two most strongly correlated features and the label
sns.scatterplot(x='HbA1c_level', y='blood_glucose_level', data=final_df, hue='diabetes')
plt.xlabel('HbA1c Level')
plt.ylabel('Blood Glucose Level')
plt.title('Diabetic vs Non-Diabetic by HbA1c and Blood Glucose')
# Render the figure explicitly, like every other plotting cell, so the cell output
# is the plot rather than the repr of the Text object returned by plt.title
plt.show()

**We can see a very clear separation between diabetics and non-diabetics based on HbA1c level and blood glucose level.**

**This indicates to me that a simple KNN model may be a good algorithm for this classification task. Let's explore this relationship further.**

In [None]:
# Boxplot of blood glucose level split by the diabetes label
ax = sns.boxplot(data=final_df, x='diabetes', y='blood_glucose_level')
ax.set_xlabel('Diabetic')
ax.set_ylabel('Blood Glucose Level')
ax.set_title('Blood Glucose for Diabetic and Non-Diabetic')
plt.show()

In [None]:
# Boxplot of HbA1c level split by the diabetes label
ax = sns.boxplot(data=final_df, x='diabetes', y='HbA1c_level')
ax.set_xlabel('Diabetic')
ax.set_ylabel('HbA1c Level')
ax.set_title('HbA1c for Diabetic and Non-Diabetic')
plt.show()

**High blood glucose and HbA1c levels are strong indicators of diabetes.**

**This indicates to me that a simple KNN model may be a good algorithm for this classification task. Let's explore the remaining features further.**

In [None]:
# Distribution of ages for those with and without diabetes (one panel per class)
g = sns.displot(data=final_df, x='age', bins=50, col='diabetes', hue='diabetes')
# displot returns a FacetGrid with several panels; plt.title would only label the
# last panel, so put the title on the figure itself (y > 1 keeps it above the panel titles)
g.figure.suptitle('Distribution of Ages for Diabetics and Non-Diabetics', y=1.03)
plt.show()

In [None]:
# Distribution of BMI for those with and without diabetes (one panel per class)
g = sns.displot(data=final_df, x='bmi', bins=50, col='diabetes', hue='diabetes')
# The title previously said "Ages" (copy-paste slip) and, being set with plt.title,
# only landed on the last panel; use a correct figure-level title instead
g.figure.suptitle('Distribution of BMI for Diabetics and Non-Diabetics', y=1.03)
plt.show()

In [None]:
# Count of diabetics and non-diabetics for each hypertension status
ax = sns.countplot(x='hypertension', hue='diabetes', data=final_df)
ax.set_xlabel('Hypertension')
ax.set_ylabel('Total Count')
ax.set_title('Hypertension and Diabetes')
plt.show()

In [None]:
# View count of diabetics and non-diabetics by heart disease
sns.countplot(data=final_df, x='heart_disease', hue='diabetes')
# The x axis shows heart_disease (diabetes is the hue), so label it accordingly
plt.xlabel('Heart Disease')
plt.ylabel('Total Count')
plt.title('Heart Disease and Diabetes')
plt.show()

In [None]:
# Count of diabetics and non-diabetics by gender
sns.countplot(data=df, x='gender', hue='diabetes')
# The x axis shows gender (diabetes is the hue), so label it accordingly
plt.xlabel('Gender')
plt.ylabel('Total Count')
plt.title('Gender and Diabetes')
plt.show()

In [None]:
# Count of diabetics and non-diabetics by smoking history
sns.countplot(data=df, x='smoking_history', hue='diabetes')
# The x axis shows smoking_history (diabetes is the hue), so label it accordingly
plt.xlabel('Smoking History')
plt.ylabel('Total Count')
plt.title('Smoking History and Diabetes')
plt.show()

**The count of people with heart disease or hypertension but no diabetes exceeds the count of people who have diabetes along with those conditions...**
**HOWEVER - we must remember the unbalanced dataset. The counts are nearly equal AND there are only about 1/10 as many people with diabetes in the dataset.**

### Classification Models

**With all of the above visualized, let's start training some models with the findings we have gathered.**

**We are going to start with high bias and low variance (low complexity) and increase complexity**
**The first model will be a very simple K Nearest Neighbors model utilizing only Blood Glucose and HbA1c.**

In [None]:
# Separate features and label: only the two lab measurements are used for this first model
knn_feature_cols = ['blood_glucose_level', 'HbA1c_level']
X = final_df[knn_feature_cols]
y = final_df['diabetes']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# data only, so nothing about the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Fit a K Nearest Neighbors classifier with scikit-learn's default settings
# (no hyperparameter search is performed here) and predict the held-out rows.
# Recall on the diabetic class is the metric we care most about when reading the results.
knn_clf = KNeighborsClassifier().fit(scaled_X_train, y_train)
y_pred = knn_clf.predict(scaled_X_test)

In [None]:
# Confusion matrix of the KNN predictions on the test set
knn_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(knn_cm, display_labels=knn_clf.classes_)
p.plot()
plt.show()


In [None]:
# Per-class precision / recall / f1 and overall accuracy for the KNN model
knn_cr = classification_report(y_test, y_pred)
print(knn_cr)

**This classifier performs well in most facets. High precision and recall for non-diabetics. Overall accuracy of 97%**

**The classifier is also always 100% correct when assigning the diabetic label to a patient. However, it is missing 1/3 of the positive cases!**

**There are too many false negatives. Considering that the task of this model is to identify when individuals have diabetes, we need higher recall for class 1, even if it comes at the expense of other scores.**

**Let us see if employing SMOTE, will help us improve our Recall...**

In [None]:
# Perform the train test split FIRST, then balance only the training portion.
# Resampling before the split would put synthetic minority samples (interpolated
# from rows that end up in the training set) into the test set and would evaluate
# on an artificially balanced class mix, inflating the reported scores.
# The split uses the same test_size / random_state as the un-resampled KNN run, so
# the held-out rows are the same and the two runs are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=101)

# Balance the training data using SMOTE; X_test / y_test keep the real class distribution.
# X and y are left untouched, so re-running this cell gives the same result.
sm = SMOTE(random_state=2)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# data only, so nothing about the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Refit a default K Nearest Neighbors classifier (no hyperparameter search) on the
# current training data and predict the held-out rows.
# Recall on the diabetic class is the metric we care most about when reading the results.
knn_clf = KNeighborsClassifier().fit(scaled_X_train, y_train)
y_pred = knn_clf.predict(scaled_X_test)

In [None]:
# Confusion matrix of the refit KNN predictions on the test set
knn_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(knn_cm, display_labels=knn_clf.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for the refit KNN model
print(classification_report(y_test, y_pred))

**Hmm... Our recall has vastly improved for class 1, but at the cost of all other scores.**

**Let's investigate whether hyperplanes are better at separating diabetics from non-diabetics than neighbors.**

In [None]:
# Support Vector Classifier using the six features listed below
strongest_correlated_features = ['heart_disease', 'hypertension', 'bmi', 'age', 'HbA1c_level', 'blood_glucose_level']
X = final_df[strongest_correlated_features]
y = final_df['diabetes']

# Perform the train test split BEFORE resampling, then apply SMOTE to the training
# rows only. Resampling the full data first would leak synthetic samples derived
# from training rows into the test set and evaluate on an artificially balanced
# class mix, inflating the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=101)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# data only, so nothing about the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Fit a Support Vector Machine classifier with scikit-learn's default settings
# (no cross validation is performed here) and predict the held-out rows
svm_clf = SVC().fit(scaled_X_train, y_train)
y_pred = svm_clf.predict(scaled_X_test)

In [None]:
# Confusion matrix of the SVC predictions on the test set
svm_clf_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(svm_clf_cm, display_labels=svm_clf.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for the SVC model
svm_clf_cr = classification_report(y_test, y_pred)
print(svm_clf_cr)

**The Support Vector Machine Classifier is superior to the KNN classifier, with higher Precision and Recall scores for both diabetics and non-diabetics. I think that we can still do better, though.**

**Let's try some Ensemble approaches**

**We are going to train RandomForest, AdaBoost, and GradientBoost classifiers. For these models, we will use every feature to try to predict our label**

In [None]:
# Separate features and label: the ensemble models use every feature
X = final_df.drop('diabetes', axis=1)
y = final_df['diabetes']

# Perform the train test split BEFORE resampling, then apply SMOTE to the training
# rows only. Resampling the full data first would leak synthetic samples derived
# from training rows into the test set and evaluate on an artificially balanced
# class mix, inflating the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=101)
X_train, y_train = sm.fit_resample(X_train, y_train)

**NOTE: We do not need to scale for tree based models**

In [None]:
# Random forest with a fixed seed; oob_score=True also records an out-of-bag score during fitting
forest_clf = RandomForestClassifier(oob_score=True, random_state=101)
forest_clf.fit(X_train, y_train)
highest_y_pred = forest_clf.predict(X_test)

In [None]:
# Confusion matrix of the random forest predictions on the test set
forest_clf_cm = confusion_matrix(y_test, highest_y_pred)
p = ConfusionMatrixDisplay(forest_clf_cm, display_labels=forest_clf.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for the random forest
forest_clf_cr = classification_report(y_test, highest_y_pred)
print(forest_clf_cr)

In [None]:
# Out-of-bag score of the fitted random forest (populated because it was built with oob_score=True)
forest_clf.oob_score_

**That's more like it! 97% across the board for the classification report; including an out of bag score of 96.8%**

**AdaBoost Classifier**

In [None]:
# AdaBoost classifier with default settings and a fixed seed, trained on every feature
ada_clf = AdaBoostClassifier(random_state=101).fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

In [None]:
# Show the features whose AdaBoost importance is above a small threshold (0.0001),
# i.e. hide the features the model effectively never used
feature_imp = pd.DataFrame(index=X_train.columns, data=ada_clf.feature_importances_, columns=['Importance'])
feature_imp = feature_imp[feature_imp['Importance'] > 0.0001]
feature_imp = feature_imp.sort_values('Importance')
sns.barplot(x=feature_imp.index, y='Importance', data=feature_imp)
plt.xlabel('Feature')
# Previously a second plt.xlabel call overwrote the 'Feature' label; the importance
# values are on the y axis
plt.ylabel('Importance')
plt.xticks(rotation=90)
plt.title('Importance by Feature')
plt.show()

In [None]:
# Confusion matrix of the AdaBoost predictions on the test set
ada_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(ada_cm, display_labels=ada_clf.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for AdaBoost
ada_cr = classification_report(y_test, y_pred)
print(ada_cr)

**Another high performing model. Ensemble models appear to be performing well** 

**Let's try a GradientBoost Model**

In [None]:
# Gradient boosting classifier trained on every feature.
# Seeded like the other ensemble models (random_state=101) so the fitted model,
# and therefore the reported scores, are reproducible across runs.
clf = GradientBoostingClassifier(random_state=101)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix of the gradient boosting predictions on the test set
gradient_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(gradient_cm, display_labels=clf.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for gradient boosting
gradient_cr = classification_report(y_test, y_pred)
print(gradient_cr)

**All three ensemble learners performed well. Gradient Boost is the second best model thus far, with excellent recall on class 0 and precision on class 1. Its overall accuracy is just below Random Forest, however.**

**Final Model: Cross Validated LogisticRegression utilizing ElasticNet Regularization**

In [None]:
# Standardize the features for logistic regression; the scaler is fitted on the
# training data only and then applied to both partitions
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Logistic Regression w/ ElasticNet Regularization (almost pure L1: l1_ratio=.99).
# The saga solver is stochastic, so a fixed random_state is supplied to make the
# fitted coefficients and the reported scores reproducible across runs.
model = LogisticRegression(penalty='elasticnet', l1_ratio=.99, C=1, solver='saga', random_state=101)
model.fit(X=scaled_X_train, y=y_train)
y_pred = model.predict(scaled_X_test)

In [None]:
# Confusion matrix of the logistic regression predictions on the test set
log_cm = confusion_matrix(y_test, y_pred)
p = ConfusionMatrixDisplay(log_cm, display_labels=model.classes_)
p.plot()
plt.show()

In [None]:
# Per-class precision / recall / f1 and overall accuracy for logistic regression
log_cr = classification_report(y_test, y_pred)
print(log_cr)

**Logistic Regression is not as accurate for classifying this data set as ensemble learners**

### Summary of Findings

Prior to using SMOTE to balance the dataset,  models performed with a high level of accuracy, but low recall. This is the result of an imbalanced dataset, i.e. many more non-diabetic individuals than diabetic. 

After balancing the dataset using SMOTE, a Random Forest Classifier performed the best with 97% accuracy, precision and recall!

Thank you for taking the time to review my notebook! Please feel free to leave any questions, comments, or critiques! 