## Source: Pima Indians Diabetes Database
## TASK: Predict the probability of diabetes occurrence based on diagnostic measures.
## **First, Importing all necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For Data Cleaning
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor

# For Data Splitting
from sklearn.model_selection import train_test_split

# For Data Preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# For Model Building
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# For Model Evaluation
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, mean_squared_error
from sklearn import metrics

# For Model Tuning
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

## Loading the dataset

In [None]:
# Load the dataset from 'diabetes.csv' (relative path) and preview its first rows.
diabetesDF = pd.read_csv('diabetes.csv')
diabetesDF.head()

## Plotting the data distributions

In [None]:
# Histogram grid of the raw (not yet cleaned) DataFrame to inspect each column's distribution.
diabetesDF.hist(figsize=(15,12))
plt.show()

<h3><b>Description of the dataset</b></h3>

In [None]:
# Column names, dtypes and non-null counts of the raw data.
diabetesDF.info()

In [None]:
# Summary statistics of the raw data, transposed so that each column becomes a row.
diabetesDF.describe().T

<h3>Interpretation</h3>
1. Pregnancies: Women in this dataset have an average of 3.85 pregnancies, with a range from 0 to 17.<br>
2. Glucose: The average glucose level is 120.89, with values ranging from 0 to 199. The presence of 0 values might indicate missing data.<br>
3. Blood Pressure: Average blood pressure is 69.11, with a range from 0 to 122. Similar to glucose, 0 values could indicate missing data.<br>
4. Skin Thickness: The average skin thickness is 20.54, with many 0 values indicating possible missing data.<br>
5. Insulin: Insulin levels vary widely (mean 79.80, std 115.24), with many 0 values, suggesting a lot of missing or unrecorded data.<br>
6. BMI: The average BMI is 31.99, which falls in the obese range (BMI ≥ 30), with values up to 67.1.<br>
7. Diabetes Pedigree Function: This variable measures genetic influence, with an average value of 0.47.<br>
8. Age: The average age is 33.24, ranging from 21 to 81, indicating a relatively young to middle-aged population.<br>
9. Outcome: About 34.9% of individuals have diabetes (mean outcome of 0.35).<br>

However, some of these statistics are distorted by placeholder zeros.<br><br>
In the columns below, a value of zero is not physiologically plausible and therefore indicates a missing value:<br><br>
Glucose<br>
BloodPressure<br>
SkinThickness<br>
Insulin<br>
BMI<br>


## Replacing zeros with NaN (so that counting and manipulating them is easier)...

In [None]:
# Columns in which a zero cannot be a real measurement and is treated as "missing".
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw DataFrame stays untouched.
diabetesDF_copy = diabetesDF.copy(deep=True)

# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
diabetesDF_copy[zero_as_missing_cols] = diabetesDF_copy[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Number of missing (NaN) values in each column
diabetesDF_copy.isnull().sum()

## Imputation...

Using the KNN Imputer instead of simple mean or median imputation methods because it leverages the relationships between features by considering the k-nearest neighbors, leading to more accurate and appropriate imputations.


In [None]:
# Fill the NaN placeholders with the average of the 2 nearest neighbours (uniform weights),
# where neighbours are found using only these five columns.
imputed_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_values = knn_imputer.fit_transform(diabetesDF_copy[imputed_cols])
diabetesDF_copy[imputed_cols] = imputed_values

In [None]:
# Summary statistics after imputation (one row per column) for comparison with the raw summary.
diabetesDF_copy.describe().T

## **Exploratory Data Analysis**

In [None]:
# Subset of columns to compare pairwise; 'Outcome' is included because it drives the colouring.
col = ['Glucose','BloodPressure','Insulin','Age','Outcome','BMI']

# Pairwise plots of the selected columns, coloured by Outcome.
sns.pairplot(diabetesDF_copy[col] ,hue='Outcome')

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# whose ticks on both axes are labelled with the column names.
corr = diabetesDF_copy.corr()
axis_labels = corr.columns
sns.heatmap(corr, annot=True, xticklabels=axis_labels, yticklabels=axis_labels)

In [None]:
# Glucose plotted against BMI with seaborn's regression fit overlaid.
sns.regplot(x='BMI', y= 'Glucose', data=diabetesDF_copy)

In [None]:
# Insulin plotted against Glucose, with points coloured by Outcome.
sns.scatterplot(x='Glucose', y= 'Insulin', data=diabetesDF_copy, hue='Outcome')

In [None]:
# Filled density estimate of Insulin, drawn separately for each Outcome value.
sns.kdeplot(data=diabetesDF_copy, x='Insulin',hue='Outcome' ,fill=True)

## Checking for Outliers...

In [None]:
# Flag every column that has at least one value outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for feature in diabetesDF_copy:
    column = diabetesDF_copy[feature]

    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Test both fences: the lower fence used to be computed but never applied,
    # so unusually small values were silently ignored.
    # The mask is reduced directly instead of testing the truthiness of the
    # selected rows' cell values.
    has_outliers = ((column < lower) | (column > upper)).any()

    if has_outliers:
        print(feature, "yes")
    else:
        print(feature, "no")

## Plotting boxplots to visualize outliers...

In [None]:
# One horizontal boxplot per feature, laid out on a 2x3 grid (the sixth slot stays empty).
plt.figure(figsize=(14,10))
sns.set_style(style='whitegrid')

boxplot_features = ['BloodPressure', 'Insulin', 'BMI', 'Age', 'SkinThickness']
for slot, feature_name in enumerate(boxplot_features, start=1):
    plt.subplot(2, 3, slot)
    sns.boxplot(x=feature_name, data=diabetesDF_copy)

In [None]:
# Insulin distribution as one boxplot per Outcome value.
sns.boxplot(x='Outcome',y='Insulin',data=diabetesDF_copy)

## Detecting outliers using Local Outlier Factor (LOF)...

In [None]:
# Fit a Local Outlier Factor model (10 neighbours) on every column of diabetesDF_copy.
# NOTE(review): at this point the frame still contains the 'Outcome' target and the
# features have not been scaled, so both influence the neighbour distances —
# confirm this is intended.
lof =LocalOutlierFactor(n_neighbors= 10)
lof.fit_predict(diabetesDF_copy)

In [None]:
# Per-row LOF scores from the fitted model; print the 30 lowest, since the cells
# below treat the lowest scores as the outliers.
df_scores = lof.negative_outlier_factor_
print(np.sort(df_scores)[0:30])

## Setting a threshold for outliers...

In [None]:
# Cut-off = the 8th-lowest LOF score (index 7 of the ascending sort).
# NOTE(review): the index 7 is hard-coded, presumably chosen by inspecting the printed scores.
threshold = np.sort(df_scores)[7]
threshold

## Removing all outliers based on set threshold...

In [None]:
# Despite its name, `outlier` is True for the rows that are KEPT: those whose score is
# strictly greater than the threshold. Rows scoring at or below the threshold are dropped.
outlier = df_scores > threshold
diabetesDF_cleaned = diabetesDF_copy[outlier]
# Shape of the cleaned frame, to confirm how many rows were removed.
diabetesDF_cleaned.shape

## Splitting the data into target and features...

In [None]:
# Target vector: the 'Outcome' column; feature matrix: every other column.
target_column = 'Outcome'
y = diabetesDF_cleaned[target_column]
X = diabetesDF_cleaned.drop(columns=target_column)

## Splitting the data into training and testing data...

In [None]:
# Hold out 20% of the rows for testing; stratify on y so that both splits keep
# the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Model Evaluation and Hyperparameter Tuning

**Defining models and param grids**

In [None]:
# Define models and their parameter grids
# Each entry maps a display name to (estimator, param_grid). The 'classifier__' prefix
# in every grid key targets the pipeline step named 'classifier' that
# train_and_evaluate_with_grid_search builds around the estimator.
models = {
    'Decision Tree': (DecisionTreeClassifier(random_state=90), {
        'classifier__max_depth': [None, 5, 10, 15]
    }),
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': [50, 80, 100],
        'classifier__max_depth': [None, 10, 20, 30]
    }),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7, 9]
    }),
    'Logistic Regression': (LogisticRegression(random_state=90), {
        'classifier__C': [0.01, 0.1, 1, 10, 100]
    })
}

**Defining a function that builds a pipeline, performs a grid search with cross-validation, and evaluates the best model on the test set**


In [None]:
# Build the scale -> SMOTE -> classifier pipeline, tune it, and report test-set results
def train_and_evaluate_with_grid_search(model, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``model`` with a 5-fold grid search and evaluate the winner on the test set.

    The estimator is wrapped in a pipeline of ``StandardScaler`` -> ``SMOTE`` ->
    ``model`` (step name ``'classifier'``), so ``param_grid`` keys must use the
    ``classifier__`` prefix. The grid search is scored on accuracy.

    Prints the best parameters, the best cross-validation score, the test
    accuracy, the macro-averaged F1 score and the classification report, then
    shows the confusion matrix as a heatmap.

    Parameters:
        model: Unfitted classifier placed at the end of the pipeline.
        param_grid: Grid of pipeline parameters to search.
        X_train, y_train: Training features and labels used for the grid search.
        X_test, y_test: Held-out features and labels used for the final report.

    Returns:
        Tuple ``(accuracy, f1, best_params)`` computed on the test set.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=99)),
        ('classifier', model),
    ]
    search = GridSearchCV(Pipeline(steps), param_grid, cv=5, scoring='accuracy')
    search.fit(X_train, y_train)

    best_params = search.best_params_
    cv_score = search.best_score_
    tuned_pipeline = search.best_estimator_

    # Score the refitted best pipeline on the held-out split
    predictions = tuned_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='macro')

    print(f"Best parameters for {model}: {best_params}")
    print(f"Best cross-validation score for {model}: {cv_score}")
    print("Test Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Classification Report:\n", classification_report(y_test, predictions))

    # Confusion matrix: rows are actual classes, columns are predicted classes
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='g')
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    return accuracy, f1, best_params



# Collect one summary row per model so they can be compared side by side
results = []
summary_fields = ('Model', 'Best Parameters', 'Test Accuracy', 'F1 Score')

for model_name, (model, param_grid) in models.items():
    print(f"Evaluating {model_name}...")
    accuracy, f1, best_params = train_and_evaluate_with_grid_search(
        model, param_grid, X_train, y_train, X_test, y_test
    )
    summary_values = (model_name, best_params, accuracy, f1)
    results.append(dict(zip(summary_fields, summary_values)))

# Tabulate the collected rows
results_df = pd.DataFrame(results)
print(results_df)

## Feature Importance Analysis

**Feature Importance Analysis for RandomForestClassifier**

In [None]:
# Fit a Random Forest with fixed hyperparameters on the training split
# (not the full dataset) and rank the features by importance.
best_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
best_model.fit(X_train, y_train)

importances = best_model.feature_importances_

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Ranked listing, starting at 1
for rank, feature_index in enumerate(indices, start=1):
    print(f"{rank}. {X.columns[feature_index]}: {importances[feature_index]}")


In [None]:
# Bar chart of the Random Forest importances, most important feature first
bar_positions = range(X.shape[1])
ranked_names = X.columns[indices]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

**Feature Importance Analysis for DecisionTreeClassifier**

In [None]:
# Fit a depth-5 Decision Tree on the training split (not the full dataset)
# and rank the features by importance.
decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=90)
decision_tree_model.fit(X_train, y_train)

importances_dt = decision_tree_model.feature_importances_

# Feature positions ordered from most to least important
indices_dt = np.argsort(importances_dt)[::-1]

# Ranked listing, starting at 1
for rank, feature_index in enumerate(indices_dt, start=1):
    print(f"{rank}. {X.columns[feature_index]}: {importances_dt[feature_index]}")


In [None]:
# Bar chart of the Decision Tree importances, most important feature first
bar_positions = range(X.shape[1])
ranked_names = X.columns[indices_dt]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances (Decision Tree)")
plt.bar(bar_positions, importances_dt[indices_dt], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

**Feature Importance Analysis for LogisticRegression**

In [None]:
# Apply SMOTE to the training split only, to handle class imbalance
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Feature scaling: fit the scaler on the balanced training data and reuse it for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# Fit Logistic Regression on the SCALED balanced training data.
# The model used to be fit on the unscaled X_train_balanced, which left the scaler
# unused and made coefficient magnitudes depend on each feature's units, so they
# could not be compared as importances.
logistic_regression_model = LogisticRegression(random_state=90)
logistic_regression_model.fit(X_train_scaled, y_train_balanced)

# Get feature coefficients (one per feature, for the positive class)
coefficients = logistic_regression_model.coef_[0]

# Order features by absolute coefficient, largest first
indices_lr = np.argsort(np.abs(coefficients))[::-1]

# Print the signed coefficients in that order; X.columns supplies the names because
# the scaler output carries no column labels.
for rank, feature_index in enumerate(indices_lr, start=1):
    print(f"{rank}. {X.columns[feature_index]}: {coefficients[feature_index]}")

In [None]:
# Bar chart of the absolute Logistic Regression coefficients, largest first
bar_positions = range(X.shape[1])
abs_coefficients = np.abs(coefficients[indices_lr])

plt.figure(figsize=(10, 6))
plt.title("Feature Coefficients (Logistic Regression)")
plt.bar(bar_positions, abs_coefficients, align="center")
plt.xticks(bar_positions, X.columns[indices_lr], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Coefficient (Absolute Value)")
plt.tight_layout()
plt.show()