In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [50]:
df=pd.read_csv('diabetes.csv')

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [52]:
df.shape

(768, 9)

In [53]:
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

df[zero_cols] = df[zero_cols].replace(0, np.nan)

df[zero_cols] = df[zero_cols].fillna(df[zero_cols].median())

In [54]:
 
import numpy as np
import pandas as pd
from scipy.stats import zscore

def is_skewed(series, threshold=0.5):
    return abs(series.skew()) > threshold

 
def handle_outliers_zscore(data, column, threshold=3):
    z_scores = zscore(data[column])
    outliers = (z_scores > threshold) | (z_scores < -threshold)
    data.loc[outliers, column] = data[column].median()
    return data
 
def handle_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (data[column] < lower_bound) | (data[column] > upper_bound)
    data.loc[outliers, column] = data[column].median()
    return data

 
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

for column in numerical_columns:
    if is_skewed(df[column]):   # if skewed
        df = handle_outliers_iqr(df, column)
        print(f"IQR applied to {column}")
    else:                       # if approximately normal
        df = handle_outliers_zscore(df, column)
        print(f"Z-score applied to {column}")


IQR applied to Pregnancies
IQR applied to Glucose
Z-score applied to BloodPressure
IQR applied to SkinThickness
IQR applied to Insulin
IQR applied to BMI
IQR applied to DiabetesPedigreeFunction
IQR applied to Age
IQR applied to Outcome


In [55]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\zarka\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [58]:
models1 = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Linear DA": LinearDiscriminantAnalysis(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100,random_state=42),
    "Extra Tree": ExtraTreesClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss"),
    "LightGBM": lgb.LGBMClassifier(min_gain_to_split=0.01  )
}

In [59]:
trained_models = {}

for name, model in models1.items():
    model.fit(X_train_scaled, y_train)  # training on full training data
    trained_models[name] = model       


[LightGBM] [Info] Number of positive: 399, number of negative: 401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498750 -> initscore=-0.005000
[LightGBM] [Info] Start training from score -0.005000


In [60]:
X_test_scaled = scaler.transform(X_test)  # Do NOT fit again on test data

# Evaluate all trained models on test set
from sklearn.metrics import accuracy_score
results_list = []
for name, model in trained_models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results_list.append([name, accuracy, precision, recall, f1])

results = pd.DataFrame(results_list, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

print(results)
    


                     Model  Accuracy  Precision    Recall  F1-Score
0      Logistic Regression     0.750   0.742857  0.772277  0.757282
1                Linear DA     0.745   0.740385  0.762376  0.751220
2      K-Nearest Neighbors     0.730   0.694215  0.831683  0.756757
3            Decision Tree     0.745   0.731481  0.782178  0.755981
4            Random Forest     0.810   0.764706  0.900990  0.827273
5               Extra Tree     0.810   0.756098  0.920792  0.830357
6   Support Vector Machine     0.775   0.741379  0.851485  0.792627
7        Gradient Boosting     0.765   0.732759  0.841584  0.783410
8                 AdaBoost     0.780   0.766355  0.811881  0.788462
9              Naive Bayes     0.720   0.714286  0.742574  0.728155
10                 XGBoost     0.760   0.734513  0.821782  0.775701
11                LightGBM     0.780   0.752212  0.841584  0.794393




In [61]:
# Sort results by F1-Score (or any other preferred metric)
results_sorted = results.sort_values(by="F1-Score", ascending=False)
print("\nSorted Model Performance:\n", results_sorted)

# Select the best model based on F1-Score
best_model_name = results_sorted.iloc[0]['Model']
best_model = trained_models[best_model_name]

print(f"Best model selected: {best_model_name}")



Sorted Model Performance:
                      Model  Accuracy  Precision    Recall  F1-Score
5               Extra Tree     0.810   0.756098  0.920792  0.830357
4            Random Forest     0.810   0.764706  0.900990  0.827273
11                LightGBM     0.780   0.752212  0.841584  0.794393
6   Support Vector Machine     0.775   0.741379  0.851485  0.792627
8                 AdaBoost     0.780   0.766355  0.811881  0.788462
7        Gradient Boosting     0.765   0.732759  0.841584  0.783410
10                 XGBoost     0.760   0.734513  0.821782  0.775701
0      Logistic Regression     0.750   0.742857  0.772277  0.757282
2      K-Nearest Neighbors     0.730   0.694215  0.831683  0.756757
3            Decision Tree     0.745   0.731481  0.782178  0.755981
1                Linear DA     0.745   0.740385  0.762376  0.751220
9              Naive Bayes     0.720   0.714286  0.742574  0.728155
Best model selected: Extra Tree


In [62]:
import joblib
import numpy as np
import os

os.makedirs("static", exist_ok=True)
 
joblib.dump(best_model, "static/best_model.pkl")
print("Best model saved as 'best_model.pkl'!")
 
joblib.dump(scaler, "static/scaler.pkl")
print("Scaler saved!")

joblib.dump(X_train_scaled, "static/xtrain_scaled.pkl")
print("Scaled X_train saved for LIME!")

feature_names = X.columns.tolist()  
joblib.dump(feature_names, "static/feature_names.pkl")
print("Feature names saved!")
 
class_names = ['Non-Diabetic', 'Diabetic']
joblib.dump(class_names, "static/class_names.pkl")
print("Class names saved!")

Best model saved as 'best_model.pkl'!
Scaler saved!
Scaled X_train saved for LIME!
Feature names saved!
Class names saved!
