In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [9]:

# Load the dataset
data_path = '../data/diabetes_data.csv'
data= pd.read_csv(data_path, delimiter=';')


In [10]:
data['gender'] = LabelEncoder().fit_transform(data['gender'])

# Scale the features
features = data.drop('class', axis=1)  # excluding the target variable
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
X = pd.DataFrame(scaled_features, columns=features.columns)
y = data['class']

In [11]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=1)
}

In [12]:
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    results[name] = scores

# Output the results
for name, scores in results.items():
    print(f"{name}: Mean accuracy = {np.mean(scores):.2f}%, Standard Deviation = {np.std(scores):.2f}%")

Logistic Regression: Mean accuracy = 0.93%, Standard Deviation = 0.03%
Support Vector Machine: Mean accuracy = 0.97%, Standard Deviation = 0.03%
Random Forest: Mean accuracy = 0.97%, Standard Deviation = 0.03%


In [13]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [15]:
for name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # Predict on test data
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)

    # Display confusion matrix and classification report
    print(f"Model: {name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))
    print("\n" + "-"*60 + "\n")


Model: Logistic Regression
Confusion Matrix:
[[34  4]
 [ 4 62]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        38
           1       0.94      0.94      0.94        66

    accuracy                           0.92       104
   macro avg       0.92      0.92      0.92       104
weighted avg       0.92      0.92      0.92       104

ROC-AUC Score: 0.9712918660287081

------------------------------------------------------------

Model: Support Vector Machine
Confusion Matrix:
[[35  3]
 [ 3 63]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        38
           1       0.95      0.95      0.95        66

    accuracy                           0.94       104
   macro avg       0.94      0.94      0.94       104
weighted avg       0.94      0.94      0.94       104

ROC-AUC Score: 0.9956140350877193

-------------------------------------

In [1]:
params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'class_weight': 'balanced',
    'random_state': 1
}

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Random Forest classifier on scaled data
clf_scaled = RandomForestClassifier(**params)
clf_scaled.fit(X_train_scaled, y_train)

In [7]:
feature_importances = clf_scaled.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importances
importance_df


Unnamed: 0,Feature,Importance
2,polydipsia,0.236022
1,polyuria,0.221739
0,age,0.087949
15,gender_Female,0.058933
3,sudden_weight_loss,0.050917
11,partial_paresis,0.041279
16,gender_Male,0.038583
13,alopecia,0.0382
9,irritability,0.036281
10,delayed_healing,0.029627


In [None]:

# Evaluate the model
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
fscore = fbeta_score(y_test, predictions, beta=0.5)

accuracy, fscore


(0.9711538461538461, 0.9730538922155688)

In [None]:
model_path = '../models/random_forest_model.pkl'
joblib.dump(clf, model_path)

['../models/random_forest_model.pkl']

: 