In this notebook we explore the data preprocessing features provided by the scikit-learn library.

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the diabetes CSV, then separate the 'Outcome' target column from the
# remaining columns, which serve as the feature matrix.
raw_data = pd.read_csv('../../data/external/diabetes.csv')
y = raw_data['Outcome']
X = raw_data.drop('Outcome', axis=1)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# split reproducible across runs.
# NOTE(review): this split is not stratified on y, whereas the cross-validation
# below uses a stratified splitter -- confirm that is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Candidate classifiers for the binary diabetes outcome, keyed by display name.
# Insertion order determines the order in which they are trained and reported.
# NOTE(review): no feature-scaling step precedes these estimators here.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['SVM'] = SVC(random_state=42)

# One (initially empty) list of per-fold accuracies for every model.
results = {name: [] for name in models}

print("Training models using cross-validation folds...")
print("=" * 50)

Training models using cross-validation folds...


In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

# Evaluate every candidate model on 5 stratified shuffle splits drawn from the
# training data only (20% of it is used for validation in each split).
sss_cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=67)

# Start from empty score lists so that re-running this cell does not append a
# second batch of fold scores to the ones already recorded (which would skew
# the means computed in the summary).
results = {model_name: [] for model_name in models}

# Perform cross-validation; folds are numbered from 1 for display.
for fold_num, (train_idx, val_idx) in enumerate(sss_cv.split(X_train, y_train), start=1):
    print(f"\nFold {fold_num}:")
    print("-" * 20)

    # The splitter yields positional indices, hence .iloc rather than .loc.
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Train and evaluate each model on this fold. The estimator objects stored
    # in `models` are fitted in place and reused on the next fold.
    for model_name, model in models.items():
        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict(X_fold_val)

        # Record the validation accuracy for the summary cell.
        accuracy = accuracy_score(y_fold_val, y_pred)
        results[model_name].append(accuracy)

        print(f"{model_name}: {accuracy:.4f}")


Fold 1:
--------------------
Logistic Regression: 0.7805
Random Forest: 0.7967
SVM: 0.7805

Fold 2:
--------------------
Logistic Regression: 0.8049
Random Forest: 0.8211
SVM: 0.7967

Fold 3:
--------------------
Logistic Regression: 0.7317
Random Forest: 0.7642
SVM: 0.6911

Fold 4:
--------------------
Logistic Regression: 0.6992
Random Forest: 0.7724
SVM: 0.7317

Fold 5:
--------------------
Logistic Regression: 0.7480
Random Forest: 0.7642
SVM: 0.7154


In [7]:
# Summarise the cross-validation run: per-model fold scores, mean and spread.
print(f"\n{'=' * 50}")
print("CROSS-VALIDATION RESULTS SUMMARY")
print("=" * 50)

for model_name in results:
    scores = results[model_name]
    formatted_scores = [f'{score:.4f}' for score in scores]
    # np.std uses its default settings here (population standard deviation).
    mean_accuracy, std_accuracy = np.mean(scores), np.std(scores)
    print(f"\n{model_name}:")
    print(f"  Individual fold scores: {formatted_scores}")
    print(f"  Mean accuracy: {mean_accuracy:.4f} (+/- {std_accuracy:.4f})")

# Pick the model with the highest mean fold accuracy; on a tie, the model that
# appears first in `results` wins.
mean_by_model = {name: np.mean(fold_scores) for name, fold_scores in results.items()}
best_model_name = max(mean_by_model, key=mean_by_model.get)
best_mean_score = mean_by_model[best_model_name]

print(f"\nBest performing model: {best_model_name}")
print(f"Best mean accuracy: {best_mean_score:.4f}")


CROSS-VALIDATION RESULTS SUMMARY

Logistic Regression:
  Individual fold scores: ['0.7805', '0.8049', '0.7317', '0.6992', '0.7480']
  Mean accuracy: 0.7528 (+/- 0.0369)

Random Forest:
  Individual fold scores: ['0.7967', '0.8211', '0.7642', '0.7724', '0.7642']
  Mean accuracy: 0.7837 (+/- 0.0222)

SVM:
  Individual fold scores: ['0.7805', '0.7967', '0.6911', '0.7317', '0.7154']
  Mean accuracy: 0.7431 (+/- 0.0397)

Best performing model: Random Forest
Best mean accuracy: 0.7837


In [8]:
# Refit the cross-validation winner on the whole training set and score it
# once on the held-out test set.
print("\n" + "=" * 50, "FINAL MODEL EVALUATION", "=" * 50, sep="\n")

# Look up the winning estimator by the name selected in the summary cell.
best_model = models[best_model_name]

# Fit on all training rows, then predict the test rows.
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

# Headline metric: plain accuracy on the test set.
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Model: {best_model_name}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix of true labels (first argument) against predictions.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_test_pred)
print(cm)


FINAL MODEL EVALUATION
Model: Random Forest
Test Set Accuracy: 0.7208

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154


Confusion Matrix:
[[77 22]
 [21 34]]
