In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [4]:
# Read the cleaned heart-disease table from disk
data = pd.read_csv('~/data/heart_disease_cleaned.csv')

# Separate the label column ('target') from the feature columns
y = data['target']
X = data.drop(columns=['target'])

# Hold out 10% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models to train
# Every estimator is seeded so that a fresh "Restart & Run All" reproduces the
# same fitted models and scores: the tree ensembles use randomness while
# fitting, and the 'liblinear' / 'saga' solvers tried in the grid search
# shuffle the data. 42 matches the seed already used for the train/test split.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Hyperparameter search space for each candidate, keyed by the same display
# names used in `models`; every inner mapping goes from an estimator
# parameter name to the list of values to try
param_grid = {
    'Logistic Regression': dict(
        C=[0.01, 0.1, 1, 10, 100],
        solver=['liblinear', 'saga'],
    ),
    'Random Forest': dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 10],
    ),
}

# Run a 5-fold, accuracy-scored grid search for every candidate model.
# `grid_searches` keeps the fitted search objects (CV scores, best params);
# `best_models` keeps the winning estimator of each search.
# NOTE(review): X_train_scaled was standardized before the folds are formed,
# so each validation fold has already contributed to the scaling statistics.
grid_searches = {}
best_models = {}
for name in models:
    model = models[name]
    print(f"Training {name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[name],
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train_scaled, y_train)
    grid_searches[name] = grid_search
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best CV score for {name}: {grid_search.best_score_:.3f}")

# Evaluate every tuned model on the held-out test set and persist each one
for name, model in best_models.items():
    predictions = model.predict(X_test_scaled)
    print(f"\n{name} - Test Set Performance:")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.3f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, predictions)}")
    print(f"Classification Report:\n{classification_report(y_test, predictions)}")
    # Save the model (file name pattern left unchanged so existing loaders
    # that expect '<display name>_model.pkl' keep working)
    joblib.dump(model, f'{name}_model.pkl')

# The models above were fit on standardized features, so they only give
# meaningful predictions on inputs passed through this same fitted scaler.
# Persist it next to the model files so the artifacts are usable on new data.
joblib.dump(scaler, 'scaler.pkl')

# Identifying the best overall model
# Rank the candidates by their cross-validated accuracy on the training data.
# Choosing the winner by its test-set score would use the hold-out data for
# model selection, making the test accuracy reported for the chosen model
# optimistically biased.
best_overall_model = max(grid_searches, key=lambda k: grid_searches[k].best_score_)
best_model = best_models[best_overall_model]
best_grid_search = grid_searches[best_overall_model]

# Score the selected model on the held-out test set
predictions = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
class_report = classification_report(y_true=y_test, y_pred=predictions)

# Summarize the winner: its name, tuned parameters, CV score and test metrics
# (one print call; sep="\n" puts each item on its own line)
print(
    f"\nBest Model: {best_overall_model}",
    f"Best Model Parameters: {best_grid_search.best_params_}",
    f"Best Cross-Validation Score: {best_grid_search.best_score_:.3f}",
    f"Accuracy: {accuracy:.3f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

Training Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'solver': 'saga'}
Best CV score for Logistic Regression: 0.817
Training Random Forest...
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Best CV score for Random Forest: 0.808
Training Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best CV score for Gradient Boosting: 0.804

Logistic Regression - Test Set Performance:
Accuracy: 0.733
Confusion Matrix:
[[27 13]
 [11 39]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.68      0.69        40
           1       0.75      0.78      0.76        50

    accuracy                           0.73        90
   macro avg       0.73      0.73      0.73        90
weighted avg       0.73      0.73      0.73        90


Random Forest - Test Set Performance:
Accuracy: 0.733
Confusion Mat