In [33]:
import os

# NOTE: the star import stays ahead of the explicit imports so that any name it
# re-exports is still overridden by the explicit imports below, as before.
from src.model_dev import *
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

In [34]:
# Load the preprocessed dataset from disk.
data = pd.read_csv('../data/hd_data_preprocessed.csv')

# Feature subset used for modelling; the 'num' column is the prediction target.
selected_features = ['ca', 'oldpeak', 'age', 'chol', 'thalch', 'thal', 'trestbps']

# Take an explicit copy: a later cell label-encodes columns of X in place, and
# assigning into a bare column selection of `data` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether `data` is modified.
X = data[selected_features].copy()
y = data['num']

In [35]:
# Replace every object/category feature column with integer codes.
# One LabelEncoder is fitted per column and kept in `label_encoders`, keyed by
# column name, so each column's class mapping stays available afterwards
# (e.g. for inverse_transform or for encoding new rows). A single shared
# encoder refit in the loop would only remember the last column's classes.
label_encoders = {}

for col in X.columns:
    if X[col].dtype == 'object' or X[col].dtype == 'category':
        label_encoders[col] = LabelEncoder()
        X[col] = label_encoders[col].fit_transform(X[col])

In [36]:
# Partition the features and target into train and test subsets
# using the project's split_data helper.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = split_data(X, y)

In [37]:
# Candidate classifiers to compare, as a list of (display name, estimator) pairs.
# All are constructed with random_state=42 except KNeighborsClassifier, which
# is created with its defaults. The dict literal preserves insertion order, so
# the resulting list has the same order as the entries below.
models = list({
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
}.items())

In [38]:
# Per-metric weights handed to evaluate_models (the values sum to 1.0).
metric_weights = dict([
    ('Accuracy', 0.2),
    ('Precision', 0.2),
    ('Recall', 0.2),
    ('F1 Score', 0.3),
    ('ROC-AUC', 0.1),
])

# Evaluate every candidate; returns the per-model results together with the
# best model and its name.
results, best_model, best_model_name = evaluate_models(
    X_train,
    X_test,
    y_train,
    y_test,
    models,
    metric_weights,
)

In [39]:
# Print each model's metrics: numeric values with four decimals, the
# confusion matrix on its own lines, and a blank line between models.
for model_label, model_metrics in results.items():
    print(f"Model: {model_label}")
    for metric_label, metric_value in model_metrics.items():
        if metric_label == "Confusion Matrix":
            # Not a scalar: print as-is below its label.
            print(f"{metric_label}:\n{metric_value}")
            continue
        print(f"{metric_label}: {metric_value:.4f}")
    print()


Model: Logistic Regression
Cross-Validation Accuracy: 0.5567
Test Accuracy: 0.6202
Precision: 0.5543
Recall: 0.4144
F1 Score: 0.4235
Confusion Matrix:
[[47  7  0  1  0]
 [11 24  0  0  0]
 [ 2  1  2  4  2]
 [ 0  9  0  7  4]
 [ 0  5  0  3  0]]
ROC-AUC: 0.8114
Weighted Score: 0.5260

Model: Random Forest
Cross-Validation Accuracy: 0.6033
Test Accuracy: 0.6667
Precision: 0.4519
Recall: 0.4740
F1 Score: 0.4621
Confusion Matrix:
[[47  6  2  0  0]
 [ 7 26  2  0  0]
 [ 2  2  3  4  0]
 [ 2  4  3 10  1]
 [ 0  2  1  5  0]]
ROC-AUC: 0.8532
Weighted Score: 0.5425

Model: Gradient Boosting
Cross-Validation Accuracy: 0.5767
Test Accuracy: 0.6589
Precision: 0.4524
Recall: 0.4577
F1 Score: 0.4541
Confusion Matrix:
[[48  5  1  0  1]
 [ 4 26  4  0  1]
 [ 2  2  3  4  0]
 [ 2  3  3  8  4]
 [ 0  2  1  5  0]]
ROC-AUC: 0.8287
Weighted Score: 0.5329

Model: KNeighbors
Cross-Validation Accuracy: 0.5233
Test Accuracy: 0.5349
Precision: 0.3489
Recall: 0.3516
F1 Score: 0.3440
Confusion Matrix:
[[41 13  1  0  0]
 [

In [40]:
# Report the best model returned by evaluate_models: the estimator held in the
# pipeline's 'model' step, its display name, and its weighted score.
print(
    f"Best Model: {best_model.named_steps['model']}",
    f"Best Model Name: {best_model_name}",
    f"Best Weighted Score: {results[best_model_name]['Weighted Score']:.4f}",
    sep="\n",
)

Best Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Best Model Name: XGBoost
Best Weighted Score: 0.5788


In [41]:
# Run the hyperparameter search for all models, reusing the same metric
# weights as the untuned comparison.
tuned_models = tune_all_models(
    X_train,
    y_train,
    X_test,
    y_test,
    metric_weights=metric_weights,
)


Tuning Logistic Regression...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Logistic Regression:
{'model__solver': 'sag', 'model__penalty': 'l2', 'model__C': 545.5594781168514}
Weighted score: 0.5110

Tuning Random Forest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Random Forest:
{'model__n_estimators': 800, 'model__min_samples_split': 11, 'model__min_samples_leaf': 2, 'model__max_features': None, 'model__max_depth': 11}
Weighted score: 0.5536

Tuning Gradient Boosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Gradient Boosting:
{'model__subsample': 0.5, 'model__n_estimators': 800, 'model__max_features': None, 'model__max_depth': 7, 'model__learning_rate': 0.046415888336127774}
Weighted score: 0.5463

Tuning KNeighbors...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for KNeighbors:
{'model__weights': 'distance', 'model__n_neighbors': 

In [42]:
# Pick the tuned model with the highest 'best_score'; entries that lack a
# score compare as -inf and therefore cannot win.
best_model_name = max(
    tuned_models,
    key=lambda candidate: tuned_models[candidate].get('best_score', float('-inf')),
)
best_model = tuned_models[best_model_name]['best_estimator']

print(
    f"\nBest overall model: {best_model_name}",
    f"Best overall score: {tuned_models[best_model_name]['best_score']:.4f}",
    sep="\n",
)


Best overall model: XGBoost
Best overall score: 0.5834


In [43]:
# Rank the tuned models from highest to lowest 'best_score'.
# Entries without a 'best_score' are ranked last (treated as -inf) instead of
# raising KeyError, consistent with how the best overall model is selected.
sorted_models = sorted(
    tuned_models.items(),
    key=lambda item: item[1].get('best_score', float('-inf')),
    reverse=True,
)

In [44]:
# Persist the three best tuned models and a small summary dict describing them.
models_dir = '../models'
# joblib.dump does not create missing directories, so make sure the output
# directory exists before anything is written (no-op when it already does).
os.makedirs(models_dir, exist_ok=True)

top_3_models = {}
for rank, (model_name, model_info) in enumerate(sorted_models[:3], start=1):
    # File name encodes the rank and a snake_case version of the model name.
    file_name = f'{models_dir}/rank_{rank}_{model_name.lower().replace(" ", "_")}_model_final.joblib'
    save_model(model_info['best_estimator'], file_name)
    print(f"Rank {rank} model ({model_name}) saved to '{file_name}'")
    print(f"Weighted Score: {model_info['best_score']:.4f}")
    print(f"Parameters: {model_info['best_params']}")
    print()

    # Summary entry stored alongside the models for later lookup.
    top_3_models[model_name] = {
        'rank': rank,
        'score': model_info['best_score'],
        'file_name': file_name
    }

    print(f"Metrics for {model_name}:")
    for metric_name, metric_value in model_info['metrics'].items():
        if metric_name != "Confusion Matrix":
            print(f"{metric_name}: {metric_value:.4f}")
        else:
            # Not a scalar: print as-is below its label.
            print(f"{metric_name}:\n{metric_value}")
    print()

# Single definition of the summary path, used for both the dump and the message.
summary_path = f'{models_dir}/top_3_models_info_final.joblib'
joblib.dump(top_3_models, summary_path)
print(f"Information about top 3 models saved to '{summary_path}'")


Rank 1 model (XGBoost) saved to '../models/rank_1_xgboost_model_final.joblib'
Weighted Score: 0.5834
Parameters: {'model__subsample': 0.7999999999999999, 'model__n_estimators': 500, 'model__min_child_weight': 1, 'model__max_depth': 9, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8999999999999999}

Metrics for XGBoost:
accuracy: 0.6977
precision: 0.4967
recall: 0.5304
f1: 0.5127
roc_auc: 0.8462

Rank 2 model (Random Forest) saved to '../models/rank_2_random_forest_model_final.joblib'
Weighted Score: 0.5536
Parameters: {'model__n_estimators': 800, 'model__min_samples_split': 11, 'model__min_samples_leaf': 2, 'model__max_features': None, 'model__max_depth': 11}

Metrics for Random Forest:
accuracy: 0.6822
precision: 0.4628
recall: 0.4813
f1: 0.4715
roc_auc: 0.8692

Rank 3 model (Gradient Boosting) saved to '../models/rank_3_gradient_boosting_model_final.joblib'
Weighted Score: 0.5463
Parameters: {'model__subsample': 0.5, 'model__n_estimators': 800, 'model__max_features': Non

In [45]:
# One-line recap per saved model, in the order they were inserted into
# top_3_models.
print("\nTop 3 Models Summary:")
for saved_name in top_3_models:
    entry = top_3_models[saved_name]
    print(f"Rank {entry['rank']}: {saved_name} (Score: {entry['score']:.4f})")


Top 3 Models Summary:
Rank 1: XGBoost (Score: 0.5834)
Rank 2: Random Forest (Score: 0.5536)
Rank 3: Gradient Boosting (Score: 0.5463)
