In [23]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# 1. Load dataset
heart_data = pd.read_csv('/content/heart (1).csv')  # Change path accordingly

# 2. Split features and target
X = heart_data.drop(columns='target')
Y = heart_data['target']

# 3. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Stratified train-test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, Y, test_size=0.2, stratify=Y, random_state=42
)

# 5. Define XGBoost model
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 6. Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 150, 200],
    'subsample': [0.6, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# 7. Set up RandomizedSearchCV with 5-fold Stratified CV
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5),
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# 8. Run hyperparameter tuning on training data only
random_search.fit(X_train, Y_train)

print("Best Parameters:", random_search.best_params_)
print("Best CV Accuracy:", random_search.best_score_)

# 9. Train best model on full training data
best_model = random_search.best_estimator_
best_model.fit(X_train, Y_train)

# 10. Evaluate model performance
train_accuracy = accuracy_score(Y_train, best_model.predict(X_train))
test_accuracy = accuracy_score(Y_test, best_model.predict(X_test))

print(f"✅ Training Accuracy: {train_accuracy * 100:.2f} %")
print(f"✅ Test Accuracy: {test_accuracy * 100:.2f} %")

# 11. Save model to file
filename = 'best_heart_disease_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

print(f"Model saved to {filename}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 150, 'max_depth': 6, 'learning_rate': 0.15, 'gamma': 0, 'colsample_bytree': 0.7}
Best CV Accuracy: 0.9865853658536585
✅ Training Accuracy: 100.00 %
✅ Test Accuracy: 100.00 %
Model saved to best_heart_disease_model.sav


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

