In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,StratifiedKFold,train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('heart.csv')
df1= pd.read_csv('healthdata.csv')

In [8]:
df1.rename(columns={'num':'target'}, inplace=True)

In [9]:
df_merged = pd.concat([df1, df], axis=0, ignore_index=True)

In [10]:
df_merged.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [11]:
df_new = df_merged.copy()
df_new['ca'].fillna(df['ca'].mean(), inplace=True)
df_new['thal'].fillna(df['thal'].mean(), inplace=True)

In [12]:
df_new.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [21]:
df_new['target'].value_counts()

1    665
0    663
Name: target, dtype: int64

In [13]:
X = df_new.drop('target', axis=1)
y = df_new['target']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [23]:
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Create the pipeline
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])


# Define the parameter grid for hyperparameter tuning
param_grid_xgb = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5],
    'classifier__min_child_weight': [1, 2, 4],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.8, 0.9, 1.0]
}

# Create the cross-validation object
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search_xgb.fit(X_train, y_train)

# Get the best estimator and the best hyperparameters
best_xgb = grid_search_xgb.best_estimator_
best_xgb_hyperparams = grid_search_xgb.best_params_

# Print the optimal hyperparameters
print('XGBoost Optimal Hyperparameters: \n', best_xgb_hyperparams)

# Evaluate the optimized model on the train data
y_pred_train_xgb = best_xgb.predict(X_train)
print('Classification Report on Train Data:')
print(classification_report(y_train, y_pred_train_xgb))

# Evaluate the optimized model on the test data
y_pred_test_xgb = best_xgb.predict(X_test)
print('Classification Report on Test Data:')
print(classification_report(y_test, y_pred_test_xgb))



Traceback (most recent call last):
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 480, in predict
    Xt = transfo

XGBoost Optimal Hyperparameters: 
 {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
Classification Report on Train Data:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       530
           1       0.82      0.81      0.82       532

    accuracy                           0.82      1062
   macro avg       0.82      0.82      0.82      1062
weighted avg       0.82      0.82      0.82      1062

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.75      0.80      0.78       133
           1       0.79      0.74      0.76       133

    accuracy                           0.77       266
   macro avg       0.77      0.77      0.77       266
weighted avg       0.77      0.77      0.77       266



Traceback (most recent call last):
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajaysrikar00/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 480, in predict
    Xt = transfo