In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('heart.csv')

In [5]:
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


**Attribute Information:**
1. age
2. sex (1 = male; 0 = female)
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholestoral in mg/dl
6. fasting blood sugar (> 120 mg/dL, 1 = vrai, 0 = faux)
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by flourosopy
13. thal: 0 = normal; 1 = fixed defect; 2 = reversable defect
14. target: 0 = no disease, 1 = disease.

In [44]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [17]:
dataset = dataset.drop_duplicates()

In [6]:
dataset.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [3]:
dataset['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,526
0,499


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [18]:
X = dataset.drop(columns=['target'])
y = dataset['target']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=33)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Fune tuning of Random Forest Model With GridSearchCV**

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [37]:
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4, 5]
}

In [63]:
grid_search = GridSearchCV(RandomForestClassifier(), params, cv=20, scoring='f1')

In [64]:
grid_search.fit(X_train_scaled, y_train)

In [65]:
best_params = grid_search.best_params_
print(best_params)

{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 150}


In [66]:
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42
)

In [67]:
rf.fit(X_train_scaled, y_train)

In [68]:
y_pred = rf.predict(X_test_scaled)

In [69]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.86      0.81        28
           1       0.87      0.79      0.83        33

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61



In [70]:
rf.score(X_test_scaled, y_test)

0.819672131147541

**Documentation**
- The tuning was performed using GridSearchCV with 20-fold cross-validation, optimizing for F1-score.
After running GridSearchCV, the optimal hyperparameters were:
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 150}


*   The tuned Random Forest model achieved an F1-score of 82%, showing good performance in distinguishing between diseased and non-diseased patients.
*   Precision and Recall are well-balanced, making this model reliable for medical predictions.









**Fune tuning of XGBoost Model with Optuna**

In [72]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.1 colorlog-6.9.0 optuna-4.2.1


In [73]:
import xgboost as xgb
from optuna import create_study

In [74]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train_scaled, y_train)
    return model.score(X_test_scaled, y_test)

In [75]:
study = create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2025-03-25 22:05:09,246] A new study created in memory with name: no-name-a95e4b13-0990-4b90-8722-838fbdcb753a
[I 2025-03-25 22:05:09,357] Trial 0 finished with value: 0.7868852459016393 and parameters: {'n_estimators': 217, 'max_depth': 8, 'learning_rate': 0.1383448385496694, 'subsample': 0.6639037834105985}. Best is trial 0 with value: 0.7868852459016393.
[I 2025-03-25 22:05:09,391] Trial 1 finished with value: 0.8032786885245902 and parameters: {'n_estimators': 131, 'max_depth': 6, 'learning_rate': 0.22043264735445425, 'subsample': 0.6720440292399346}. Best is trial 1 with value: 0.8032786885245902.
[I 2025-03-25 22:05:09,446] Trial 2 finished with value: 0.7868852459016393 and parameters: {'n_estimators': 408, 'max_depth': 12, 'learning_rate': 0.22190474568818616, 'subsample': 0.6868186860951315}. Best is trial 1 with value: 0.8032786885245902.
[I 2025-03-25 22:05:09,481] Trial 3 finished with value: 0.7868852459016393 and parameters: {'n_estimators': 123, 'max_depth': 13, 'lear

In [76]:
best_params = study.best_params
print(best_params)

{'n_estimators': 497, 'max_depth': 5, 'learning_rate': 0.11621157260018242, 'subsample': 0.597339113238641}


In [77]:
best_model = xgb.XGBClassifier(**best_params)

In [78]:
best_model.fit(X_train_scaled, y_train)

In [79]:
accuracy = best_model.score(X_test_scaled, y_test)
print(accuracy)

0.8360655737704918


In [80]:
y_pred = best_model.predict(X_test_scaled)

In [81]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83        28
           1       0.87      0.82      0.84        33

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61



**Documentation**
- The XGBoost model achieved an accuracy of 84%, demonstrating a strong ability to distinguish between classes. The optimized hyperparameters, particularly 497 estimators, max depth of 5, learning rate of 0.116, and subsample of 0.597, contributed to this performance.
- Compared to Random Forest, XGBoost showed a slight improvement in both recall and precision, making it a preferable choice for this classification task.