In [32]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [33]:
df = pd.read_csv('../data/heart_disease.csv')

In [34]:
df.head()

df.replace('?', np.nan, inplace=True)

In [35]:
# Convert columns to numeric
df['ca'] = pd.to_numeric(df['ca'], errors='coerce')
df['thal'] = pd.to_numeric(df['thal'], errors='coerce')


In [36]:
df['ca'].fillna(df['ca'].median(), inplace=True)
df['thal'].fillna(df['thal'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ca'].fillna(df['ca'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['thal'].fillna(df['thal'].median(), inplace=True)


In [37]:
numerical_cols = df.columns[df.columns != 'target']
print("Numerical columns: ", numerical_cols)

Numerical columns:  Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')


In [38]:
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [39]:
X = df_scaled.drop('target', axis=1)
y = df_scaled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 4, 6]
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
random_search.fit(X_train, y_train)

print("Best parameters found by Randomized Search: ", random_search.best_params_)
print("Best cross-validation score by Randomized Search: ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found by Randomized Search:  {'n_estimators': 100, 'min_samples_split': 4, 'max_depth': None}
Best cross-validation score by Randomized Search:  0.6156462585034014


In [41]:
best_model = random_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred_best))

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.97      0.82        29
           1       0.12      0.08      0.10        12
           2       0.12      0.11      0.12         9
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         4

    accuracy                           0.49        61
   macro avg       0.19      0.23      0.21        61
weighted avg       0.38      0.49      0.43        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
joblib.dump(best_model, '../models/best_model.pkl')
joblib.dump(scaler, '../models/scaler.pkl')

['../models/scaler.pkl']

          age       sex        cp  trestbps      chol       fbs   restecg  \
179 -0.159462  0.686202 -0.165268 -0.096170 -0.013408  2.394438  1.016684   
228 -0.048643  0.686202  0.877985 -1.234430 -0.787231 -0.417635  1.016684   
111  0.172994  0.686202  0.877985 -0.380735  0.044629  2.394438  1.016684   
246  0.394632  0.686202  0.877985 -1.803559 -0.245555 -0.417635 -0.996749   
60  -0.381100 -1.457296  0.877985 -0.096170  1.127981 -0.417635 -0.996749   
..        ...       ...       ...       ...       ...       ...       ...   
249  0.837908  0.686202 -1.208521 -0.209996 -0.748540  2.394438  1.016684   
104 -0.602738  0.686202 -0.165268 -0.665300 -1.135451 -0.417635 -0.996749   
300  0.283813  0.686202  0.877985 -0.096170 -2.238149 -0.417635 -0.996749   
193  0.837908 -1.457296  0.877985  0.359134  0.915180  2.394438 -0.996749   
184  0.616270 -1.457296  0.877985  1.497394  1.127981 -0.417635  1.016684   

      thalach     exang   oldpeak     slope        ca      thal  
179  1.02