In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


rfe_model = RandomForestClassifier(n_estimators=100, random_state=42)


num_features_to_select = min(7, X_train_scaled.shape[1])


rfe = RFE(estimator=rfe_model, 
          n_features_to_select=num_features_to_select, 
          step=1, 
          verbose=1)


rfe.fit(X_train_scaled, y_train_res)


selected_features_mask = rfe.support_
selected_features = X.columns[selected_features_mask]
print("\nTop features selected by RFE:")
print(selected_features.tolist())


X_train_selected = rfe.transform(X_train_scaled)
X_test_selected = rfe.transform(X_test_scaled)


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=3,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


y_pred = final_model.predict(X_test_selected)
y_proba = final_model.predict_proba(X_test_selected)[:, 1]

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.

Top features selected by RFE:
['id', 'age', 'cp', 'chol', 'thalch', 'exang', 'oldpeak']

Final Model Performance:
Accuracy: 0.8478
ROC AUC: 0.9432

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        82
           1       0.86      0.87      0.86       102

    accuracy                           0.85       184
   macro avg       0.85      0.84      0.85       184
weighted avg       0.85      0.85      0.85       184



In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, C=0.1, max_iter=1000)
lasso.fit(X_train_scaled, y_train_res)


lasso_importance = pd.DataFrame({
    'features': X.columns.tolist(),
    'importance': np.abs(lasso.coef_[0])
}).sort_values('importance', ascending=False)


top_features = lasso_importance.head(7)['features'].values
print("Top 7 features selected by LASSO:")
print(top_features)



col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=3,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


y_pred = final_model.predict(X_test_selected)
y_proba = final_model.predict_proba(X_test_selected)[:, 1]

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

feature_importance = pd.DataFrame({
    'features': top_features,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)



Top 7 features selected by LASSO:
['oldpeak' 'ca' 'id' 'thal' 'sex' 'cp' 'fbs']

Final Model Performance:
Accuracy: 0.8750
ROC AUC: 0.9313

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86        82
           1       0.88      0.90      0.89       102

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.88      0.87       184



In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


selector = SelectKBest(score_func=f_classif, k=7)
selector.fit(X_train_scaled, y_train_res)


anova_scores = pd.DataFrame({
    'features': X.columns.tolist(),
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)


top_n = min(7, len(anova_scores))
top_features = anova_scores['features'].head(top_n).values
print("Top 10 features selected by ANOVA F-test:")
print(top_features)


col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


y_pred = final_model.predict(X_test_selected)
y_proba = final_model.predict_proba(X_test_selected)[:, 1]

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Top 10 features selected by ANOVA F-test:
['thalch' 'id' 'oldpeak' 'exang' 'cp' 'age' 'sex']

Final Model Performance:
Accuracy: 0.8641
ROC AUC: 0.9470

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.80      0.84        82
           1       0.85      0.91      0.88       102

    accuracy                           0.86       184
   macro avg       0.87      0.86      0.86       184
weighted avg       0.87      0.86      0.86       184


Feature Importance from Final Model:
   feature  importance
1       id    0.362995
5      age    0.133777
0   thalch    0.129817
2  oldpeak    0.129186
4       cp    0.118257
3    exang    0.092783
6      sex    0.033185
