In [25]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


rfe_model = RandomForestClassifier(n_estimators=100, random_state=42)


num_features_to_select = min(7, X_train_scaled.shape[1])


rfe = RFE(estimator=rfe_model, 
          n_features_to_select=num_features_to_select, 
          step=1, 
          verbose=1)

rfe.fit(X_train_scaled, y_train_res)


selected_features_mask = rfe.support_
selected_features = X.columns[selected_features_mask]
print("\nTop features selected by RFE:")
print(selected_features.tolist())


X_train_selected = rfe.transform(X_train_scaled)
X_test_selected = rfe.transform(X_test_scaled)


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=3,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)

def get_user_input_and_predict(selected_features):
    print("\nPlease enter the following health information:")
    user_data = {}

    for feature in selected_features:
        while True:
            try:
                value = float(input(f"Enter value for {feature}: "))
                user_data[feature] = value
                break
            except ValueError:
                print("Please enter a valid number.")
    

    user_df = pd.DataFrame([user_data])[selected_features]
    
 
    user_array = user_df.values
    
   
    full_features = np.zeros((1, X_train_scaled.shape[1]))
    for i, feature in enumerate(X.columns):
        if feature in selected_features:
            col_idx = np.where(selected_features == feature)[0][0]
            full_features[0, i] = user_array[0, col_idx]
    
  
    user_scaled = scaler.transform(full_features)
    
   
    user_selected = rfe.transform(user_scaled)
    
    prediction = final_model.predict(user_selected)
    proba = final_model.predict_proba(user_selected)[0][1]
    
   
    print("\nPrediction Results:")
    if prediction[0] == 1:
        print("🔴 High risk of heart disease")
    else:
        print("🟢 Low risk of heart disease")
    print(f"Probability of heart disease: {proba:.1%}")
    
    return prediction, proba


print("\n" + "="*50)
print("HEART DISEASE RISK PREDICTION TOOL")
print("="*50)


prediction, probability = get_user_input_and_predict(selected_features)


Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.

Top features selected by RFE:
['id', 'age', 'cp', 'chol', 'thalch', 'exang', 'oldpeak']

HEART DISEASE RISK PREDICTION TOOL

Please enter the following health information:


Enter value for id:  234
Enter value for age:  23
Enter value for cp:  0
Enter value for chol:  245
Enter value for thalch:  136
Enter value for exang:  0
Enter value for oldpeak:  2.6



Prediction Results:
🔴 High risk of heart disease
Probability of heart disease: 65.9%




In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, C=0.1, max_iter=1000)
lasso.fit(X_train_scaled, y_train_res)


lasso_importance = pd.DataFrame({
    'features': X.columns.tolist(),
    'importance': np.abs(lasso.coef_[0])
}).sort_values('importance', ascending=False)


top_features = lasso_importance.head(7)['features'].values
print("Top 7 features selected by LASSO:")
print(top_features)


col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=3,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


def get_user_input_and_predict(selected_features):
    print("\nPlease enter the following health information:")
    user_data = {}
    
   
    for feature in selected_features:
        while True:
            try:
                value = float(input(f"Enter value for {feature}: "))
                user_data[feature] = value
                break
            except ValueError:
                print("Please enter a valid number.")
    
  
    user_df = pd.DataFrame([user_data])[selected_features]
    
   
    user_array = user_df.values
    
  
    full_features = np.zeros((1, X_train_scaled.shape[1]))
    for i, feature in enumerate(X.columns):
        if feature in selected_features:
            col_idx = np.where(selected_features == feature)[0][0]
            full_features[0, i] = user_array[0, col_idx]
    
    
    user_scaled = scaler.transform(full_features)
    
   
    user_selected = rfe.transform(user_scaled)
    
  
    prediction = final_model.predict(user_selected)
    proba = final_model.predict_proba(user_selected)[0][1]
    
    
    print("\nPrediction Results:")
    if prediction[0] == 1:
        print("🔴 High risk of heart disease")
    else:
        print("🟢 Low risk of heart disease")
    print(f"Probability of heart disease: {proba:.1%}")
    
    return prediction, proba


print("\n" + "="*50)
print("HEART DISEASE RISK PREDICTION TOOL")
print("="*50)


prediction, probability = get_user_input_and_predict(selected_features)


Top 7 features selected by LASSO:
['oldpeak' 'ca' 'id' 'thal' 'sex' 'cp' 'fbs']

HEART DISEASE RISK PREDICTION TOOL

Please enter the following health information:


Enter value for id:  129999
Enter value for age:  23
Enter value for cp:  0
Enter value for chol:  245
Enter value for thalch:  136
Enter value for exang:  0
Enter value for oldpeak:  2.6



Prediction Results:
🔴 High risk of heart disease
Probability of heart disease: 61.8%




In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE


df = pd.read_csv("heart_disease_data_imputed.csv")
X = df.drop(columns=['num'])
y = df['num'].apply(lambda x: 1 if x > 0 else 0)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


selector = SelectKBest(score_func=f_classif, k=7)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_res)
X_test_selected = selector.transform(X_test_scaled)


anova_scores = pd.DataFrame({
    'features': X.columns.tolist(),
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)


top_n = min(7, len(anova_scores))
top_features = anova_scores['features'].head(top_n).values
print("Top features selected by ANOVA F-test:")
print(top_features)


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


y_pred = final_model.predict(X_test_selected)
y_proba = final_model.predict_proba(X_test_selected)[:, 1]

def get_user_input_and_predict(selected_features):
    print("\nPlease enter the following health information:")
    user_data = {}

 
    for feature in selected_features:
        while True:
            try:
                value = float(input(f"Enter value for {feature}: "))
                user_data[feature] = value
                break
            except ValueError:
                print("Please enter a valid number.")

    user_df = pd.DataFrame([user_data])

   
    full_features = np.zeros((1, X.shape[1]))  
    for i, feature in enumerate(X.columns):
        if feature in selected_features:
            col_idx = np.where(selected_features == feature)[0][0]
            full_features[0, i] = user_df.iloc[0, col_idx]  

    
    user_scaled = scaler.transform(full_features)

   
    user_selected = selector.transform(user_scaled)

    
    prediction = final_model.predict(user_selected)
    proba = final_model.predict_proba(user_selected)[0][1]

    print("\nPrediction Results:")
    if prediction[0] == 1:
        print("🔴 High risk of heart disease")
    else:
        print("🟢 Low risk of heart disease")
    print(f"Probability of heart disease: {proba:.1%}")

    return prediction, proba


print("\n" + "=" * 50)
print("HEART DISEASE RISK PREDICTION TOOL")
print("=" * 50)

prediction, probability = get_user_input_and_predict(top_features)


Top features selected by ANOVA F-test:
['thalch' 'id' 'oldpeak' 'exang' 'cp' 'age' 'sex']

HEART DISEASE RISK PREDICTION TOOL

Please enter the following health information:


Enter value for thalch:  126
Enter value for id:  456577
Enter value for oldpeak:  2.6
Enter value for exang:  0
Enter value for cp:  0
Enter value for age:  23
Enter value for sex:  0



Prediction Results:
🔴 High risk of heart disease
Probability of heart disease: 77.5%


