In [27]:
from google.colab import files

# Upload the file
uploaded = files.upload()

Saving heart_attack_risk_dataset.csv to heart_attack_risk_dataset (1).csv


In [28]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest

In [39]:
# Step 1: Load your dataset
data = pd.read_csv('heart_attack_risk_dataset.csv')

In [40]:
# Step 2: Separate features (X) and target (y)
TARGET_VARIABLE = 'Heart_Attack_Risk'
X = data.drop(columns=[TARGET_VARIABLE])
y = data[TARGET_VARIABLE]

In [41]:
# Step 3: Identify and encode categorical columns

# Identify categorical
categorical_cols = X.select_dtypes(include=['object']).columns

# Step 3: Encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Encode the target variable as well
y = LabelEncoder().fit_transform(y)

In [42]:
def get_top_k_features(selector, feature_names, k, algorithm):
  """
  Get the top k features based on their scores from a SelectKBest selector.

  Parameters:
  selector (SelectKBest): Fitted SelectKBest object.
  feature_names (list): List of feature names (columns of X).
  k (int): Number of top features to select.
  algorithm (str): The name of the feature selection algorithm.

  Returns:
  list: Names of the top k features.
  """
  # Retrieve feature scores
  scores = selector.scores_

  # Create a DataFrame for ranked features
  feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores
  }).sort_values(by='Score', ascending=False)

  # Display top-ranked features
  print(f"Feature Rankings using {algorithm}:")
  print(feature_ranking)

  # Return selected top k features
  return feature_ranking.head(k)['Feature'].tolist()

In [43]:
def train_and_fit_model(X_train, X_test, y_train, y_test):
  """
  Builds, trains, and evaluates a Random Forest classification model.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Feature matrix for training the model.
  X_test : pd.DataFrame or np.ndarray
      Feature matrix for testing the model.
  y_train : pd.Series or np.ndarray
      Target labels for training the model.
  y_test : pd.Series or np.ndarray
      True target labels for testing the model.

  Returns:
  -------
  None
      Prints the model's accuracy and a detailed classification report.
  """
  # Build a simple classification model
  model = RandomForestClassifier(random_state=42)
  model.fit(X_train, y_train)

  # Make predictions
  y_pred = model.predict(X_test)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  print("\nModel Accuracy with Top 10 Features:", accuracy)

  # Detailed performance metrics
  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

In [34]:
from sklearn.feature_selection import chi2
# Step 4: Apply Chi-Square

# SelectKBest with chi2 evaluates all features
selector = SelectKBest(score_func=chi2, k='all')
selector.fit(X, y)

# Rank the features using Chi-Square algorithm
top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                  k=10, algorithm="Chi-Square")

# Reduce the dataset to the top 10 features
X_top = X[top_features]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train and fit random forest classification model based on feature selected
train_and_fit_model(X_train, X_test, y_train, y_test)

Feature Rankings using Chi-Square:
                    Feature      Score
10               Heart_Rate  13.173117
9                Resting_BP  10.379242
8         Cholesterol_Level   6.796015
18  Max_Heart_Rate_Achieved   5.940807
7              Hypertension   5.329900
17  Exercise_Induced_Angina   3.586085
6                  Diabetes   2.054292
11           Family_History   1.808392
15      Fasting_Blood_Sugar   1.702425
0                       Age   0.943490
16              ECG_Results   0.778266
3       Alcohol_Consumption   0.750080
2                   Smoking   0.704773
4   Physical_Activity_Level   0.463231
13          Chest_Pain_Type   0.302287
14              Thalassemia   0.285877
1                    Gender   0.281376
12             Stress_Level   0.157433
5                       BMI   0.050753

Model Accuracy with Top 10 Features: 0.4769

Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.02      0.03      1987
       

In [35]:
from sklearn.feature_selection import mutual_info_classif
# Step 5: Apply Mutual Information Classification (MIC)

# Wrapping mutual_info_classif with a fixed random_state
mutual_info_classif_with_random_state = lambda X, y: mutual_info_classif(X, y, random_state=42)

# SelectKBest with mutual_info_classif evaluates all features
selector = SelectKBest(score_func=mutual_info_classif_with_random_state, k='all')
selector.fit(X, y)

# Rank the features using Mutual Information Classification algorithm
top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                  k=10, algorithm="Mutual Information Classification")

# Reduce the dataset to the top 10 features
X_top = X[top_features]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train and fit random forest classification model based on feature selected
train_and_fit_model(X_train, X_test, y_train, y_test)

Feature Rankings using Mutual Information Classification:
                    Feature     Score
4   Physical_Activity_Level  0.005621
12             Stress_Level  0.005556
16              ECG_Results  0.005047
3       Alcohol_Consumption  0.004469
9                Resting_BP  0.003267
1                    Gender  0.003077
18  Max_Heart_Rate_Achieved  0.003034
6                  Diabetes  0.002938
10               Heart_Rate  0.001328
11           Family_History  0.000977
14              Thalassemia  0.000749
7              Hypertension  0.000748
0                       Age  0.000475
8         Cholesterol_Level  0.000000
5                       BMI  0.000000
13          Chest_Pain_Type  0.000000
15      Fasting_Blood_Sugar  0.000000
2                   Smoking  0.000000
17  Exercise_Induced_Angina  0.000000

Model Accuracy with Top 10 Features: 0.4598

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.05      0.08      1987
    

In [36]:
from sklearn.feature_selection import mutual_info_regression
# Step 6: Apply Mutual Information Regression (MIR)

# Wrapping mutual_info_regression with a fixed random_state
mutual_info_regression_with_random_state = lambda X, y: mutual_info_regression(X, y, random_state=42)

# SelectKBest with mutual_info_regression evaluates all features
selector = SelectKBest(score_func=mutual_info_regression_with_random_state, k='all')
selector.fit(X, y)

# Rank the features using Mutual Information Regression algorithm
top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                  k=10, algorithm="Mutual Information Regression")

# Reduce the dataset to the top 10 features
X_top = X[top_features]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train and fit random forest classification model based on feature selected
train_and_fit_model(X_train, X_test, y_train, y_test)

Feature Rankings using Mutual Information Regression:
                    Feature     Score
13          Chest_Pain_Type  0.007408
15      Fasting_Blood_Sugar  0.005844
18  Max_Heart_Rate_Achieved  0.003900
7              Hypertension  0.003736
0                       Age  0.002434
1                    Gender  0.002167
16              ECG_Results  0.002055
17  Exercise_Induced_Angina  0.001980
12             Stress_Level  0.001417
8         Cholesterol_Level  0.001408
14              Thalassemia  0.001362
6                  Diabetes  0.000678
5                       BMI  0.000658
4   Physical_Activity_Level  0.000437
10               Heart_Rate  0.000000
11           Family_History  0.000000
3       Alcohol_Consumption  0.000000
2                   Smoking  0.000000
9                Resting_BP  0.000000

Model Accuracy with Top 10 Features: 0.4605

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.06      0.09      1987
        

In [37]:
from sklearn.feature_selection import f_classif
# Step 7: Apply ANOVA F-value Classificaiton

# SelectKBest with f_classif evaluates all features
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)

# Rank the features using ANOVA F-value Classificaiton algorithm
top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                  k=10, algorithm="ANOVA F-value Classificaiton")

# Reduce the dataset to the top 10 features
X_top = X[top_features]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train and fit random forest classification model based on feature selected
train_and_fit_model(X_train, X_test, y_train, y_test)

Feature Rankings using ANOVA F-value Classificaiton:
                    Feature     Score
7              Hypertension  3.793223
17  Exercise_Induced_Angina  2.245974
10               Heart_Rate  1.527885
11           Family_History  1.292325
6                  Diabetes  1.282062
9                Resting_BP  1.039392
15      Fasting_Blood_Sugar  1.000653
3       Alcohol_Consumption  0.627114
16              ECG_Results  0.586862
18  Max_Heart_Rate_Achieved  0.533806
2                   Smoking  0.502255
4   Physical_Activity_Level  0.498139
8         Cholesterol_Level  0.407514
1                    Gender  0.280398
14              Thalassemia  0.215926
13          Chest_Pain_Type  0.182233
12             Stress_Level  0.168870
0                       Age  0.058229
5                       BMI  0.013377

Model Accuracy with Top 10 Features: 0.4485

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.07      0.11      1987
         

In [44]:
from sklearn.feature_selection import f_regression
# Step 8: Apply ANOVA F-value Regression

# SelectKBest with f_regression evaluates all features
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)

# Rank the features using ANOVA F-value Classificaiton algorithm
top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                  k=10, algorithm="ANOVA F-value Regression")

# Reduce the dataset to the top 10 features
X_top = X[top_features]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train and fit random forest classification model based on feature selected
train_and_fit_model(X_train, X_test, y_train, y_test)

Feature Rankings using ANOVA F-value Regression:
                    Feature     Score
7              Hypertension  7.203770
17  Exercise_Induced_Angina  3.669618
11           Family_History  2.578735
15      Fasting_Blood_Sugar  1.792340
10               Heart_Rate  1.642481
18  Max_Heart_Rate_Achieved  1.026298
6                  Diabetes  0.595148
13          Chest_Pain_Type  0.337989
2                   Smoking  0.327031
14              Thalassemia  0.246699
16              ECG_Results  0.165795
8         Cholesterol_Level  0.157636
0                       Age  0.113814
4   Physical_Activity_Level  0.111908
1                    Gender  0.099478
12             Stress_Level  0.033642
3       Alcohol_Consumption  0.021931
5                       BMI  0.012390
9                Resting_BP  0.001871

Model Accuracy with Top 10 Features: 0.419

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.11      0.14      1987
           1  