## Use Heart Disease [Dataset](https://github.com/cksajil/DSAIRP25/blob/main/datasets/heart_disease.csv) and answer the following questions

## 1. Find the top 5 important features to the target column

In [6]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

df = pd.read_csv('heart_disease.csv')

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

cat_cols = [col for col in X.columns if X[col].dtype == 'object' or X[col].nunique() < 10 and X[col].dtype == 'int64']

exp_cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

cat_cols_to_encode = [col for col in exp_cat_cols if col in X.columns]

X_encoded = pd.get_dummies(X, columns=cat_cols_to_encode, drop_first=True)

mi_scores = mutual_info_classif(X_encoded, y, random_state=42)


mi_series = pd.Series(mi_scores, index=X_encoded.columns)

top_5 = mi_series.nlargest(5)

print("Top 5 most important features (using Mutual Information):")
print(top_5)

Top 5 most important features (using Mutual Information):
chol       0.262836
oldpeak    0.171186
thalach    0.151613
thal_2     0.134578
thal_3     0.108855
dtype: float64


## 2. Perform Box-Cox Transformations to relevant features

In [7]:
from scipy import stats

numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

numerical_cols_to_transform = [col for col in numerical_cols if col not in categorical_cols_to_encode]

transformed_data = pd.DataFrame()
for col in numerical_cols_to_transform:
    if (X[col] > 0).all():
        transformed_data[col + '_boxcox'], lambda_value = stats.boxcox(X[col])
        print(f"Applied Box-Cox to '{col}' with lambda = {lambda_value:.4f}")
    else:
        print(f"Skipping Box-Cox for '{col}' as it contains non-positive values.")
        transformed_data[col] = X[col]

X_transformed = pd.concat([X_encoded, transformed_data], axis=1)

print("\nDataFrame after Box-Cox transformation:")
print(X_transformed.head())

Applied Box-Cox to 'age' with lambda = 1.5268
Applied Box-Cox to 'trestbps' with lambda = -0.7397
Applied Box-Cox to 'chol' with lambda = -0.1009
Applied Box-Cox to 'thalach' with lambda = 2.1716
Skipping Box-Cox for 'oldpeak' as it contains non-positive values.

DataFrame after Box-Cox transformation:
   age  trestbps  chol  thalach  oldpeak  sex_1   cp_1   cp_2   cp_3  fbs_1  \
0   52       125   212      168      1.0   True  False  False  False  False   
1   53       140   203      155      3.1   True  False  False  False   True   
2   70       145   174      125      2.6   True  False  False  False  False   
3   61       148   203      161      0.0   True  False  False  False  False   
4   62       138   294      106      1.9  False  False  False  False   True   

   ...   ca_3   ca_4  thal_1  thal_2  thal_3  age_boxcox  trestbps_boxcox  \
0  ...  False  False   False   False    True  272.372422         1.313869   
1  ...  False  False   False   False    True  280.429390         1.

## 3. Perform Feature Binning to Age Column and add it as a new column to the dataset

In [8]:
bins = [29, 40, 50, 60, 70, 78]
labels = ['29-39', '40-49', '50-59', '60-69', '70-78']
X['age_binned'] = pd.cut(X['age'], bins=bins, labels=labels, right=True)

print("\nDataFrame with 'age_binned' column:")
print(X.head())



DataFrame with 'age_binned' column:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal age_binned  
0   2     3      50-59  
1   0     3      50-59  
2   0     3      60-69  
3   1     3      60-69  
4   3     2      60-69  


## 4. Find the most orthogonal feature to the 'chol' feature

In [9]:
def find_most_orthogonal_feature(df, target_feature):
    if target_feature not in df.columns:
        print(f"Error: Target feature '{target_feature}' not found in the DataFrame.")
        return None, None
    correlation_matrix = df.corr()

    if target_feature not in correlation_matrix.columns:
         print(f"Error: Target feature '{target_feature}' not found in the correlation matrix.")
         return None, None

    target_correlations = correlation_matrix[target_feature].abs()

    target_correlations = target_correlations.drop(target_feature, errors='ignore')

    if target_correlations.empty:
        print("No other features to compare with.")
        return None, None

    most_orthogonal_feature = target_correlations.idxmin()
    min_correlation_value = target_correlations.min()

    return most_orthogonal_feature, min_correlation_value

most_orthogonal_feature, correlation_value = find_most_orthogonal_feature(X_transformed, 'chol')

if most_orthogonal_feature:
    print(f"\nThe most orthogonal feature to 'chol' is '{most_orthogonal_feature}'")
    print(f"Its absolute Pearson correlation coefficient with 'chol' is: {correlation_value:.4f}")


The most orthogonal feature to 'chol' is 'cp_1'
Its absolute Pearson correlation coefficient with 'chol' is: 0.0111
