In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

from statsmodels.stats.outliers_influence import variance_inflation_factor
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

import custom_map

In [2]:
# NOTE(review): this import lives outside the notebook's main import cell;
# consider moving it there so all dependencies are visible in one place.
import importlib

# Re-execute the already-imported local `custom_map` module so the kernel
# uses its current source without needing a restart. As the last expression
# of the cell, the reloaded module object is displayed as the cell output.
importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/src/custom_map.py'>

# Data cleaning

In [3]:
# Name of the label column; later cells refer to it through this variable.
target = "Heart Disease"

# Read the training CSV (path is relative to the notebook's working directory)
# and print a per-column summary of non-null counts and dtypes.
data = pd.read_csv(filepath_or_buffer="../dataset/train.csv")
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  int64  
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          630000 non-null  int64  
 4   BP                       630000 non-null  int64  
 5   Cholesterol              630000 non-null  int64  
 6   FBS over 120             630000 non-null  int64  
 7   EKG results              630000 non-null  int64  
 8   Max HR                   630000 non-null  int64  
 9   Exercise angina          630000 non-null  int64  
 10  ST depression            630000 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Heart Disease  

In [4]:
# Drop the row-identifier column. `errors="ignore"` keeps this cell re-runnable:
# on a second execution 'id' is already gone and a plain drop would raise KeyError.
data = data.drop(columns="id", errors="ignore")

# Encode the label as 1 for 'Presence' and 0 for every other value.
# Guarded so the encoding happens only while the column still holds the raw
# (non-numeric) labels: re-running the comparison on already-encoded integers
# would match nothing and silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(data[target]):
    data[target] = np.where(data[target] == 'Presence', 1, 0)

In [5]:
# Split the columns into binary / categorical / numerical groups by how many
# distinct values each one holds. The distinct counts are computed once and
# reused, since every grouping below is derived from them.
unique_counts = data.nunique()

# Exactly two distinct values -> binary. This is purely count-based, so the
# label column is included here whenever it has two classes.
is_binary = unique_counts == 2
binary_cols = data.columns[is_binary]
print("Binary features: ", binary_cols)

# Three to ten distinct values -> treated as categorical.
is_categorical = (unique_counts <= 10) & (unique_counts > 2)
categorical_cols = data.columns[is_categorical]
print("Categorical features: ", categorical_cols)

print("Number of unique values per column:\n", unique_counts)

# Everything that is neither binary nor categorical. Selecting with a mask
# keeps the original column order; a set difference would yield an order that
# varies between kernel restarts and reshuffle the columns of derived frames.
numerical_cols = data.columns[~(is_binary | is_categorical)].tolist()

# Per-group views used downstream. The numerical and categorical frames get
# the label column appended explicitly; the binary frame carries whatever
# columns landed in `binary_cols`.
numerical_features = pd.concat([data[numerical_cols], data[target]], axis=1)
binary_features = data[binary_cols]
categorical_features = pd.concat([data[categorical_cols], data[target]], axis=1)
numerical_binary_features = pd.concat([data[numerical_cols], data[binary_cols]], axis=1)

Binary features:  Index(['Sex', 'FBS over 120', 'Exercise angina', 'Heart Disease'], dtype='str')
Categorical features:  Index(['Chest pain type', 'EKG results', 'Slope of ST',
       'Number of vessels fluro', 'Thallium'],
      dtype='str')
Number of unique values per column:
 Age                         42
Sex                          2
Chest pain type              4
BP                          66
Cholesterol                150
FBS over 120                 2
EKG results                  3
Max HR                      93
Exercise angina              2
ST depression               66
Slope of ST                  3
Number of vessels fluro      4
Thallium                     3
Heart Disease                2
dtype: int64
