# 1. Set Up

### 1.1 Import Libraries

In [63]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

### 1.2 Import Dataset

In [35]:
df = pd.read_csv('../data/heart.csv')

### 1.3 Summary of Dataset

In [36]:
def create_summary_df(dataframe):
    num_samples = 5
    summary_data = []

    for column in dataframe.columns:
        null_count = dataframe[column].isnull().sum()
        unique_count = dataframe[column].nunique()
        data_type = dataframe[column].dtype
        min_value = dataframe[column].min() if pd.api.types.is_numeric_dtype(data_type) else None
        q25 = dataframe[column].quantile(0.25) if pd.api.types.is_numeric_dtype(data_type) else None
        q50 = dataframe[column].quantile(0.50) if pd.api.types.is_numeric_dtype(data_type) else None
        q75 = dataframe[column].quantile(0.75) if pd.api.types.is_numeric_dtype(data_type) else None
        max_value = dataframe[column].max() if pd.api.types.is_numeric_dtype(data_type) else None
        mean = dataframe[column].mean() if pd.api.types.is_numeric_dtype(data_type) else None
        std_dev = dataframe[column].std() if pd.api.types.is_numeric_dtype(data_type) else None
        top_value = dataframe[column].mode()[0] if pd.api.types.is_object_dtype(data_type) else None
        top_value_freq = dataframe[column].value_counts().max() if pd.api.types.is_object_dtype(data_type) else None
        sample_values = df[column].sample(num_samples).tolist()
        
        summary_data.append([column, null_count, unique_count, data_type, min_value, q25, q50, q75, max_value, mean, std_dev, top_value, top_value_freq, sample_values])

    summary_df = pd.DataFrame(summary_data, columns=['Column', 'Missing Values', 'Unique Count', 'Data Type', 'Min Value','%25','Median','%75',
                                                    'Max Value', 'Mean', 'Std Dev', 'Top Value', 'Top Value Frequency', 'Sample Values'])
    
    return summary_df

def shape_of_df(dataframe):
    print(f'Shape of the dataset: {dataframe.shape}')

def sum_of_duplicate(dataframe):
    print(f'Sum of duplicated rows: {dataframe.duplicated().sum()}')

In [37]:
summary_df = create_summary_df(df)

shape_of_df(df)
sum_of_duplicate(df)
summary_df

Shape of the dataset: (918, 12)
Sum of duplicated rows: 0


Unnamed: 0,Column,Missing Values,Unique Count,Data Type,Min Value,%25,Median,%75,Max Value,Mean,Std Dev,Top Value,Top Value Frequency,Sample Values
0,Age,0,50,int64,28.0,47.0,54.0,60.0,77.0,53.510893,9.432617,,,"[53, 43, 55, 74, 60]"
1,Sex,0,2,object,,,,,,,,M,725.0,"[M, F, M, F, M]"
2,ChestPainType,0,4,object,,,,,,,,ASY,496.0,"[ASY, ASY, ASY, NAP, ASY]"
3,RestingBP,0,67,int64,0.0,120.0,130.0,140.0,200.0,132.396514,18.514154,,,"[120, 112, 160, 120, 128]"
4,Cholesterol,0,222,int64,0.0,173.25,223.0,267.0,603.0,198.799564,109.384145,,,"[199, 259, 283, 250, 277]"
5,FastingBS,0,2,int64,0.0,0.0,0.0,0.0,1.0,0.233115,0.423046,,,"[1, 1, 1, 0, 0]"
6,RestingECG,0,3,object,,,,,,,,Normal,552.0,"[Normal, Normal, ST, Normal, Normal]"
7,MaxHR,0,119,int64,60.0,120.0,138.0,156.0,202.0,136.809368,25.460334,,,"[146, 135, 169, 63, 142]"
8,ExerciseAngina,0,2,object,,,,,,,,N,547.0,"[N, N, N, N, Y]"
9,Oldpeak,0,53,float64,-2.6,0.0,0.6,1.5,6.2,0.887364,1.06657,,,"[1.0, 0.0, 0.0, 0.0, 2.9]"


- There are no duplicated rows.
- Seems there are no missing values but still we need the check categorical features for values like ' ', '?' etc.
- There are no unnecessary columns.
- <b>A resting blood pressure of 0 is not possible in a living human being and it is not possible for a human to have a serum cholesterol level of 0 mg/dL. We have to deal with them.</b>

-----------------------------------

# 2. Data Preprocessing

### 2.1 Categorical and Numerical Features

If we look at the summary, the most unique feature is the chest pain type and it is 4. So we can set threshold to 5.

We can simply split features into numerical and categorical by using for loop and lenght of every features unique values.

In [38]:
def split_features(dataframe):

    numerical = []
    categorical = []

    for col in dataframe.drop('HeartDisease', axis=1).columns:
        if len(dataframe[col].unique()) > 5:
            numerical.append(col)
        else:
            categorical.append(col)
    return numerical, categorical


def print_categorical_features(dataframe, categorical_features):

    for col in dataframe[categorical_features].columns:
        print(f'{col} column unique values: {dataframe[col].unique()}')

In [39]:
numerical_features, categorical_features = split_features(df)

print_categorical_features(df, categorical_features)

Sex column unique values: ['M' 'F']
ChestPainType column unique values: ['ATA' 'NAP' 'ASY' 'TA']
FastingBS column unique values: [0 1]
RestingECG column unique values: ['Normal' 'ST' 'LVH']
ExerciseAngina column unique values: ['N' 'Y']
ST_Slope column unique values: ['Up' 'Flat' 'Down']


### 2.2 Dealing with Incorrect Values

In [40]:
def incorrect_values_count_and_percentage(dataframe, column, incorrect_value):
    print(f'{column} has {dataframe.column.value_counts()[incorrect_value]} zero values and it is %{round(dataframe.column.value_counts()[incorrect_value]/len(dataframe.column) * 100,2)} of total')

In [41]:
print(f'RestingBP has {df.RestingBP.value_counts()[0]} zero values and it is %{round(df.RestingBP.value_counts()[0]/len(df.RestingBP) * 100,2)} of total')
print(f'Cholesterol has {df.Cholesterol.value_counts()[0]} zero values and it is %{round(df.Cholesterol.value_counts()[0]/len(df.Cholesterol) * 100,2)} of total')

RestingBP has 1 zero values and it is %0.11 of total
Cholesterol has 172 zero values and it is %18.74 of total


In [42]:
# #Replace restingBP 0 value with mean
df['RestingBP'] = df['RestingBP'].replace(0,df['RestingBP'].mean())

<b>If we will look many scientific researches we can clearly say that cholesterol is positive correleted with heart disease.</b>

Now we will try several methods to get that positive correlation between cholesterol and heart disease but we will be careful not to influence too much the correlation between heart disease and other features.

- Drop 0 values from dataset.
- Imputation with mean or median after replace 0 values with np.nan.
- Imputation with KNN imputer.

In [43]:
# Calculating correlation before applying any method
before_corr = round(df['Cholesterol'].corr(df['HeartDisease']),3)

##### 2.2.1 Dropping 0 values

In [44]:
drop_corr = round(df['Cholesterol'].replace(0, np.nan).dropna().corr(df['HeartDisease']),3)

##### 2.2.2 Fill with mean

In [45]:
mean_corr = round(df['Cholesterol'].replace(0, np.nan).fillna(df['Cholesterol'].mean()).corr(df['HeartDisease']),3)

##### 2.2.3 Fill with median

In [46]:
median_corr = round(df['Cholesterol'].replace(0, np.nan).fillna(df['Cholesterol'].median()).corr(df['HeartDisease']),3)

##### 2.2.4 KNN imputation

In [48]:
temp_df = df.copy()
temp_df['Cholesterol'].replace(0, np.nan, inplace=True)

knn_imputer = KNNImputer(missing_values=np.nan, n_neighbors=5)
temp_df[['Cholesterol']] = knn_imputer.fit_transform(temp_df[['Cholesterol']])
knn_corr = round(temp_df['Cholesterol'].corr(temp_df['HeartDisease']),3)

##### 2.2.5 Comparing Results

In [49]:
# Create correlation df to show each correlation after applying methods
corr_df = pd.DataFrame(data=[[before_corr, drop_corr,mean_corr,median_corr,knn_corr]], columns=['before_corr','corr_after_drop','corr_after_mean','corr_after_median','corr_after_knn'])
corr_df

Unnamed: 0,before_corr,corr_after_drop,corr_after_mean,corr_after_median,corr_after_knn
0,-0.233,0.104,-0.012,0.043,0.094


<span>As we can see dropping 0 values makes the correlation between cholesterol and heart disease much better of all of methods. 0.104 is not good enough but it is what we have for now.</span><b><span style=';color:red'>But does dropping zero values(means dropping 172 rows) affect other correlation?</span></b>

In [53]:
# Copy df to two different new df
df_before_drop = df.copy()
df_after_drop = df.copy()

# Dropping 0 values in Cholesterol feature in after df
df_after_drop['Cholesterol'] = df_after_drop['Cholesterol'].replace(0, np.nan)
df_after_drop.dropna(inplace=True)

# Create function to apply one hot encoding for categorical features
def encoder(df):
    transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),categorical_features))
    transformed = transformer.fit_transform(df)
    transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
    transformed_df.index = df.index

    df = pd.concat([df, transformed_df], axis=1)
    df.drop(categorical_features, axis=1, inplace=True)
    
    # Renaming columns
    new_columns = []
    for i,col in enumerate(df.columns):
        if 'onehotencoder__' in col:
            col = col.replace('onehotencoder__','')
            new_columns.append(col)
        else:
            new_columns.append(col)

    df.columns = new_columns
    return df
    
df_before_drop = encoder(df_before_drop)
df_after_drop = encoder(df_after_drop)

# Find correlation between HeartDisease and every other features in both df
corr_before = round(df_before_drop.corr().sort_values('HeartDisease',ascending=False).iloc[1:, :]['HeartDisease'],3)
corr_after = round(df_after_drop.corr().sort_values('HeartDisease',ascending=False).iloc[1:, :]['HeartDisease'],3)

In [54]:
# Combining before_corr, after_corr and difference between them into dataframe
difference_df = pd.concat([corr_before, corr_after,(corr_after - corr_before)], axis=1)
difference_df.columns = ['corr_before_drop_0','corr_after_drop_0','difference']
difference_df

Unnamed: 0,corr_before_drop_0,corr_after_drop_0,difference
ST_Slope_Flat,0.554,0.592,0.038
ChestPainType_ASY,0.517,0.523,0.006
ExerciseAngina_Y,0.494,0.552,0.058
Oldpeak,0.404,0.496,0.092
Sex_M,0.305,0.293,-0.012
Age,0.282,0.299,0.017
FastingBS_1,0.267,0.161,-0.106
ST_Slope_Down,0.123,0.132,0.009
RestingBP,0.118,0.173,0.055
RestingECG_ST,0.103,0.096,-0.007


##### 2.2.6 Final process

In [55]:
# We decided to drop all zero values in cholesterol so we will do it for final process
df['Cholesterol'].replace(0, np.nan, inplace=True)
df.dropna(inplace=True)

### 2.3 Encoding Categorical Features

In [57]:
# Create make_column_transformer object
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),categorical_features))

In [58]:
# Create a new dataframe with transformed features and join the new dataframe with original dataframe
# Also drop encoded categorical columns
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
transformed_df.index = df.index

df = pd.concat([df, transformed_df], axis=1)
df.drop(categorical_features, axis=1, inplace=True)

In [59]:
# Renaming columns
new_columns = []
for i,col in enumerate(df.columns):
    if 'onehotencoder__' in col:
        col = col.replace('onehotencoder__','')
        new_columns.append(col)
    else:
        new_columns.append(col)
        
df.columns = new_columns
df.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'HeartDisease',
       'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_NAP', 'ChestPainType_TA', 'FastingBS_0', 'FastingBS_1',
       'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down',
       'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

### 2.4 Train-Test Sets Split

In [62]:
X = df.drop('HeartDisease',axis=1)
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2.5 Feature Scaling

In [66]:
def Standard_Scaler(df, numerical_features):
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    
    return df

X_train = Standard_Scaler(X_train, numerical_features)
X_test = Standard_Scaler(X_test, numerical_features)

### 2.6 Export Train-Test Sets

In [74]:
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)