In [62]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from fancyimpute import KNN

sys.path.append(os.path.abspath(".."))
from common import common

In [63]:
base_path = common.base_path

In [67]:
def get_heart_df():
    config = {
        'NORMAL_TARGET': 0,
        'TARGET_COLUMN': 'num',
        'TARGET_DICT': {
            0: 'Absense',
            1: 'Slight Presence',
            2: 'Presence',
            3: 'Moderate Presence',
            4: 'High Presence'
        },
        'INV_TARGET_DICT': {
            'Absense': 0,
            'Slight Presence': 1,
            'Presence': 2,
            'Moderate Presence': 3,
            'High Presence': 4
        },
        
        # List of numerical columns (these are to be standardized)
        'NUMERICAL_COLUMNS': ['age', 'trestbps', 'chol', 'thalch', 'oldpeak'],
        # List of categorical columns (these are to be one hot encoded)
        'CATEGORICAL_COLUMNS': ['sex','cp', 'fbs', 'restecg', 'exang'],
        # List of ordinal columns (these are to be label encoded)
        'ORDINAL_COLUMNS': [],
    }
    target_column = config['TARGET_COLUMN']
    all_df = pd.read_csv(f'{base_path}/datasources/heart/heart_disease_uci.csv')
    
    # Headers of column
    main_labels = all_df.columns
    
    print('Normal class: ', all_df[target_column].mode())
    return (all_df, main_labels, config)

In [68]:
def get_processed_heart_df():
    all_df, main_labels, config = get_heart_df()
    # print('main_labels', main_labels)
    target_column = config['TARGET_COLUMN']

    # Drop columns
    all_df = all_df.drop(['id','dataset'], axis=1)
    
    # Fill values
    median_chol = all_df.loc[all_df['chol']!=0, 'chol'].median()
    all_df = all_df.fillna(value={'chol': median_chol})
    all_df.loc[all_df['chol']==0, 'chol'] = median_chol 

    mean_bp = all_df.loc[all_df['trestbps']!=0,'trestbps'].mean()
    all_df = all_df.fillna(value={'trestbps': mean_bp})
    all_df.loc[all_df['trestbps']==0, 'trestbps'] = mean_bp

    mean_hr = all_df.loc[all_df['thalch']!=0,'thalch'].mean()
    all_df = all_df.fillna(value={'thalch': mean_hr})
    all_df.loc[all_df['thalch']==0, 'thalch'] = mean_hr

    mean_peak = all_df.oldpeak.mean()
    all_df = all_df.fillna(value={'oldpeak': mean_peak})
    all_df.loc[all_df['oldpeak']==0, 'oldpeak'] = mean_peak

    all_df.drop(labels=['ca','thal','slope'], axis=1, inplace=True)
    all_df = all_df.astype({'sex':'category', 'cp':'category', 'fbs':'bool', 'restecg':'category', 'exang':'bool'})
    all_df.dropna(inplace=True)

    # One Hot Encoder
    ohe, all_df = common.one_hot_encode(all_df, config['CATEGORICAL_COLUMNS'])

    main_labels = all_df.columns
    # print('main_labels', main_labels)
    
    return (all_df, main_labels, config)

In [69]:
df,m,c = get_processed_heart_df()

Normal class:  0    0
Name: num, dtype: int64


In [70]:
df.columns

Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'num', 'sex_Female',
       'sex_Male', 'cp_asymptomatic', 'cp_atypical angina', 'cp_non-anginal',
       'cp_typical angina', 'fbs_False', 'fbs_True', 'restecg_lv hypertrophy',
       'restecg_normal', 'restecg_st-t abnormality', 'exang_False',
       'exang_True'],
      dtype='object')

In [71]:
df

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,num,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_False,fbs_True,restecg_lv hypertrophy,restecg_normal,restecg_st-t abnormality,exang_False,exang_True
0,63,145.000000,233.0,150.000000,2.300000,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,67,160.000000,286.0,108.000000,1.500000,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,67,120.000000,229.0,129.000000,2.600000,1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,37,130.000000,250.0,187.000000,3.500000,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,41,130.000000,204.0,172.000000,1.400000,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,127.000000,333.0,154.000000,0.878788,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
916,62,132.286047,139.0,137.545665,0.878788,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
917,55,122.000000,223.0,100.000000,0.878788,2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
918,58,132.286047,385.0,137.545665,0.878788,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
