# Prepare UCI Heart Disease Dataset
 - This dataset was originally based on the data provided by the Cleveland Clinic Foundation for Heart Disease, [sourced from UCI](https://archive.ics.uci.edu/dataset/45/heart+disease)
 - The data comprises 7 categorical and 6 continuous attributes
 - The label/target field refers to the presence of heart disease in the patient
 - Here is a description of the dataset features:
 
Column| Description| Feature Type
------------|--------------------|----------------------
Age | Age in years | Numerical
Sex | (1 = male; 0 = female) | Categorical
CP | Chest pain type (0, 1, 2, 3, 4) | Categorical
Trestbps | Resting blood pressure (in mm Hg on admission) | Numerical
Chol | Serum cholesterol in mg/dl | Numerical
FBS | Fasting blood sugar > 120 mg/dl (1 = true; 0 = false) | Categorical
RestECG | Resting electrocardiogram results (0, 1, 2) | Categorical
Thalach | Maximum heart rate achieved | Numerical
Exang | Exercise induced angina (1 = yes; 0 = no) | Categorical
Oldpeak | ST depression induced by exercise relative to rest | Numerical
Slope | Slope of the peak exercise ST segment | Numerical
CA | Number of major vessels (0-3) colored by fluoroscopy | Both numerical & categorical
Thal | 3 = normal; 6 = fixed defect; 7 = reversible defect | Categorical
Target | Diagnosis of heart disease (1 = true; 0 = false) | Target
 

### Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer

### Global constants

In [2]:
# Remote CSV file holding the raw heart-disease records.
DATA_SOURCE_URL = 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv'

# Role of every column, keyed by column name. Insertion order is significant:
# the categorical columns are one-hot encoded in the order they appear here,
# which fixes the column order of the encoded dataset.
COLUMN_TYPES = {
    **dict.fromkeys(
        ("age", "trestbps", "chol", "thalach", "oldpeak", "slope"),
        "continuous",
    ),
    **dict.fromkeys(
        ("sex", "cp", "fbs", "restecg", "exang", "ca", "thal"),
        "categorical",
    ),
    "target": "label",
}

# Destination of the cleaned dataset and the field separator used to write it.
DATA_LOCAL_FILE_PATH = "../data/heart_disease_dataset.csv"
TARGET_DELIMITER = ","

### Get the data 

In [3]:
def getData():
    """Load the raw dataset from ``DATA_SOURCE_URL``.

    Returns:
        A DataFrame parsed from the CSV at ``DATA_SOURCE_URL`` using the
        default ``pd.read_csv`` settings.
    """
    return pd.read_csv(DATA_SOURCE_URL)


raw_dataset_df = getData()

In [4]:
# Preview the first rows of the freshly loaded frame.
raw_dataset_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [5]:
# Summary statistics of the raw frame, transposed so each column is a row.
raw_dataset_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.594059,9.01637,29.0,48.0,56.0,61.0,77.0
sex,303.0,0.676568,0.46856,0.0,0.0,1.0,1.0,1.0
cp,303.0,3.108911,1.028414,0.0,2.0,3.0,4.0,4.0
trestbps,303.0,131.785479,17.748338,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.547855,52.175933,126.0,211.0,241.0,275.0,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.990099,0.988293,0.0,0.0,1.0,2.0,2.0
thalach,303.0,149.194719,23.173368,71.0,132.0,152.0,165.5,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.057756,1.165025,0.0,0.0,0.8,1.6,6.2


### Preprocess dataset
 - Encode the 'thal' feature to an ordinal value
 - Convert categorical features to one-hot-encoding values

In [6]:
# Replace the 'thal' column with the numeric codes produced by an
# OrdinalEncoder. The encoder expects a 2-D input, so the column's values are
# reshaped into a single-column array before fitting.
encoder = OrdinalEncoder()
raw_dataset_df["thal"] = encoder.fit_transform(
    raw_dataset_df["thal"].values.reshape(-1, 1)
)
raw_dataset_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,2.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,4.0,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3.0,0


In [7]:
def onehotEncoding(data_df, column):
    """Swap one column of ``data_df`` for its one-hot indicator columns.

    A new OneHotEncoder is fitted on ``column`` alone and configured to emit a
    dense pandas DataFrame. The indicator columns are appended to the right of
    the existing columns and the source column is dropped.

    Args:
        data_df: DataFrame containing ``column``.
        column: Name of the column to encode.

    Returns:
        A new DataFrame; ``data_df`` itself is not modified.
    """
    one_hot = OneHotEncoder(sparse_output=False)
    one_hot.set_output(transform="pandas")
    indicator_df = one_hot.fit_transform(data_df[[column]])
    widened_df = pd.concat([data_df, indicator_df], axis=1)
    return widened_df.drop(columns=[column])

def onehotEncodeCategoricalColumns():
    """One-hot encode every column marked "categorical" in ``COLUMN_TYPES``.

    Starts from a copy of the module-level ``raw_dataset_df`` and encodes the
    categorical columns one at a time, in ``COLUMN_TYPES`` order.

    Returns:
        The encoded copy; ``raw_dataset_df`` is left unchanged.
    """
    categorical_columns = [
        name for name, kind in COLUMN_TYPES.items() if kind == "categorical"
    ]
    encoded_df = raw_dataset_df.copy()
    for name in categorical_columns:
        encoded_df = onehotEncoding(encoded_df, name)
    return encoded_df


raw_dataset_modified_df = onehotEncodeCategoricalColumns()
raw_dataset_modified_df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,target,sex_0,sex_1,cp_0,...,exang_1,ca_0,ca_1,ca_2,ca_3,thal_0.0,thal_1.0,thal_2.0,thal_3.0,thal_4.0
0,63,145,233,150,2.3,3,0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,67,160,286,108,1.5,2,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,67,120,229,129,2.6,2,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,37,130,250,187,3.5,3,0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,41,130,204,172,1.4,1,0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Analysis of data basic stats

In [8]:
# Summary statistics of the encoded frame, transposed so each column is a row.
raw_dataset_modified_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.594059,9.01637,29.0,48.0,56.0,61.0,77.0
trestbps,303.0,131.785479,17.748338,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.547855,52.175933,126.0,211.0,241.0,275.0,564.0
thalach,303.0,149.194719,23.173368,71.0,132.0,152.0,165.5,202.0
oldpeak,303.0,1.057756,1.165025,0.0,0.0,0.8,1.6,6.2
slope,303.0,1.590759,0.617767,1.0,1.0,2.0,2.0,3.0
target,303.0,0.273927,0.44671,0.0,0.0,0.0,1.0,1.0
sex_0,303.0,0.323432,0.46856,0.0,0.0,0.0,1.0,1.0
sex_1,303.0,0.676568,0.46856,0.0,0.0,1.0,1.0,1.0
cp_0,303.0,0.013201,0.114325,0.0,0.0,0.0,0.0,1.0


### Clean the data (with no column exclusions) and persist to disc
 - Remove any rows with NaNs
 - Persist the data to local disc

In [9]:
def cleanAndPersisDat():
    """Drop incomplete rows from the encoded dataset and save it as CSV.

    Takes the module-level ``raw_dataset_modified_df``, removes every row that
    contains a NaN, prints a short summary (target path, column names, shape)
    and writes the result to ``DATA_LOCAL_FILE_PATH`` without the index, using
    ``TARGET_DELIMITER`` as the field separator. The destination directory is
    created if it is missing, so the write does not fail when the data folder
    has not been created yet.

    Returns:
        The cleaned DataFrame that was written to disk.
    """
    # Imported locally so this cell does not rely on an import made elsewhere.
    from pathlib import Path

    # dropna() returns a new frame, so raw_dataset_modified_df stays untouched.
    raw_dataset_all_clean_df = raw_dataset_modified_df.dropna()
    print(f"Read {DATA_LOCAL_FILE_PATH} data..")
    print(f"The data has these columns:\n{list(raw_dataset_all_clean_df.columns)}")
    print(f"\nWith {raw_dataset_all_clean_df.shape[0]} rows and {raw_dataset_all_clean_df.shape[1]} columns")
    # to_csv raises OSError when the parent directory does not exist.
    Path(DATA_LOCAL_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)
    raw_dataset_all_clean_df.to_csv(DATA_LOCAL_FILE_PATH, index=False, sep=TARGET_DELIMITER)
    return raw_dataset_all_clean_df

_ = cleanAndPersisDat()

Read ../data/heart_disease_dataset.csv data..
The data has these columns:
['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'target', 'sex_0', 'sex_1', 'cp_0', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'fbs_0', 'fbs_1', 'restecg_0', 'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'thal_0.0', 'thal_1.0', 'thal_2.0', 'thal_3.0', 'thal_4.0']

With 303 rows and 30 columns
