In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# global variables
# Relative path to the dataset file; it is passed to load_data(), which reads it
# with pd.read_csv(..., header=None), i.e. as a file without a header row.
dataset_file_path = 'data/processed.cleveland.data'

In [2]:
def load_data(filename):
    '''
    Description: reads the specified header-less .csv file, assigns the 14
                 column names listed below and splits it into features and labels.
    Arguments: filename - path of the .csv file to read
    Return: data - dataframe containing all 14 columns
            X - dataframe containing the features (every column except the last)
            y - series containing the labels (the last column, 'num')
    Raises: any error raised by pd.read_csv (e.g. FileNotFoundError) is re-raised
            after "Unable to read .csv" has been printed;
            ValueError if the file does not contain exactly 14 columns.

    > 0. age
    > 1. sex
    > 2. chest pain type(4 values)
    > 3. resting blood pressure
    > 4. serum cholesterol in mg/dl
    > 5. fasting blood sugar > 120 mg/dl
    > 6. resting electrocardiographic results(values 0, 1, 2)
    > 7. maximum heart rate achieved
    > 8. exercise induced angina
    > 9. oldpeak = ST depression induced by exercise relative to rest
    > 10. the slope of the peak exercise ST segment
    > 11. number of major vessels(0-3) colored by fluoroscopy
    > 12. thal: 3 = normal, 6 = fixed defect, 7 = reversible defect
    > 13. num: diagnosis, 0 = no presence, values above 0 = presence
    '''

    # reading the data
    print("Reading .csv")
    try:
        data = pd.read_csv(filename, header=None)
    except Exception:
        # Report the failure, then propagate the real cause. Swallowing it here
        # would only surface later as a confusing "data is not defined" error.
        print("Unable to read .csv")
        raise
    print("Finished reading .csv")

    # set column names
    attributes = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
    data.columns = attributes

    # features are every column but the last; the label is the last column
    X, y = data.iloc[:, 0:-1], data.iloc[:, -1]

    return data, X, y

In [3]:
# loading data
data, X, y = load_data(dataset_file_path)

# binarise the labels: True for heart disease (num > 0), False for no heart disease
y = y.gt(0)

Reading .csv
Finished reading .csv


In [10]:
# splitting data
# A fixed seed makes the shuffle deterministic, so the train/test CSV files
# written below are the same on every run instead of changing each time the
# cell is executed.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    shuffle=True,
                                                    random_state=split_seed)

# persist the split (80% train / 20% test) for later use
X_train.to_csv('data/train_data.csv')
X_test.to_csv('data/test_data.csv')
y_train.to_csv('data/train_labels.csv')
y_test.to_csv('data/test_labels.csv')

  
  import sys
