In [1]:
# Import dependencies
from pathlib import Path
import pandas as pd
import sqlite3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras_tuner as kt

In [2]:
# Connect to the database
con = sqlite3.connect('resources/heart.sqlite') 

# Run SQL on table         
sql_query = pd.read_sql('SELECT * FROM original', con)

# Display query results
sql_query.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Convert SQL to Pandas DataFrame
df = pd.DataFrame(sql_query, columns = ['Age',
                                        'Sex',
                                        'ChestPainType',
                                        'RestingBP',
                                        'Cholesterol',
                                        'FastingBS',
                                        'RestingECG',
                                        'MaxHR',
                                        'ExerciseAngina',
                                        'Oldpeak',
                                        'ST_Slope',
                                        'HeartDisease']
                 )

# Preview newly created DF
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Check columns and data types
print(df.columns)
df.dtypes

# Upon initial review, no columns need recasting (.astype)

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')


Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [5]:
# Generate summary statistics for dataset
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
# Check for nulls
df.isnull().sum()

# The dataset does not appear to contain any explicit null (NaN) values

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
# Check for values of zero in features that are int or float, as these may be spurious
age_zero = (df[['Age', 'RestingBP', 'Cholesterol', 'FastingBS',	'MaxHR', 'Oldpeak']] == 0).sum()
print(age_zero)
# The zeros in cholesterol and RestingBP are undesirbale. These zeros are addressed in Data_Cleaning.ipynb

Age              0
RestingBP        1
Cholesterol    172
FastingBS      704
MaxHR            0
Oldpeak        368
dtype: int64


In [None]:
#Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=20))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [None]:
def hyperband_tuner_best_model(df, directory):
    # Split our preprocessed data into our features and target arrays
    y = df['HeartDisease']
    X = df.drop(columns='HeartDisease')
    
    # Convert categorical data to numeric with `pd.get_dummies`
    X = pd.get_dummies(X)
    X.head()

    # Split the preprocessed data into a training and testing dataset, used random_state = 1 for inital results
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Create a StandardScaler instances
    scaler = StandardScaler()

    # Fit the StandardScaler
    X_scaler = scaler.fit(X_train)

    # Scale the data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    X_train.shape
    # Our models will be training on 20 parameters.

    #Create the tuner with the hyperband method

    tuner = kt.Hyperband(
        create_model,
        objective="val_accuracy",
        max_epochs=20,
        overwrite=True,
        directory = directory,
        hyperband_iterations=1)

    #Run the kerastuner search for best hyperparameters
    tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

    #Get best model hyperparameters
    best_hyper = tuner.get_best_hyperparameters(1)[0]
    best_hyper.values

    # Evaluate best model against full test data
    best_model = tuner.get_best_models(1)[0]
    model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
#Run the hyperband tuner with the initial data
hyperband_tuner_best_model(df, "initial_data")

### Initial Accuracy: 91.74%, Loss: 30.31%

### Attempt 2 with No Zeros

In [None]:
# Create a path reference to the file 
heart_path_no_zero = Path("resources/heart_no_zeros.csv")

# Read CSV into DataFrame
df_no_zeros = pd.read_csv(heart_path_no_zero, delimiter=",")

# Preview DataFrame
df_no_zeros.head()

In [None]:
#Run the hyperband tuner with the data with the zeros removed
hyperband_tuner_best_model(df_no_zeros, "no_zeros")

### Accuracy 89.30%, Loss 45.07%

### Attempt 3 with Replaced Values

In [None]:
# Create a path reference to the file 
heart_path_replaced = Path("resources/heart_replaced_values.csv")

# Read CSV into DataFrame
df_replaced = pd.read_csv(heart_path_replaced, delimiter=",")

# Preview DataFrame
df_replaced.head()

In [None]:
#Run the hyperband tuner with the data with the zeros replaced
hyperband_tuner_best_model(df_replaced, "replaced_values")

### Accuracy 90.9%, Loss 30.3%

In [None]:
#Make a new df with one-hot encoding in order to find the correlation between the variables and HeartDisease
df_corr = pd.get_dummies(df_replaced)
correlations = df_corr.corr()["HeartDisease"]
correlations.sort_values(ascending = False)