In [1]:
#Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras_tuner as kt
from pathlib import Path
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func


Load Data:

In [2]:
#import data

# Create Engine
engine = create_engine("sqlite:///../lifestyle_sleep_data.sqlite")
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(autoload_with=engine)

# Save reference to the table
Sleep = Base.classes.lifestyle_sleep_data


In [3]:
# Create our session (link) from Python to the DB
session = Session(engine)
conn = engine.connect()

In [4]:
# Query All Records in the the Database
sleep_df = pd.read_sql("SELECT * FROM lifestyle_sleep_data", conn)
sleep_df=sleep_df.drop(columns = "person_id")
sleep_df.head()

Unnamed: 0,gender,age,occupation,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,BMI_category,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


PreProcess Data:

In [5]:
# Verify the categories of the "occupation" column
occupations=sleep_df['occupation'].value_counts()
occupations

occupation
Nurse                   73
Doctor                  71
Engineer                63
Lawyer                  47
Teacher                 40
Accountant              37
Salesperson             32
Software Engineer        4
Scientist                4
Sales Representative     2
Manager                  1
Name: count, dtype: int64

In [6]:
#bin occupations with fewer than 30 subjects
#determine which occupations qualify and add to a list
occupations_replace=[]
for occ in occupations.items():
    if occ[1]<30:
        occupations_replace.append(occ[0])
#replace said occupations with "Other"
for occ in occupations_replace:
    sleep_df['occupation']=sleep_df['occupation'].replace(occ,"Other")
#verify successful binning
sleep_df['occupation'].value_counts()

occupation
Nurse          73
Doctor         71
Engineer       63
Lawyer         47
Teacher        40
Accountant     37
Salesperson    32
Other          11
Name: count, dtype: int64

In [7]:
# Transform the occupation column using get_dummies
occupation_dummies = pd.get_dummies(sleep_df['occupation'])

# Concatenate the sleep_df and the occupation_dummies DataFrames
sleep_df = pd.concat([sleep_df, occupation_dummies], axis=1)

# Drop the original occupation column
sleep_df =sleep_df.drop(columns=["occupation"])

# Display the DataFrame
sleep_df.head()

Unnamed: 0,gender,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,BMI_category,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,Accountant,Doctor,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher
0,Male,27,6.1,6,42,6,Overweight,77,4200,,126,83,False,False,False,False,False,True,False,False
1,Male,28,6.2,6,60,8,Normal,75,10000,,125,80,False,True,False,False,False,False,False,False
2,Male,28,6.2,6,60,8,Normal,75,10000,,125,80,False,True,False,False,False,False,False,False
3,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,False,False,False,False,False,True,False,False
4,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,False,False,False,False,False,True,False,False


In [8]:
# Clean-up the "BMI-category"
sleep_df["BMI_category"] = sleep_df["BMI_category"].replace({'Normal': 'Normal Weight', 'Obese': 'Overweight'})
sleep_df['BMI_category'].value_counts()

BMI_category
Normal Weight    216
Overweight       158
Name: count, dtype: int64

In [9]:
# Transform the "BMI_category" column using get_dummies
bmi_dummies = pd.get_dummies(sleep_df['BMI_category'])

# Concatenate the sleep_df and the bmi_dummies DataFrames
sleep_df = pd.concat([sleep_df, bmi_dummies], axis=1)

# Drop the original "BMI_category"  column
sleep_df =sleep_df.drop(columns=["BMI_category"])

# Display the DataFrame
sleep_df.head()

Unnamed: 0,gender,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,...,Accountant,Doctor,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight
0,Male,27,6.1,6,42,6,77,4200,,126,...,False,False,False,False,False,True,False,False,False,True
1,Male,28,6.2,6,60,8,75,10000,,125,...,False,True,False,False,False,False,False,False,True,False
2,Male,28,6.2,6,60,8,75,10000,,125,...,False,True,False,False,False,False,False,False,True,False
3,Male,28,5.9,4,30,8,85,3000,Sleep Apnea,140,...,False,False,False,False,False,True,False,False,False,True
4,Male,28,5.9,4,30,8,85,3000,Sleep Apnea,140,...,False,False,False,False,False,True,False,False,False,True


In [10]:
# Verify the values of the "gender" column
sleep_df['gender'].value_counts()

gender
Male      189
Female    185
Name: count, dtype: int64

In [11]:
# Transform the "gender" column using get_dummies
gender_dummies = pd.get_dummies(sleep_df['gender'])

# Concatenate the sleep_df and the gender_dummies DataFrames
sleep_df = pd.concat([sleep_df, gender_dummies], axis=1)

# Drop the original "gender" column
sleep_df =sleep_df.drop(columns=["gender"])

# Display the DataFrame
sleep_df.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,...,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male
0,27,6.1,6,42,6,77,4200,,126,83,...,False,False,False,True,False,False,False,True,False,True
1,28,6.2,6,60,8,75,10000,,125,80,...,False,False,False,False,False,False,True,False,False,True
2,28,6.2,6,60,8,75,10000,,125,80,...,False,False,False,False,False,False,True,False,False,True
3,28,5.9,4,30,8,85,3000,Sleep Apnea,140,90,...,False,False,False,True,False,False,False,True,False,True
4,28,5.9,4,30,8,85,3000,Sleep Apnea,140,90,...,False,False,False,True,False,False,False,True,False,True


In [12]:
# Verify the values of the "sleep_disorder" column
sleep_df['sleep_disorder'].value_counts()

sleep_disorder
None           219
Sleep Apnea     78
Insomnia        77
Name: count, dtype: int64

In [13]:
#diverge sheets with one preserving Sleep Apnea and Insomnia, vs combining into a single bin
sleep_df2=sleep_df.copy()

In [14]:
# Clean-up the "sleep_disorder" column
sleep_df["sleep_disorder"] = sleep_df["sleep_disorder"].replace({'Sleep Apnea': 'Sleep Disorder', 'Insomnia': 'Sleep Disorder'})
# Verify the values of the "sleep_disorder" column
sleep_df['sleep_disorder'].value_counts()

sleep_disorder
None              219
Sleep Disorder    155
Name: count, dtype: int64

In [15]:
# Encoding the sleep_disorder column using a custom function
def encode_disorder(disorder):
    """
    This function encodes sleep disorder status by setting sleep apnea as 1 and no as 0.
    """
    if disorder == "Sleep Disorder":
        return 1
    else:
        return 0

# Call the encode_marriage function on the marriage column
sleep_df["sleep_disorder"] = sleep_df["sleep_disorder"].apply(encode_disorder)

# Review the DataFrame 
sleep_df.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,...,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male
0,27,6.1,6,42,6,77,4200,0,126,83,...,False,False,False,True,False,False,False,True,False,True
1,28,6.2,6,60,8,75,10000,0,125,80,...,False,False,False,False,False,False,True,False,False,True
2,28,6.2,6,60,8,75,10000,0,125,80,...,False,False,False,False,False,False,True,False,False,True
3,28,5.9,4,30,8,85,3000,1,140,90,...,False,False,False,True,False,False,False,True,False,True
4,28,5.9,4,30,8,85,3000,1,140,90,...,False,False,False,True,False,False,False,True,False,True


In [16]:
# Transform the "sleep_disorder" column using get_dummies
disorder_dummies = pd.get_dummies(sleep_df2['sleep_disorder'])

# Identify target columns for later
target_col=list(disorder_dummies.columns)

# Concatenate the sleep_df and the disorder_dummies DataFrames
sleep_df2 = pd.concat([sleep_df2, disorder_dummies], axis=1)

# Drop the original "sleep_disorder" column
sleep_df2 = sleep_df2.drop(columns=["sleep_disorder"])

# Display the DataFrame
sleep_df2.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,systolic_blood_pressure,diastolic_blood_pressure,Accountant,...,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male,Insomnia,None,Sleep Apnea
0,27,6.1,6,42,6,77,4200,126,83,False,...,True,False,False,False,True,False,True,False,True,False
1,28,6.2,6,60,8,75,10000,125,80,False,...,False,False,False,True,False,False,True,False,True,False
2,28,6.2,6,60,8,75,10000,125,80,False,...,False,False,False,True,False,False,True,False,True,False
3,28,5.9,4,30,8,85,3000,140,90,False,...,True,False,False,False,True,False,True,False,False,True
4,28,5.9,4,30,8,85,3000,140,90,False,...,True,False,False,False,True,False,True,False,False,True


Set Up Optimizers:

In [17]:
# Create a method that creates a new Sequential model with hyperparameter options
# from code used in activity 22.2.4
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=21))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [18]:
# Create a method that creates a new Sequential model with hyperparameter options for 3 outputs
# adapted from code used in activity 22.2.4
def create_model2(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=21))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=3, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [19]:
#Tuner for binary output
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=25,
    hyperband_iterations=2,
    overwrite=True,
    directory='project')

In [20]:
#tuner for 3 potential outputs
tuner2 = kt.Hyperband(
    create_model2,
    objective="val_accuracy",
    max_epochs=25,
    hyperband_iterations=2,
    overwrite=True,
    directory='project2'
    )

Set Up Neural Network:

In [21]:
# Seperate the features, X,  from the target variable, y
y = sleep_df['sleep_disorder']
X = sleep_df.drop(columns='sleep_disorder')
# Again, for 3 output variables:
y2 = sleep_df2[target_col]
X2 = sleep_df2.drop(columns=target_col)

In [22]:
#split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
#split dataset again, for 3 output variables
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=1)

In [23]:
#Scale X
scaler = StandardScaler()
scaler2 = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
X_scaler2 = scaler2.fit(X_train2)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled2 = X_scaler2.transform(X_train2)
X_test_scaled2 = X_scaler2.transform(X_test2)

In [25]:
# Run the kerastuner search for best hyperparameters for single output:
tuner.search(X_train_scaled,y_train,epochs=25,validation_data=(X_test_scaled,y_test))

INFO:tensorflow:Oracle triggered exit


In [25]:
# Get best model hyperparameters for single output:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 5,
 'num_layers': 6,
 'units_0': 7,
 'units_1': 3,
 'units_2': 1,
 'units_3': 5,
 'units_4': 9,
 'units_5': 3,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [26]:
# Evaluate best model against full test data for single output:
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: 0.3741 - accuracy: 0.9255 - 271ms/epoch - 90ms/step
Loss: 0.3741263449192047, Accuracy: 0.9255319237709045


In [27]:
# Run the kerastuner search for best hyperparameters for 3 category output:
tuner2.search(X_train_scaled2,y_train2,epochs=20,validation_data=(X_test_scaled2,y_test2))

Trial 60 Complete [00h 00m 03s]
val_accuracy: 0.563829779624939

Best val_accuracy So Far: 0.8829787373542786
Total elapsed time: 00h 02m 25s
INFO:tensorflow:Oracle triggered exit


In [28]:
# Get best model hyperparameters for 3 category output:
best_hyper2 = tuner2.get_best_hyperparameters(1)[0]
best_hyper2.values

{'activation': 'tanh',
 'first_units': 9,
 'num_layers': 1,
 'units_0': 7,
 'units_1': 5,
 'units_2': 1,
 'units_3': 5,
 'units_4': 1,
 'units_5': 7,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [29]:
# Evaluate best model against full test data for 3 category output:
best_model2 = tuner2.get_best_models(1)[0]
model_loss2, model_accuracy2 = best_model2.evaluate(X_test_scaled2,y_test2,verbose=2)
print(f"Loss: {model_loss2}, Accuracy: {model_accuracy2}")

3/3 - 0s - loss: 0.4203 - accuracy: 0.8830 - 197ms/epoch - 66ms/step
Loss: 0.4202548861503601, Accuracy: 0.8829787373542786
