In [26]:
# Import the modules
import numpy as np
import pandas as pd

from pathlib import Path
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Import the Data from the SQLite Database and Place in Dataframe

In [2]:
# Create Engine
engine = create_engine("sqlite:///lifestyle_sleep_data.sqlite")
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(autoload_with=engine)

# Save reference to the table
Sleep = Base.classes.lifestyle_sleep_data

In [3]:
# Create our session (link) from Python to the DB
session = Session(engine)
conn = engine.connect()

In [4]:
# Query All Records in the the Database
sleep_df = pd.read_sql("SELECT * FROM lifestyle_sleep_data", conn)
#sleep_df=sleep_df.drop(columns = "person_id")
sleep_df.head()

Unnamed: 0,person_id,gender,age,occupation,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,BMI_category,blood_pressure,heart_rate,daily_steps,sleep_disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [5]:
#Drop the person_id column
sleep_df=sleep_df.drop(columns = "person_id")

In [6]:
# Get all data types
sleep_df.dtypes

gender                      object
age                          int64
occupation                  object
sleep_duration             float64
quality_of_sleep             int64
physical_activity_level      int64
stress_level                 int64
BMI_category                object
blood_pressure              object
heart_rate                   int64
daily_steps                  int64
sleep_disorder              object
dtype: object

In [7]:
# Split 'blood_pressure' on '/' into two seperate columns and convert to integers.
sleep_df[['systolic_blood_pressure', 'diastolic_blood_pressure']] = sleep_df['blood_pressure'].str.split('/', n=1, expand= True)
sleep_df = sleep_df.drop(columns = 'blood_pressure')
sleep_df = sleep_df.astype({'systolic_blood_pressure': int, 'diastolic_blood_pressure': int})
sleep_df.head()

Unnamed: 0,gender,age,occupation,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,BMI_category,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


# Preprocess Data for Model

In [8]:
# Verify the categories of the "occupation" column
sleep_df['occupation'].value_counts()

Nurse                   73
Doctor                  71
Engineer                63
Lawyer                  47
Teacher                 40
Accountant              37
Salesperson             32
Software Engineer        4
Scientist                4
Sales Representative     2
Manager                  1
Name: occupation, dtype: int64

In [9]:
# Clean up occupation category.
sleep_df['occupation'] = sleep_df['occupation'].replace({'Software Engineer': 'Other', 
                                                         'Scientist': "Other", 
                                                         'Sales Representative': 'Other', 
                                                         'Manager':'Other'})

In [10]:
# Verify clean-up
sleep_df['occupation'].value_counts()

Nurse          73
Doctor         71
Engineer       63
Lawyer         47
Teacher        40
Accountant     37
Salesperson    32
Other          11
Name: occupation, dtype: int64

In [11]:
# Transform the occupation column using get_dummies
occupation_dummies = pd.get_dummies(sleep_df['occupation'])

# Concatenate the sleep_df and the occupation_dummies DataFrames
sleep_df = pd.concat([sleep_df, occupation_dummies], axis=1)

# Drop the original occupation column
sleep_df =sleep_df.drop(columns=["occupation"])

# Display the DataFrame
sleep_df.head()


Unnamed: 0,gender,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,BMI_category,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,Accountant,Doctor,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher
0,Male,27,6.1,6,42,6,Overweight,77,4200,,126,83,0,0,0,0,0,1,0,0
1,Male,28,6.2,6,60,8,Normal,75,10000,,125,80,0,1,0,0,0,0,0,0
2,Male,28,6.2,6,60,8,Normal,75,10000,,125,80,0,1,0,0,0,0,0,0
3,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,0,0,0,0,0,1,0,0
4,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,0,0,0,0,0,1,0,0


In [12]:
# Verify the categories of the "BMI_category" column
sleep_df['BMI_category'].value_counts()

Normal           195
Overweight       148
Normal Weight     21
Obese             10
Name: BMI_category, dtype: int64

In [13]:
# Clean-up the "BMI-category"
sleep_df["BMI_category"] = sleep_df["BMI_category"].replace({'Normal': 'Normal Weight', 'Obese': 'Overweight'})

In [14]:
# Verify clean-up
sleep_df['BMI_category'].value_counts()

Normal Weight    216
Overweight       158
Name: BMI_category, dtype: int64

In [15]:
# Transform the "BMI_category" column using get_dummies
bmi_dummies = pd.get_dummies(sleep_df['BMI_category'])

# Concatenate the sleep_df and the bmi_dummies DataFrames
sleep_df = pd.concat([sleep_df, bmi_dummies], axis=1)

# Drop the original "BMI_category"  column
sleep_df =sleep_df.drop(columns=["BMI_category"])

# Display the DataFrame
sleep_df.head()


Unnamed: 0,gender,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,...,Accountant,Doctor,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight
0,Male,27,6.1,6,42,6,77,4200,,126,...,0,0,0,0,0,1,0,0,0,1
1,Male,28,6.2,6,60,8,75,10000,,125,...,0,1,0,0,0,0,0,0,1,0
2,Male,28,6.2,6,60,8,75,10000,,125,...,0,1,0,0,0,0,0,0,1,0
3,Male,28,5.9,4,30,8,85,3000,Sleep Apnea,140,...,0,0,0,0,0,1,0,0,0,1
4,Male,28,5.9,4,30,8,85,3000,Sleep Apnea,140,...,0,0,0,0,0,1,0,0,0,1


In [16]:
# Verify the values of the "gender" column
sleep_df['gender'].value_counts()

Male      189
Female    185
Name: gender, dtype: int64

In [17]:
# Transform the "gender" column using get_dummies
gender_dummies = pd.get_dummies(sleep_df['gender'])

# Concatenate the sleep_df and the gender_dummies DataFrames
sleep_df = pd.concat([sleep_df, gender_dummies], axis=1)

# Drop the original "gender" column
sleep_df =sleep_df.drop(columns=["gender"])

# Display the DataFrame
sleep_df.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,...,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male
0,27,6.1,6,42,6,77,4200,,126,83,...,0,0,0,1,0,0,0,1,0,1
1,28,6.2,6,60,8,75,10000,,125,80,...,0,0,0,0,0,0,1,0,0,1
2,28,6.2,6,60,8,75,10000,,125,80,...,0,0,0,0,0,0,1,0,0,1
3,28,5.9,4,30,8,85,3000,Sleep Apnea,140,90,...,0,0,0,1,0,0,0,1,0,1
4,28,5.9,4,30,8,85,3000,Sleep Apnea,140,90,...,0,0,0,1,0,0,0,1,0,1


In [18]:
# Verify the values of the "sleep_disorder" column
sleep_df['sleep_disorder'].value_counts()

None           219
Sleep Apnea     78
Insomnia        77
Name: sleep_disorder, dtype: int64

In [19]:
# Clean-up the "sleep_disorder" column
sleep_df["sleep_disorder"] = sleep_df["sleep_disorder"].replace({'Sleep Apnea': 'Sleep Disorder', 'Insomnia': 'Sleep Disorder'})
# Verify the values of the "sleep_disorder" column
sleep_df['sleep_disorder'].value_counts()

None              219
Sleep Disorder    155
Name: sleep_disorder, dtype: int64

In [20]:
# Encoding the sleep_disorder column using a custom function
def encode_disorder(disorder):
    """
    This function encodes sleep disorder status by setting sleep apnea as 1 and no as 0.
    """
    if disorder == "Sleep Disorder":
        return 1
    else:
        return 0

# Call the encode_marriage function on the marriage column
sleep_df["sleep_disorder"] = sleep_df["sleep_disorder"].apply(encode_disorder)

# Review the DataFrame 
sleep_df.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,sleep_disorder,systolic_blood_pressure,diastolic_blood_pressure,...,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male
0,27,6.1,6,42,6,77,4200,0,126,83,...,0,0,0,1,0,0,0,1,0,1
1,28,6.2,6,60,8,75,10000,0,125,80,...,0,0,0,0,0,0,1,0,0,1
2,28,6.2,6,60,8,75,10000,0,125,80,...,0,0,0,0,0,0,1,0,0,1
3,28,5.9,4,30,8,85,3000,1,140,90,...,0,0,0,1,0,0,0,1,0,1
4,28,5.9,4,30,8,85,3000,1,140,90,...,0,0,0,1,0,0,0,1,0,1


In [27]:
 # Scaling the numeric columns
# sleep_data_scaled = StandardScaler().fit_transform(sleep_df[["age", "sleep_duration", "quality_of_sleep", "physical_activity_level", "stress_level", "heart_rate", "diastolic_blood_pressure", "systolic_blood_pressure", "daily_steps"]])


# Split the Data into Training and Testing Sets

In [21]:
# Seperate the features, X,  from the target variable, y
y = sleep_df['sleep_disorder']
X = sleep_df.drop(columns='sleep_disorder')

In [22]:
# Preview the features data
X.head()

Unnamed: 0,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps,systolic_blood_pressure,diastolic_blood_pressure,Accountant,...,Engineer,Lawyer,Nurse,Other,Salesperson,Teacher,Normal Weight,Overweight,Female,Male
0,27,6.1,6,42,6,77,4200,126,83,0,...,0,0,0,1,0,0,0,1,0,1
1,28,6.2,6,60,8,75,10000,125,80,0,...,0,0,0,0,0,0,1,0,0,1
2,28,6.2,6,60,8,75,10000,125,80,0,...,0,0,0,0,0,0,1,0,0,1
3,28,5.9,4,30,8,85,3000,140,90,0,...,0,0,0,1,0,0,0,1,0,1
4,28,5.9,4,30,8,85,3000,140,90,0,...,0,0,0,1,0,0,0,1,0,1


In [23]:
# Preview the first five entries for the target variable
y[:5]

0    0
1    0
2    0
3    1
4    1
Name: sleep_disorder, dtype: int64

In [24]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

## Create a Logistic Regression Model 

In [25]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Predictions

In [34]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(X_test)
pd.DataFrame({"Prediction": testing_predictions, "Actual": y_test}).head()

Unnamed: 0,Prediction,Actual
342,0,0
256,1,1
226,1,1
317,0,0
288,1,1


## Accuracy Score

In [29]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, testing_predictions)

0.9086247086247086

## Confusion Matrix

In [30]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[52  3]
 [ 5 34]]


## Classification Report

In [33]:
# Print the classification report for the model
target_names = ['No Sleep Disorder', "Sleep Disorder"]
testing_report = classification_report(y_test, testing_predictions, target_names = target_names)

# Print the testing classification report
print(testing_report)

                   precision    recall  f1-score   support

No Sleep Disorder       0.91      0.95      0.93        55
   Sleep Disorder       0.92      0.87      0.89        39

         accuracy                           0.91        94
        macro avg       0.92      0.91      0.91        94
     weighted avg       0.92      0.91      0.91        94



In [35]:
session.close()