<p style="font-weight:bold; letter-spacing: 2px; color:#F5F5DC; font-size:140%; text-align:left; max-width: 1050px; padding: 10px; border-bottom: 3px solid #D2B48C"> Naive Bayes Classification</p>

*Import Libraries*

In [89]:
# import custom classes, functions and variables. Reload file in to memory on cell excution.
import importlib
import settings
importlib.reload(settings)

# import data frameworks
import pandas as pd
import numpy as np

# import viz
import matplotlib.pyplot as plt
import seaborn as sns

# import ML
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as Pipeline_imb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
import statsmodels.api as sm
from sklearn.base import BaseEstimator, TransformerMixin # for custom classes
from imblearn.over_sampling import SMOTENC


# import others
import os
import pickle

*Import data from initial EDA*

In [78]:
df = pd.read_csv(os.path.join(settings.DATA_EDA_DIR, settings.DATA_EDA_FILE), sep=",")
if len(df) > 0:
    print("df loaded")

df loaded


*Binning numerical features*

In [80]:
# Bin age to new column and drop original column
df.loc[:,'Age Bin'] = pd.cut(df.loc[:,'Age'], 
[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, float('inf')], 
labels=['0-4', '5-9', '10-15', '10-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95-99', '>100']).astype("object")
df = df.drop("Age", axis=1)
# check new values in consolidated column
settings.examine_values(df).loc["Age Bin",:]

Unique Values    [25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59]
Name: Age Bin, dtype: object

In [81]:
# Bin sleep duration to new column and drop original column
df.loc[:,'Sleep Duration Bin'] = pd.cut(df.loc[:,'Sleep Duration'], 
[0, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, float('inf')], 
labels=['<4', '4.0-4.4', '4.5-4.9', '5.0-5.4', '5.5-5.9', '6.0-6.4', '6.5-6.9', '7.0-7.4', '7.5-7.9', '8.0-8.4', '8.5-8.9', '9.0-9.4', '9.5-9.9', '>10']).astype("object")
df = df.drop("Sleep Duration", axis=1)
# check new values in consolidated column
settings.examine_values(df).loc["Sleep Duration Bin",:]

Unique Values    [6.0-6.4, 5.5-5.9, 7.5-7.9, 7.0-7.4, 6.5-6.9, 8.0-8.4]
Name: Sleep Duration Bin, dtype: object

In [82]:
# Bin physical activity level to new column and drop original column
df.loc[:,'Physical Activity Level Bin'] = pd.cut(df.loc[:,'Physical Activity Level'], 
[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, float('inf')], 
labels=['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '>100']).astype("object")
df = df.drop("Physical Activity Level", axis=1)
# check new values in consolidated column
settings.examine_values(df).loc["Physical Activity Level Bin",:]

Unique Values    [40-49, 50-59, 20-29, 30-39, 70-79, 60-69, 80-89]
Name: Physical Activity Level Bin, dtype: object

In [83]:
# Bin heart rate to new column and drop original column
df.loc[:,'Heart Rate Bin'] = pd.cut(df.loc[:,'Heart Rate'], 
[0, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, float('inf')], 
labels=['<40', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95-99', '>100']).astype("object")
df = df.drop("Heart Rate", axis=1)
# check new values in consolidated column
settings.examine_values(df).loc["Heart Rate Bin",:]

Unique Values    [75-79, 70-74, 80-84, 65-69, 60-64, 85-89]
Name: Heart Rate Bin, dtype: object

In [84]:
# Bin daily steps to new column and drop original column
df.loc[:,'Daily Steps Bin'] = pd.cut(df.loc[:,'Daily Steps'], 
[0, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 100000, float('inf')], 
labels=['<3000', '3000-3499', '3500-3999', '4000-4499', '4500-4999', '5000-5499', '5500-5999', '6000-6499', '6500-6999', '7000-7499', '7500-7999', '8000-8499', '8500-8999', '9000-9499', '9500-9999', '>100']).astype("object")
df = df.drop("Daily Steps", axis=1)
# check new values in consolidated column
settings.examine_values(df).loc["Daily Steps Bin",:]

Unique Values    [4000-4499, 9500-9999, <3000, 3000-3499, 7500-7999, 3500-3999, 6500-6999, 4500-4999, 5000-5499, 5500-5999, 7000-7499, 6000-6499]
Name: Daily Steps Bin, dtype: object

*Split data and prepare labels*

In [85]:
ALL_FEATURES = ['Gender', 'Occupation', 'Stress Level', 'BMI Category', 'Blood Pressure Category', 'Age Bin',
       'Sleep Duration Bin', 'Quality of Sleep', 'Physical Activity Level Bin', 'Heart Rate Bin', 'Daily Steps Bin']

X = df[ALL_FEATURES]
y = df[settings.LABEL]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43, stratify=y)

In [86]:
# Encode y vector 
map = {"None":0, "Sleep Apnea":1, "Insomnia":2}
# apply mapping function
settings.column_mapper(y_train, "Sleep Disorder", map)
settings.column_mapper(y_test, "Sleep Disorder", map)

In [87]:
# reshape labels to 1-D array (vector) 
y_train, y_test = y_train.values.reshape(-1), y_test.values.reshape(-1)

In [113]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 19 to 95
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Gender                       261 non-null    object
 1   Occupation                   261 non-null    object
 2   Stress Level                 261 non-null    int64 
 3   BMI Category                 261 non-null    object
 4   Blood Pressure Category      261 non-null    object
 5   Age Bin                      261 non-null    object
 6   Sleep Duration Bin           261 non-null    object
 7   Quality of Sleep             261 non-null    int64 
 8   Physical Activity Level Bin  261 non-null    object
 9   Heart Rate Bin               261 non-null    object
 10  Daily Steps Bin              261 non-null    object
dtypes: int64(2), object(9)
memory usage: 24.5+ KB


*Preprocess steps*

In [115]:
# Create a column transformer for label encoding
column_trans = ColumnTransformer(
    transformers=[
        ('labelencoder', LabelEncoder(), X_train.select_dtypes(include=['object']).columns)  # slice(None) represents all columns
    ],
    remainder='passthrough')

# # Convert to df as a sense check. Use only for interpretibility before using for pipeline. 
# transformed_X_train_df = settings.convert_transformed_features_to_df(column_trans, column_trans.fit_transform(X_train))
# transformed_X_test_df = settings.convert_transformed_features_to_df(column_trans, column_trans.transform(X_test))

# Check ohe and scaled datasets
# display("train: check for dummy encoded columns and scaled values (mean 0 and std 1)",transformed_X_train_df.shape, transformed_X_train_df.describe().loc[["mean", "std"],:])
# display("test: check for same dummy encoded columns and scaled values (mean close to 0 and std close to 1)",transformed_X_test_df.shape, transformed_X_test_df.describe().loc[["mean", "std"],:])