# Modeling: Designing and evaluating a classifier

Imports and definitions

In [36]:
import pandas as pd
import numpy as np
import glob, os
from datetime import timedelta
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, cross_validate, cross_val_predict

from src.utils import compute_metrics
%matplotlib inline

# Figure defaults shared by every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (10, 5),
    'figure.dpi': 200,
})
#from src.utils import read_sleep_file
# The processed data set is looked up under ./out relative to the working directory.
PWD = os.getcwd()
DATA_FILE = os.path.join(PWD, 'out', 'processed_data.csv')

Read data

In [37]:
# Load the processed sleep records; the file is semicolon-separated.
sleep_data = pd.read_csv(filepath_or_buffer=DATA_FILE, sep=';')
# Show the first rows as a quick check of the loaded columns.
sleep_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,calendar_date,local_start_time,local_end_time,duration_in_seconds,awake_time,deep_sleep_time,rem_sleep_time,total_sleep_hours,...,awake_percentage,local_start_time_seconds,local_end_time_seconds,total_sleep_hours_round,perceived_sleep_quality,bad_sleep_outlier_binary,good_night,median_hr_sleep,median_stress_sleep,sleep_event_number
0,0,0.0,2022-04-06,2022-04-06 22:14:00,2022-04-07 07:33:00,33540.0,0.0,4200.0,6660.0,9.317,...,0.0,1649283240.0,1649316780.0,9.0,4,0,1,46.0,12.0,1
1,1,0.0,2022-04-07,2022-04-08 03:31:00,2022-04-08 08:48:00,19020.0,300.0,4800.0,4440.0,5.283,...,0.016,1649388660.0,1649407680.0,5.0,3,0,0,54.0,18.0,2
2,2,0.0,2022-04-08,2022-04-08 22:33:00,2022-04-09 07:23:00,31800.0,120.0,5040.0,7380.0,8.833,...,0.004,1649457180.0,1649488980.0,8.0,5,0,1,53.0,19.0,3
3,3,0.0,2022-04-09,2022-04-09 23:31:00,2022-04-10 06:44:00,25980.0,420.0,4140.0,6180.0,7.217,...,0.016,1649547060.0,1649573040.0,7.0,4,0,1,51.0,15.0,4
4,4,0.0,2022-04-10,2022-04-11 02:24:00,2022-04-11 06:43:00,15540.0,1200.0,1680.0,2100.0,4.317,...,0.077,1649643840.0,1649659380.0,4.0,2,1,0,70.0,51.0,5


Fit a baseline logistic regression with statsmodels

In [38]:
# Columns of the baseline model: three features and one 0/1 label column.
PREDICTORS = ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
TARGET = ['bad_sleep_outlier_binary']

# Keep only the rows where every selected column is present.
df = sleep_data[PREDICTORS + TARGET].dropna()
y = df[TARGET]
# Prepend a constant column so the model gets an intercept term.
X = sm.add_constant(df[PREDICTORS])

mod = sm.Logit(y, X)
# NOTE(review): `pred` is only a second name for the fitted results object,
# not a vector of predictions — confirm whether anything still uses it.
res = pred = mod.fit()
print(res.summary())


Optimization terminated successfully.
         Current function value: 0.214740
         Iterations 9
                              Logit Regression Results                              
Dep. Variable:     bad_sleep_outlier_binary   No. Observations:                   41
Model:                                Logit   Df Residuals:                       37
Method:                                 MLE   Df Model:                            3
Date:                      Fri, 07 Oct 2022   Pseudo R-squ.:                  0.4209
Time:                              15:48:30   Log-Likelihood:                -8.8044
converged:                             True   LL-Null:                       -15.203
Covariance Type:                  nonrobust   LLR p-value:                  0.005098
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               5.1256      3.121      1.642  

In [39]:
# Render floats with three decimals and a thousands separator in pandas output.
pd.set_option('display.float_format', '{:,.3f}'.format)

def do_analysis(predictors, target, penalty='l2', cv=0, st=False):
    """Fit a logistic regression on ``sleep_data`` and print how well it classifies.

    Rows with a missing value in any of the selected columns are dropped
    before fitting.

    Parameters
    ----------
    predictors : list of str
        Feature column names.
    target : list of str
        List holding the label column name; it is concatenated with
        ``predictors`` to select columns and the labels are flattened to 1-D.
    penalty : str, default 'l2'
        Regularisation type forwarded to ``LogisticRegression``.
    cv : int, default 0
        Number of cross-validation folds. With ``0`` the model is fitted and
        scored on the same rows.
    st : bool, default False
        When cross-validating, split with ``StratifiedKFold`` instead of
        ``KFold``.

    Returns
    -------
    None
        Results are printed; the confusion matrix is handed to
        ``compute_metrics``.
    """
    print(80*'-')
    print("Predictors:", predictors)
    print("Target:", target)
    print("Penalty:", penalty)

    df = sleep_data[predictors + target].dropna()
    X = df[predictors]
    y = df[target].values.ravel()

    # NOTE(review): features are passed unscaled, and a regularised fit is
    # sensitive to feature scale — consider standardising inside the CV loop.
    model_kwargs = {'random_state': 0, 'penalty': penalty}
    if penalty == 'l1':
        # The default lbfgs solver rejects an L1 penalty; liblinear supports it.
        # Every other penalty keeps scikit-learn's default solver.
        model_kwargs['solver'] = 'liblinear'
    mod = LogisticRegression(**model_kwargs)

    if cv > 0:
        if st:
            kf = StratifiedKFold(n_splits=cv)
            print("Using stratified cross-validation")
        else:
            kf = KFold(n_splits=cv)
            print("Using regular cross-validation")

        # One pass over the folds: every row is predicted by a model that did
        # not see it during training.
        ypred = cross_val_predict(mod, X, y, cv=kf)

        # The splitter is not shuffled, so iterating it again reproduces the
        # folds used above; per-fold accuracy therefore needs no second round
        # of model fitting.
        fold_scores = [accuracy_score(y[test_idx], ypred[test_idx])
                       for _, test_idx in kf.split(X, y)]

        # Micro: accuracy over all pooled out-of-fold predictions.
        # Macro: unweighted mean of the per-fold accuracies.
        micro_accuracy = accuracy_score(y, ypred)
        macro_accuracy = np.mean(fold_scores)
        print(f"Macro accuracy = {macro_accuracy:.3f}")
        print(f"Micro accuracy = {micro_accuracy:.3f}")
    else:
        # No held-out data: this is training-set accuracy.
        mod.fit(X, y)
        accuracy = mod.score(X, y)
        ypred = mod.predict(X)
        print(f"Accuracy = {accuracy:.3f}")

    cm = confusion_matrix(y, ypred)
    compute_metrics(cm)
    
# Evaluate every target / feature-set combination in three ways:
# on the training data, with plain 5-fold CV, and with stratified 5-fold CV.
for target in (['bad_sleep_outlier_binary'], ['good_night']):
    for predictors in (
        ['awake_time', 'deep_sleep_time', 'rem_sleep_time'],
        ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'],
        ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep'],
    ):
        for cv_options in ({}, {'cv': 5}, {'cv': 5, 'st': True}):
            do_analysis(predictors, target, **cv_options)


--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Accuracy = 0.854
Confusion matrix:
[[34  2]
 [ 4  1]]
Precision = 0.333, Recall = 0.200, F1 = 0.250
Sensitivity = 0.200, Specificity =  0.944
Balanced accuracy =  0.572
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using regular cross-validation
Macro accuracy = 0.808
Micro accuracy = 0.805
Confusion matrix:
[[32  4]
 [ 4  1]]
Precision = 0.200, Recall = 0.200, F1 = 0.200
Sensitivity = 0.200, Specificity =  0.889
Balanced accuracy =  0.544
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using stratified cro