# Modeling: Designing and evaluating a classifier

Imports and definitions

In [1]:
import pandas as pd
import numpy as np
import glob, os
from datetime import timedelta
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, cross_validate, cross_val_predict

from src.utils import compute_metrics
%matplotlib inline

# Default size and resolution applied to every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['figure.dpi'] = 200
#from src.utils import read_sleep_file
# The input path is built from the current working directory, i.e.
# <cwd>/out/processed_data.csv — the notebook must be started from the
# directory that contains `out/`.
PWD = os.getcwd()
DATA_FILE = os.path.join(PWD, 'out', 'processed_data.csv')

Read data

In [2]:
# The processed file uses ';' as the field separator, hence the explicit `sep`.
sleep_data = pd.read_csv(DATA_FILE, sep=';')
# NOTE(review): the preview contains 'Unnamed: 0.1' / 'Unnamed: 0' / 'index'
# columns — presumably row indices written out by earlier to_csv calls;
# confirm and drop them where the file is produced.
sleep_data.head()

Unnamed: 0.1,Unnamed: 0,index,calendar_date,local_start_time,local_end_time,duration_in_seconds,awake_time,deep_sleep_time,rem_sleep_time,total_sleep_hours,...,awake_percentage,local_start_time_seconds,local_end_time_seconds,total_sleep_hours_round,perceived_sleep_quality,bad_sleep_outlier_binary,good_night,median_hr_sleep,median_stress_sleep,sleep_event_number
0,0,0.0,2022-07-10,2022-07-10 23:32:00,2022-07-11 08:10:00,31080.0,0.0,7980.0,4860.0,8.633333,...,0.0,1657496000.0,1657527000.0,8.0,4,0,1,46.0,8.0,1
1,1,0.0,2022-07-11,2022-07-11 23:52:00,2022-07-12 08:02:00,29400.0,120.0,5520.0,4860.0,8.166667,...,0.004082,1657584000.0,1657613000.0,8.0,3,0,0,49.0,16.0,2
2,2,0.0,2022-07-12,2022-07-12 23:20:00,2022-07-13 07:25:00,29100.0,180.0,3660.0,5640.0,8.083333,...,0.006186,1657668000.0,1657697000.0,8.0,2,1,0,46.0,11.0,3
3,3,0.0,2022-07-13,2022-07-13 23:34:00,2022-07-14 07:47:00,29580.0,480.0,2700.0,4500.0,8.216667,...,0.016227,1657755000.0,1657785000.0,8.0,3,0,0,45.0,10.0,4
4,4,0.0,2022-07-14,2022-07-15 00:00:00,2022-07-15 07:52:00,28320.0,180.0,1320.0,8040.0,7.866667,...,0.006356,1657843000.0,1657872000.0,7.0,2,1,0,45.0,9.5,5


Fit a baseline logistic regression (statsmodels Logit)

In [3]:
# Baseline model: statsmodels logistic regression of the binary target on
# three sleep-stage durations, used here for its coefficient summary table.
PREDICTORS = ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
TARGET = ['bad_sleep_outlier_binary']
# Keep only the modelling columns and drop rows with a missing value in any of them.
df = sleep_data[PREDICTORS + TARGET].dropna()
X = df[PREDICTORS]
# TARGET is a list, so `y` is a one-column DataFrame rather than a Series.
y = df[TARGET]

# Add the intercept column (reported as `const` in the summary).
X = sm.add_constant(X)

mod = sm.Logit(y, X)
res = mod.fit()
# NOTE(review): `pred` is only a second name for the fitted results object,
# not a set of predictions — confirm whether `res.predict(X)` was intended.
pred = res
print(res.summary())


Optimization terminated successfully.
         Current function value: 0.551579
         Iterations 5
                              Logit Regression Results                              
Dep. Variable:     bad_sleep_outlier_binary   No. Observations:                   62
Model:                                Logit   Df Residuals:                       58
Method:                                 MLE   Df Model:                            3
Date:                      Sun, 16 Oct 2022   Pseudo R-squ.:                 0.08443
Time:                              14:57:04   Log-Likelihood:                -34.198
converged:                             True   LL-Null:                       -37.351
Covariance Type:                  nonrobust   LLR p-value:                   0.09760
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -1.0856      1.229     -0.883  

In [4]:
# Show floats in pandas output with a thousands separator and three decimals.
pd.options.display.float_format = '{:,.3f}'.format

def do_analysis(predictors, target, penalty='l2', cv=0, st=False, data=None):
    """Fit a logistic regression and print its accuracy and confusion-matrix metrics.

    Rows with a missing value in any of the selected columns are dropped
    before fitting.

    Parameters
    ----------
    predictors : list of str
        Names of the feature columns.
    target : list of str
        One-element list holding the name of the label column.
    penalty : str, default 'l2'
        Regularisation passed to ``LogisticRegression``. ``'l1'`` switches the
        solver to ``liblinear`` because the default ``lbfgs`` solver rejects it.
    cv : int, default 0
        Number of cross-validation folds. Values ``<= 0`` fit and score on the
        full data set (in-sample accuracy).
    st : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``; ignored unless ``cv > 0``.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``sleep_data``.

    Returns
    -------
    dict
        ``'confusion_matrix'`` plus either ``'accuracy'`` (no CV) or
        ``'macro_accuracy'`` and ``'micro_accuracy'`` (CV), so callers can
        tabulate runs instead of reading the printed report.
    """
    if data is None:
        data = sleep_data

    print(80*'-')
    print("Predictors:", predictors)
    print("Target:", target)
    print("Penalty:", penalty)

    df = data[predictors + target].dropna()
    X = df[predictors]
    y = df[target].values.ravel()

    # NOTE(review): the features are fed in unscaled although the L2 penalty is
    # scale-sensitive and StandardScaler is imported by the notebook — consider
    # a scaler + model pipeline.
    # lbfgs (the default solver) only supports 'l2'/no penalty; liblinear handles 'l1'.
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
    mod = LogisticRegression(random_state=0, penalty=penalty, solver=solver)

    if cv > 0:
        if st:
            kf = StratifiedKFold(n_splits=cv)
            print("Using stratified cross-validation")
        else:
            kf = KFold(n_splits=cv)
            print("Using regular cross-validation")

        # A single cross-validation pass yields the out-of-fold predictions;
        # the per-fold scores are derived from them instead of refitting every
        # fold a second time. The splitters are not shuffled, so kf.split()
        # reproduces exactly the folds used by cross_val_predict.
        ypred = cross_val_predict(mod, X, y, cv=kf)
        fold_scores = [accuracy_score(y[test_idx], ypred[test_idx])
                       for _, test_idx in kf.split(X, y)]

        # Micro: accuracy over all pooled out-of-fold predictions.
        # Macro: unweighted mean of the per-fold accuracies.
        micro_accuracy = accuracy_score(y, ypred)
        macro_accuracy = np.mean(fold_scores)
        print(f"Macro accuracy = {macro_accuracy:.3f}")
        print(f"Micro accuracy = {micro_accuracy:.3f}")
        summary = {'macro_accuracy': float(macro_accuracy),
                   'micro_accuracy': float(micro_accuracy)}
    else:
        # No cross-validation: fit and evaluate on the same rows.
        mod.fit(X, y)
        accuracy = mod.score(X, y)
        ypred = mod.predict(X)
        print(f"Accuracy = {accuracy:.3f}")
        summary = {'accuracy': float(accuracy)}

    cm = confusion_matrix(y, ypred)
    compute_metrics(cm)

    summary['confusion_matrix'] = cm
    return summary
    
# Evaluate every target / feature-set combination three ways:
# in-sample fit, plain 5-fold CV and stratified 5-fold CV.
target_options = [['bad_sleep_outlier_binary'], ['good_night']]
predictor_options = [
    ['awake_time', 'deep_sleep_time', 'rem_sleep_time'],
    ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'],
    ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep'],
]
evaluation_modes = ({}, {'cv': 5}, {'cv': 5, 'st': True})

for target in target_options:
    for predictors in predictor_options:
        for mode_kwargs in evaluation_modes:
            do_analysis(predictors, target, **mode_kwargs)


--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Accuracy = 0.758
Confusion matrix:
[[43  1]
 [14  4]]
Precision = 0.800, Recall = 0.222, F1 = 0.348
Sensitivity = 0.222, Specificity =  0.977
Balanced accuracy =  0.600
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using regular cross-validation
Macro accuracy = 0.713
Micro accuracy = 0.710
Confusion matrix:
[[41  3]
 [15  3]]
Precision = 0.500, Recall = 0.167, F1 = 0.250
Sensitivity = 0.167, Specificity =  0.932
Balanced accuracy =  0.549
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using stratified cro

  f1 = 2*(precision*recall)/(precision + recall)
  f1 = 2*(precision*recall)/(precision + recall)


Confusion matrix:
[[44  0]
 [15  3]]
Precision = 1.000, Recall = 0.167, F1 = 0.286
Sensitivity = 0.167, Specificity =  1.000
Balanced accuracy =  0.583
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using regular cross-validation
Macro accuracy = 0.728
Micro accuracy = 0.726
Confusion matrix:
[[43  1]
 [16  2]]
Precision = 0.667, Recall = 0.111, F1 = 0.190
Sensitivity = 0.111, Specificity =  0.977
Balanced accuracy =  0.544
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using stratified cross-validation
Macro accuracy = 0.709
Micro accuracy = 0.710
Confusion matrix:
[[43  1]
 [17  1]]
Precision = 0.500, Recall = 0.056, F1 = 0.100
Sensitivity = 0.056, Specificity

  precision = tp/(tp+fp)
  precision = tp/(tp+fp)
  precision = tp/(tp+fp)


Macro accuracy = 0.705
Micro accuracy = 0.710
Confusion matrix:
[[44  2]
 [16  0]]
Precision = 0.000, Recall = 0.000, F1 = nan
Sensitivity = 0.000, Specificity =  0.957
Balanced accuracy =  0.478
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage']
Target: ['good_night']
Penalty: l2
Using stratified cross-validation
Macro accuracy = 0.709
Micro accuracy = 0.710
Confusion matrix:
[[44  2]
 [16  0]]
Precision = 0.000, Recall = 0.000, F1 = nan
Sensitivity = 0.000, Specificity =  0.957
Balanced accuracy =  0.478
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep']
Target: ['good_night']
Penalty: l2
Accuracy = 0.710
Confusion matrix:
[[43  3]
 [15  1]]
Precision = 0.250, Recall = 0.062, F1 = 0.100
Sensitivity = 0.062, Specificity =  0.935
Balanced accuracy =  0.499
--------

  f1 = 2*(precision*recall)/(precision + recall)
  f1 = 2*(precision*recall)/(precision + recall)


Macro accuracy = 0.690
Micro accuracy = 0.694
Confusion matrix:
[[42  4]
 [15  1]]
Precision = 0.200, Recall = 0.062, F1 = 0.095
Sensitivity = 0.062, Specificity =  0.913
Balanced accuracy =  0.488
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep']
Target: ['good_night']
Penalty: l2
Using stratified cross-validation
Macro accuracy = 0.676
Micro accuracy = 0.677
Confusion matrix:
[[41  5]
 [15  1]]
Precision = 0.167, Recall = 0.062, F1 = 0.091
Sensitivity = 0.062, Specificity =  0.891
Balanced accuracy =  0.477
