# Modeling: Designing and evaluating a classifier

Imports and definitions

In [1]:
import pandas as pd
import numpy as np
import glob, os
from datetime import timedelta
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, cross_validate, cross_val_predict

from src.utils import compute_metrics
%matplotlib inline

# Figure defaults applied to every plot in this notebook: 10x5 inch canvas at 200 dpi.
plt.rcParams.update({
    'figure.figsize': (10, 5),
    'figure.dpi': 200,
})

# The processed data set is looked up under ./out relative to the directory
# the kernel was started in.
PWD = os.getcwd()
DATA_FILE = os.path.join(PWD, 'out', 'processed_data.csv')

Read data

In [2]:
# The processed file is semicolon-separated.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which look
# like row indexes written out by an earlier to_csv call -- confirm and consider
# saving with index=False upstream.
sleep_data = pd.read_csv(DATA_FILE, sep=';')

# Preview the first five rows as a sanity check of the load.
sleep_data.head(5)

Unnamed: 0.1,Unnamed: 0,index,calendar_date,local_start_time,local_end_time,duration_in_seconds,awake_time,deep_sleep_time,rem_sleep_time,total_sleep_hours,...,awake_percentage,local_start_time_seconds,local_end_time_seconds,total_sleep_hours_round,perceived_sleep_quality,bad_sleep_outlier_binary,good_night,median_hr_sleep,median_stress_sleep,sleep_event_number
0,0,0.0,2022-07-10,2022-07-10 23:32:00,2022-07-11 08:10:00,31080.0,0.0,7980.0,4860.0,8.633333,...,0.0,1657496000.0,1657527000.0,8.0,4,0,1,46.0,8.0,1
1,1,0.0,2022-07-10,2022-07-10 23:38:00,2022-07-11 08:10:00,30720.0,0.0,6540.0,5700.0,8.533333,...,0.0,1657496000.0,1657527000.0,8.0,4,0,1,46.0,8.0,2
2,2,0.0,2022-07-11,2022-07-11 23:49:00,2022-07-12 07:51:00,28920.0,420.0,16920.0,0.0,8.033333,...,0.014523,1657583000.0,1657612000.0,8.0,3,0,0,49.0,16.0,3
3,3,0.0,2022-07-11,2022-07-11 23:52:00,2022-07-12 08:02:00,29400.0,120.0,5520.0,4860.0,8.166667,...,0.004082,1657584000.0,1657613000.0,8.0,3,0,0,49.0,16.0,4
4,4,0.0,2022-07-12,2022-07-12 23:17:00,2022-07-13 06:14:00,25020.0,300.0,12660.0,0.0,6.95,...,0.01199,1657668000.0,1657693000.0,6.0,2,1,0,45.0,10.0,5


Fit a baseline logistic regression with statsmodels

In [3]:
# Baseline model: regress the binary "bad sleep" flag on the three
# sleep-stage durations and print the statsmodels summary table.
PREDICTORS = ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
TARGET = ['bad_sleep_outlier_binary']

# Keep only the rows where every modelled column is present.
df = sleep_data[[*PREDICTORS, *TARGET]].dropna()
y = df[TARGET]

# add_constant prepends the 'const' column that shows up as the intercept
# in the summary below.
X = sm.add_constant(df[PREDICTORS])

mod = sm.Logit(y, X)
res = mod.fit()
pred = res  # second name bound to the same fitted-results object
print(res.summary())


Optimization terminated successfully.
         Current function value: 0.558012
         Iterations 6
                              Logit Regression Results                              
Dep. Variable:     bad_sleep_outlier_binary   No. Observations:                  159
Model:                                Logit   Df Residuals:                      155
Method:                                 MLE   Df Model:                            3
Date:                      Thu, 13 Oct 2022   Pseudo R-squ.:                 0.09653
Time:                              18:19:28   Log-Likelihood:                -88.724
converged:                             True   LL-Null:                       -98.204
Covariance Type:                  nonrobust   LLR p-value:                 0.0002787
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               1.0536      0.759      1.388  

In [4]:
# Render floats in DataFrame output with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

def do_analysis(predictors, target, penalty='l2', cv=0, st=False, data=None):
    """Fit a logistic regression and print accuracy and confusion-matrix metrics.

    Parameters
    ----------
    predictors : list of str
        Names of the feature columns.
    target : list of str
        One-element list with the name of the label column (it is concatenated
        with ``predictors`` to select columns, then flattened to a 1-D array).
    penalty : str, default 'l2'
        Passed unchanged to ``LogisticRegression(penalty=...)``.
    cv : int, default 0
        Number of cross-validation folds. With ``cv <= 0`` the model is fitted
        and evaluated on the same rows, so the reported accuracy is in-sample.
    st : bool, default False
        When ``cv > 0``, use ``StratifiedKFold`` instead of ``KFold``.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``sleep_data`` so
        existing calls behave as before.

    Returns
    -------
    None
        Results are printed. The confusion matrix is handed to the project
        helper ``compute_metrics``.
    """
    print(80*'-')
    print("Predictors:", predictors)
    print("Target:", target)
    print("Penalty:", penalty)

    if data is None:
        data = sleep_data

    # Rows with a missing value in any modelled column are dropped.
    df = data[predictors + target].dropna()
    X = df[predictors]
    y = df[target].values.ravel()

    mod = LogisticRegression(random_state=0, penalty=penalty)

    if cv > 0:
        if st:
            kf = StratifiedKFold(n_splits=cv)
            print("Using stratified cross-validation")
        else:
            kf = KFold(n_splits=cv)
            print("Using regular cross-validation")

        # A single cross-validation pass yields the out-of-fold predictions.
        # Both accuracy figures are derived from them, instead of refitting
        # every fold a second time just to obtain the per-fold scores.
        ypred = cross_val_predict(mod, X, y, cv=kf)

        # The splitters above are unshuffled, so split() reproduces exactly the
        # folds that cross_val_predict used.
        fold_scores = [accuracy_score(y[test_idx], ypred[test_idx])
                       for _, test_idx in kf.split(X, y)]

        # Micro: accuracy over all pooled out-of-fold predictions.
        # Macro: unweighted mean of the per-fold accuracies.
        micro_accuracy = accuracy_score(y, ypred)
        macro_accuracy = np.mean(fold_scores)
        print(f"Macro accuracy = {macro_accuracy:.3f}")
        print(f"Micro accuracy = {micro_accuracy:.3f}")
    else:
        # No cross-validation: fit and evaluate on the same rows.
        mod.fit(X, y)
        ypred = mod.predict(X)
        accuracy = accuracy_score(y, ypred)
        print(f"Accuracy = {accuracy:.3f}")

    cm = confusion_matrix(y, ypred)
    compute_metrics(cm)
    
# Every target is paired with every predictor set, and each pair is evaluated
# three ways: in-sample, 5-fold CV, and stratified 5-fold CV (in that order).
analysis_targets = [['bad_sleep_outlier_binary'], ['good_night']]
analysis_predictor_sets = [
    ['awake_time', 'deep_sleep_time', 'rem_sleep_time'],
    ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'],
    ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep'],
]
analysis_modes = [{}, {'cv': 5}, {'cv': 5, 'st': True}]

for target in analysis_targets:
    for predictors in analysis_predictor_sets:
        for mode in analysis_modes:
            do_analysis(predictors, target, **mode)


--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Accuracy = 0.730
Confusion matrix:
[[107   3]
 [ 40   9]]
Precision = 0.750, Recall = 0.184, F1 = 0.295
Sensitivity = 0.184, Specificity =  0.973
Balanced accuracy =  0.578
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using regular cross-validation
Macro accuracy = 0.712
Micro accuracy = 0.711
Confusion matrix:
[[106   4]
 [ 42   7]]
Precision = 0.636, Recall = 0.143, F1 = 0.233
Sensitivity = 0.143, Specificity =  0.964
Balanced accuracy =  0.553
--------------------------------------------------------------------------------
Predictors: ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
Target: ['bad_sleep_outlier_binary']
Penalty: l2
Using strati

  precision = tp/(tp+fp)
  f1 = 2*(precision*recall)/(precision + recall)


Confusion matrix:
[[118   1]
 [ 39   1]]
Precision = 0.500, Recall = 0.025, F1 = 0.048
Sensitivity = 0.025, Specificity =  0.992
Balanced accuracy =  0.508
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage']
Target: ['good_night']
Penalty: l2
Accuracy = 0.761
Confusion matrix:
[[118   1]
 [ 37   3]]
Precision = 0.750, Recall = 0.075, F1 = 0.136
Sensitivity = 0.075, Specificity =  0.992
Balanced accuracy =  0.533
--------------------------------------------------------------------------------
Predictors: ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage']
Target: ['good_night']
Penalty: l2
Using regular cross-validation
Macro accuracy = 0.710
Micro accuracy = 0.711
Confusion matrix:
[[113   6]
 [ 40   0]]
Precision = 0.000, Recall = 0.000, F1 = nan
Sensitivity = 0.000, Specificity =  0.950
Balanced accuracy =  0.475
------------------------------------------