# Modeling: Designing a classifier

Imports and definitions

In [8]:
import pandas as pd
import numpy as np
import glob, os
from datetime import timedelta
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, cross_validate, cross_val_predict

%matplotlib inline

# Figure defaults shared by every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (10, 5),
    'figure.dpi': 200,
})
#from src.utils import read_sleep_file
# The processed data set is looked up relative to the directory the kernel
# was started in, under out/processed_data.csv.
PWD = os.getcwd()
DATA_FILE = os.path.join(PWD, 'out', 'processed_data.csv')

Read data

In [9]:
# The processed file is semicolon-delimited; preview the first rows as a
# sanity check of the parsed columns.
sleep_data = pd.read_csv(DATA_FILE, delimiter=';')
sleep_data.head()

Unnamed: 0.1,Unnamed: 0,index,calendar_date,local_start_time,local_end_time,duration_in_seconds,awake_time,deep_sleep_time,rem_sleep_time,total_sleep_hours,...,awake_percentage,local_start_time_seconds,local_end_time_seconds,total_sleep_hours_round,perceived_sleep_quality,bad_sleep_outlier_binary,good_night,median_hr_sleep,median_stress_sleep,sleep_event_number
0,0,0.0,2022-04-06,2022-04-06 22:14:00,2022-04-07 07:33:00,33540.0,0.0,4200.0,6660.0,9.316667,...,0.0,1649283000.0,1649317000.0,9.0,4,0,1,46.0,12.0,1
1,1,0.0,2022-04-07,2022-04-08 03:31:00,2022-04-08 08:48:00,19020.0,300.0,4800.0,4440.0,5.283333,...,0.015773,1649389000.0,1649408000.0,5.0,3,0,0,54.0,18.0,2
2,2,0.0,2022-04-08,2022-04-08 22:33:00,2022-04-09 07:23:00,31800.0,120.0,5040.0,7380.0,8.833333,...,0.003774,1649457000.0,1649489000.0,8.0,5,0,1,53.0,19.0,3
3,3,0.0,2022-04-09,2022-04-09 23:31:00,2022-04-10 06:44:00,25980.0,420.0,4140.0,6180.0,7.216667,...,0.016166,1649547000.0,1649573000.0,7.0,4,0,1,51.0,15.0,4
4,4,0.0,2022-04-10,2022-04-11 02:24:00,2022-04-11 06:43:00,15540.0,1200.0,1680.0,2100.0,4.316667,...,0.07722,1649644000.0,1649659000.0,4.0,2,1,0,70.0,51.0,5


Fit a baseline logistic regression (statsmodels) predicting bad-sleep outliers from sleep-stage durations

In [10]:
# Baseline model: logistic regression of the bad-sleep flag on the three
# sleep-stage duration columns, fitted with statsmodels to get a full
# inference summary (coefficients, standard errors, p-values).
PREDICTORS = ['awake_time', 'deep_sleep_time', 'rem_sleep_time']
TARGET = ['bad_sleep_outlier_binary']

# Keep only the modelled columns and drop rows with a missing value in any
# of them.
df = sleep_data.loc[:, PREDICTORS + TARGET].dropna()
y = df[TARGET]
# add_constant supplies the intercept column (reported as `const`).
X = sm.add_constant(df[PREDICTORS])

mod = sm.Logit(y, X)
# `pred` is kept as a second name for the fitted results object.
pred = res = mod.fit()
print(res.summary())


Optimization terminated successfully.
         Current function value: 0.214740
         Iterations 9
                              Logit Regression Results                              
Dep. Variable:     bad_sleep_outlier_binary   No. Observations:                   41
Model:                                Logit   Df Residuals:                       37
Method:                                 MLE   Df Model:                            3
Date:                      Sun, 02 Oct 2022   Pseudo R-squ.:                  0.4209
Time:                              16:20:54   Log-Likelihood:                -8.8044
converged:                             True   LL-Null:                       -15.203
Covariance Type:                  nonrobust   LLR p-value:                  0.005098
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               5.1256      3.121      1.642  

In [19]:
def do_analysis(predictors, target, penalty='l2', cv=0, st=False):
    """Fit a logistic regression on `sleep_data` and report its accuracy.

    Reads the notebook-level `sleep_data` frame, keeps the `predictors` and
    `target` columns, drops rows with a missing value in any of them, and
    prints the accuracy followed by the confusion matrix.

    Parameters
    ----------
    predictors : list of str
        Feature column names.
    target : list of str
        List holding the label column name (it is concatenated with
        `predictors`, so it must be a list, not a bare string).
    penalty : str, default 'l2'
        Regularisation passed to `LogisticRegression`. 'l1' is fitted with
        the 'liblinear' solver because the default 'lbfgs' solver rejects it.
    cv : int, default 0
        Number of folds. With 0 the model is fitted and scored on the same
        rows (in-sample accuracy); with a positive value the accuracy is the
        mean of the per-fold test accuracies and the confusion matrix is
        built from the out-of-fold predictions.
    st : bool, default False
        Use `StratifiedKFold` instead of `KFold` when `cv > 0`.

    Returns
    -------
    tuple
        `(accuracy, cm)`: the accuracy that was printed (unrounded) and the
        confusion matrix.
    """
    df = sleep_data[predictors + target].dropna()
    X = df[predictors]
    y = df[target].values.ravel()

    message = f"Accuracy for {predictors} with penalty {penalty}"

    # The default 'lbfgs' solver only handles 'l2' (or no penalty); asking for
    # 'l1' would raise, so switch to a solver that supports it. Other penalties
    # (e.g. 'elasticnet') still need extra arguments and are left to
    # scikit-learn to validate.
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
    mod = LogisticRegression(random_state=0, penalty=penalty, solver=solver)

    if cv > 0:
        if st:
            kf = StratifiedKFold(n_splits=cv)
            message = message + " and stratified cross-validation"
        else:
            kf = KFold(n_splits=cv)
            message = message + " and cross-validation"

        # Fit each fold once: the splitter is unshuffled, so replaying
        # kf.split yields exactly the folds cross_val_predict used, and the
        # per-fold accuracy can be read off the out-of-fold predictions
        # instead of refitting every fold through cross_validate.
        ypred = cross_val_predict(mod, X, y, cv=kf)
        fold_scores = [
            accuracy_score(y[test_idx], ypred[test_idx])
            for _, test_idx in kf.split(X, y)
        ]
        accuracy = np.mean(fold_scores)
    else:
        # No cross-validation: in-sample fit and score.
        mod.fit(X, y)
        accuracy = mod.score(X, y)
        ypred = mod.predict(X)
    cm = confusion_matrix(y, ypred)

    message = message + f" is {np.round(accuracy, 3)}"

    print(message)
    print(cm)

    return accuracy, cm
    
# For every target / feature-set pair, report the in-sample accuracy, then
# 5-fold cross-validation, then stratified 5-fold cross-validation.
for target in (['bad_sleep_outlier_binary'], ['good_night']):
    for predictors in (
        ['awake_time', 'deep_sleep_time', 'rem_sleep_time'],
        ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'],
        ['total_sleep_hours_round', 'median_hr_sleep', 'median_stress_sleep'],
    ):
        print('-' * 30)
        print(target)

        for eval_options in ({}, {'cv': 5}, {'cv': 5, 'st': True}):
            do_analysis(predictors, target, **eval_options)


------------------------------
['bad_sleep_outlier_binary']
Accuracy for ['awake_time', 'deep_sleep_time', 'rem_sleep_time'] with penalty l2 is 0.854
[[34  2]
 [ 4  1]]
Accuracy for ['awake_time', 'deep_sleep_time', 'rem_sleep_time'] with penalty l2 and cross-validation is 0.808
[[32  4]
 [ 4  1]]
Accuracy for ['awake_time', 'deep_sleep_time', 'rem_sleep_time'] with penalty l2 and stratified cross-validation is 0.808
[[32  4]
 [ 4  1]]
------------------------------
['bad_sleep_outlier_binary']
Accuracy for ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'] with penalty l2 is 0.976
[[36  0]
 [ 1  4]]
Accuracy for ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'] with penalty l2 and cross-validation is 0.975
[[36  0]
 [ 1  4]]
Accuracy for ['total_sleep_hours_round', 'median_stress_sleep', 'awake_percentage'] with penalty l2 and stratified cross-validation is 0.975
[[36  0]
 [ 1  4]]
------------------------------
['bad_sleep_outlier_binary']
Acc