TODO:
- Add confusion matrices and other statistics to appendix
- Add related work

# Table of Contents

uses the most recent Fourier data from EJ (`sc-agg-f16.npz`) to classify the sleep state for each time interval. The models used are:

- a Kalman filter on the Fourier data, then using softmax classification on the hidden states
- a softmax classifier on the original Fourier data

The approaches are:

1. **done**: train Kalman and softmax on one person, training and validating softmax on random time segments
2. **done**: train Kalman and softmax on one person, then run Kalman and softmax on other person
3. repeat, training and testing on multiple people
4. **done**: train softmax on one person, training and validating on random time segments
5. **done**: train softmax on one person, then run softmax on other person
6. repeat, training and testing on multiple people
7. train Kalman and softmax on one person<br>
on new person, filter on first $n$ states to get hidden state $n+1$<br>
sample hidden states and observations given hidden state $n+1$<br>
classify sampled hidden states and classify sampled observations
8. repeat, training and testing on multiple people

In [2]:
from matplotlib import pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from pykalman import KalmanFilter

π = np.pi

# Load data

In [23]:
data_path = 'sc-agg-f16.npz'

data = np.load(data_path)

data.files

['labels',
 'SC4711',
 'SC4712',
 'SC4401',
 'SC4402',
 'SC4661',
 'SC4662',
 'SC4371',
 'SC4372',
 'SC4231',
 'SC4232',
 'SC4571',
 'SC4572',
 'SC4001',
 'SC4002',
 'SC4431',
 'SC4432',
 'SC4511',
 'SC4512',
 'SC4561',
 'SC4562',
 'SC4141',
 'SC4142',
 'SC4731',
 'SC4732',
 'SC4161',
 'SC4162',
 'SC4771',
 'SC4772',
 'SC4501',
 'SC4502',
 'SC4491',
 'SC4492',
 'SC4021',
 'SC4022',
 'SC4271',
 'SC4272',
 'SC4081',
 'SC4082',
 'SC4481',
 'SC4482',
 'SC4531',
 'SC4532',
 'SC4101',
 'SC4102',
 'SC4041',
 'SC4042',
 'SC4541',
 'SC4542',
 'SC4111',
 'SC4112',
 'SC4421',
 'SC4422',
 'SC4121',
 'SC4122',
 'SC4281',
 'SC4282',
 'SC4581',
 'SC4582',
 'SC4741',
 'SC4742',
 'SC4641',
 'SC4642',
 'SC4191',
 'SC4192',
 'SC4591',
 'SC4592',
 'SC4291',
 'SC4292',
 'SC4381',
 'SC4382',
 'SC4611',
 'SC4612',
 'SC4351',
 'SC4352',
 'SC4331',
 'SC4332',
 'SC4411',
 'SC4412',
 'SC4051',
 'SC4052',
 'SC4451',
 'SC4452',
 'SC4551',
 'SC4552',
 'SC4071',
 'SC4072',
 'SC4761',
 'SC4762',
 'SC4522',
 'SC4171',

In [4]:
data['labels']

array(['EEG FPZ-CZ-2.5hz', 'EEG FPZ-CZ-5.0hz', 'EEG FPZ-CZ-7.5hz',
       'EEG FPZ-CZ-10.0hz', 'EEG FPZ-CZ-12.5hz', 'EEG FPZ-CZ-15.0hz',
       'EEG FPZ-CZ-17.5hz', 'EEG FPZ-CZ-20.0hz', 'EEG FPZ-CZ-22.5hz',
       'EEG FPZ-CZ-25.0hz', 'EEG FPZ-CZ-27.5hz', 'EEG FPZ-CZ-30.0hz',
       'EEG FPZ-CZ-32.5hz', 'EEG FPZ-CZ-35.0hz', 'EEG FPZ-CZ-37.5hz',
       'EEG FPZ-CZ-40.0hz', 'EEG FPZ-CZ-42.5hz', 'EEG FPZ-CZ-45.0hz',
       'EEG FPZ-CZ-47.5hz', 'EEG FPZ-CZ-50.0hz', 'EEG PZ-OZ-2.5hz',
       'EEG PZ-OZ-5.0hz', 'EEG PZ-OZ-7.5hz', 'EEG PZ-OZ-10.0hz',
       'EEG PZ-OZ-12.5hz', 'EEG PZ-OZ-15.0hz', 'EEG PZ-OZ-17.5hz',
       'EEG PZ-OZ-20.0hz', 'EEG PZ-OZ-22.5hz', 'EEG PZ-OZ-25.0hz',
       'EEG PZ-OZ-27.5hz', 'EEG PZ-OZ-30.0hz', 'EEG PZ-OZ-32.5hz',
       'EEG PZ-OZ-35.0hz', 'EEG PZ-OZ-37.5hz', 'EEG PZ-OZ-40.0hz',
       'EEG PZ-OZ-42.5hz', 'EEG PZ-OZ-45.0hz', 'EEG PZ-OZ-47.5hz',
       'EEG PZ-OZ-50.0hz', 'EOG HORIZONTAL-2.5hz', 'EOG HORIZONTAL-5.0hz',
       'EOG HORIZONTAL-7.5hz', 'EOG HORI

In [5]:
data['train_patients']

array(['SC4241', 'SC4242', 'SC4001', 'SC4002', 'SC4271', 'SC4272',
       'SC4761', 'SC4762', 'SC4021', 'SC4022', 'SC4341', 'SC4342',
       'SC4011', 'SC4012', 'SC4411', 'SC4412', 'SC4281', 'SC4282',
       'SC4431', 'SC4432', 'SC4041', 'SC4042', 'SC4081', 'SC4082',
       'SC4731', 'SC4732', 'SC4631', 'SC4632', 'SC4121', 'SC4122',
       'SC4051', 'SC4052', 'SC4061', 'SC4062', 'SC4031', 'SC4032',
       'SC4091', 'SC4092', 'SC4771', 'SC4772', 'SC4211', 'SC4212',
       'SC4451', 'SC4452', 'SC4251', 'SC4252', 'SC4111', 'SC4112',
       'SC4371', 'SC4372', 'SC4171', 'SC4172', 'SC4561', 'SC4562',
       'SC4471', 'SC4472', 'SC4581', 'SC4582', 'SC4741', 'SC4742',
       'SC4231', 'SC4232', 'SC4522', 'SC4751', 'SC4752', 'SC4651',
       'SC4652', 'SC4611', 'SC4612', 'SC4151', 'SC4152', 'SC4321',
       'SC4322', 'SC4701', 'SC4702'], dtype='<U6')

# Softmax classification on Kalman filter hidden states

## 1. train Kalman and softmax on one person, training and validating softmax on random time segments

In [89]:
patient = data['train_patients'][0]
print(patient)
X, y = data[patient][:, :-1], data[patient][:, -1]

num_features = X.shape[1]

X.shape

SC4241


(2702, 64)

In [91]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
# dim_x doesn't necessarily need to be the same as num_stages
num_stages = len(sleep_stages)
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z)

# Fit to fourier data with expectation maximization
kf.em(X)

<pykalman.standard.KalmanFilter at 0x7f4e7bb63280>

In [92]:
# Predict the states for the fourier data
states, _ = kf.smooth(X)

In [96]:
# Train-validation split
val_size = 0.3
X_train, X_val, y_train, y_val = \
  train_test_split(states, y, test_size=val_size, random_state=42)

In [97]:
# Transform data to have mean zero and standard deviation one
# (otherwise, LogisticRegression says it doesn't converge though performs about the same)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [98]:
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

In [100]:
train_pred = model.predict(X_train)

print('training report')
print(classification_report(y_train, train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train, train_pred, normalize='true'), 2))

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1314
         1.0       0.10      0.02      0.03        59
         2.0       0.82      0.95      0.88       360
         3.0       0.78      0.26      0.39        27
         5.0       0.82      0.88      0.85       131

    accuracy                           0.93      1891
   macro avg       0.70      0.62      0.63      1891
weighted avg       0.91      0.93      0.92      1891

proportions in classes
[0.69487044 0.03120042 0.19037546 0.01427816 0.06927552]

confusion matrix
[[0.99 0.   0.   0.   0.  ]
 [0.17 0.02 0.68 0.   0.14]
 [0.01 0.   0.95 0.01 0.04]
 [0.   0.   0.74 0.26 0.  ]
 [0.   0.02 0.1  0.   0.88]]


In [101]:
val_pred = model.predict(X_val)

print('validation report')
print(classification_report(y_val, val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val, return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val, val_pred, normalize='true'), 2))

validation report
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99       559
         1.0       0.00      0.00      0.00        26
         2.0       0.81      0.98      0.89       156
         3.0       1.00      0.24      0.38        17
         5.0       0.83      0.94      0.88        53

    accuracy                           0.93       811
   macro avg       0.73      0.63      0.63       811
weighted avg       0.91      0.93      0.92       811

proportions in classes
[0.6892725  0.03205919 0.19235512 0.02096178 0.06535142]

confusion matrix
[[0.98 0.   0.01 0.   0.  ]
 [0.19 0.   0.62 0.   0.19]
 [0.   0.   0.98 0.   0.02]
 [0.   0.   0.76 0.24 0.  ]
 [0.02 0.   0.04 0.   0.94]]


## 2. train Kalman and softmax on one person, then run Kalman and softmax on other person

In [117]:
train_patient = data['train_patients'][0]
print('train patient:', train_patient)
X_train, y_train = data[train_patient][:, :-1], data[train_patient][:, -1]

val_patient = data['validate_patients'][0]
print('val patient:', val_patient)
X_val, y_val = data[val_patient][:, :-1], data[val_patient][:, -1]

num_features = X_train.shape[1]

X_train.shape, X_val.shape

train patient: SC4241
val patient: SC4261


((2702, 64), (2800, 64))

In [111]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
num_stages = len(sleep_stages)

# dim_x doesn't necessarily need to be the same as num_stages
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z)

# Fit to fourier data with expectation maximization
kf.em(X_train)

<pykalman.standard.KalmanFilter at 0x7f4e7bb62dd0>

In [118]:
# Predict the states for the fourier data
X_train_states, _ = kf.smooth(X_train)
X_val_states, _ = kf.smooth(X_val)

In [124]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
X_train_states = scaler.transform(X_train)
X_val_states = scaler.transform(X_val)

In [125]:
model = LogisticRegression(max_iter=200)
model.fit(X_train_states, y_train)

In [126]:
train_pred = model.predict(X_train_states)

print('training report')
print(classification_report(y_train, train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train, train_pred, normalize='true'), 2))

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1873
         1.0       0.17      0.04      0.06        85
         2.0       0.84      0.95      0.89       516
         3.0       0.81      0.50      0.62        44
         5.0       0.82      0.90      0.86       184

    accuracy                           0.94      2702
   macro avg       0.73      0.68      0.68      2702
weighted avg       0.92      0.94      0.93      2702

proportions in classes
[0.69319023 0.03145818 0.19096965 0.01628423 0.06809771]

confusion matrix
[[0.99 0.01 0.   0.   0.  ]
 [0.19 0.04 0.62 0.   0.15]
 [0.01 0.   0.95 0.01 0.03]
 [0.   0.   0.5  0.5  0.  ]
 [0.01 0.02 0.08 0.   0.9 ]]


In [127]:
val_pred = model.predict(X_val_states)

print('validation report')
print(classification_report(y_val, val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val, return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val, val_pred, normalize='true'), 2))

validation report
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00      1830
         1.0        nan      0.00      0.00       245
         2.0       0.00      0.00      0.00       405
         3.0       0.05      1.00      0.10        85
         4.0        nan      0.00      0.00         9
         5.0       0.00      0.00      0.00       226

    accuracy                           0.03      2800
   macro avg       0.26      0.17      0.02      2800
weighted avg       0.72      0.03      0.00      2800

proportions in classes
[0.65357143 0.0875     0.14464286 0.03035714 0.00321429 0.08071429]

confusion matrix
[[0.   0.   0.01 0.35 0.   0.64]
 [0.   0.   0.   0.98 0.   0.02]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]]


# Softmax classification on Fourier data

## 4. train softmax on one person, training and validating on random time segments

In [84]:
patient = data['train_patients'][0]
print(patient)
X, y = data[patient][:, :-1], data[patient][:, -1]
X.shape

SC4241


(2702, 64)

In [128]:
# Train-validation split
val_size = 0.3
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=42)

# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
train_X = scaler.transform(X_train)
val_X = scaler.transform(X_val)

X_train.shape, X_val.shape

((1891, 64), (811, 64))

In [129]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [130]:
train_pred = model.predict(X_train)

print('training report')
print(classification_report(y_train, train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train, train_pred, normalize='true'), 2))

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1314
         1.0       0.58      0.25      0.35        59
         2.0       0.85      0.91      0.88       360
         3.0       0.70      0.52      0.60        27
         5.0       0.81      0.85      0.83       131

    accuracy                           0.94      1891
   macro avg       0.78      0.71      0.73      1891
weighted avg       0.93      0.94      0.93      1891

proportions in classes
[0.69487044 0.03120042 0.19037546 0.01427816 0.06927552]

confusion matrix
[[0.99 0.   0.   0.   0.  ]
 [0.1  0.25 0.51 0.   0.14]
 [0.01 0.02 0.91 0.02 0.04]
 [0.   0.   0.48 0.52 0.  ]
 [0.03 0.01 0.11 0.   0.85]]


In [131]:
val_pred = model.predict(X_val)

print('validation report')
print(classification_report(y_val, val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val, return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val, val_pred, normalize='true'), 2))

validation report
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       559
         1.0       0.44      0.15      0.23        26
         2.0       0.84      0.93      0.88       156
         3.0       0.74      0.82      0.78        17
         5.0       0.73      0.72      0.72        53

    accuracy                           0.93       811
   macro avg       0.75      0.72      0.72       811
weighted avg       0.92      0.93      0.92       811

proportions in classes
[0.6892725  0.03205919 0.19235512 0.02096178 0.06535142]

confusion matrix
[[0.98 0.01 0.   0.   0.  ]
 [0.19 0.15 0.42 0.   0.23]
 [0.01 0.   0.93 0.02 0.04]
 [0.   0.   0.18 0.82 0.  ]
 [0.06 0.   0.23 0.   0.72]]


## 5. train softmax on one person, then run softmax on other person

In [132]:
train_patient = data['train_patients'][0]
print('train patient:', train_patient)
X_train, y_train = data[train_patient][:, :-1], data[train_patient][:, -1]

val_patient = data['validate_patients'][0]
print('val patient:', val_patient)
X_val, y_val = data[val_patient][:, :-1], data[val_patient][:, -1]

X_train.shape, X_val.shape

train patient: SC4241
val patient: SC4261


((2702, 64), (2800, 64))

In [133]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
train_X = scaler.transform(X_train)
val_X = scaler.transform(X_val)

X_train.shape, X_val.shape

((2702, 64), (2800, 64))

In [134]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [135]:
train_pred = model.predict(X_train)

print('training report')
print(classification_report(y_train, train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train, train_pred, normalize='true'), 2))

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1873
         1.0       0.57      0.25      0.34        85
         2.0       0.86      0.92      0.89       516
         3.0       0.74      0.64      0.68        44
         5.0       0.80      0.83      0.81       184

    accuracy                           0.94      2702
   macro avg       0.79      0.72      0.74      2702
weighted avg       0.93      0.94      0.93      2702

proportions in classes
[0.69319023 0.03145818 0.19096965 0.01628423 0.06809771]

confusion matrix
[[0.99 0.   0.   0.   0.  ]
 [0.15 0.25 0.45 0.   0.15]
 [0.01 0.01 0.92 0.02 0.04]
 [0.   0.   0.36 0.64 0.  ]
 [0.03 0.02 0.12 0.   0.83]]


In [136]:
val_pred = model.predict(X_val)

print('validation report')
print(classification_report(y_val, val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val, return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val, val_pred, normalize='true'), 2))

validation report
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95      1830
         1.0       0.26      0.27      0.26       245
         2.0       0.55      0.93      0.69       405
         3.0       0.03      0.01      0.02        85
         4.0        nan      0.00      0.00         9
         5.0       0.82      0.30      0.44       226

    accuracy                           0.79      2800
   macro avg       0.53      0.41      0.39      2800
weighted avg       0.81      0.79      0.78      2800

proportions in classes
[0.65357143 0.0875     0.14464286 0.03035714 0.00321429 0.08071429]

confusion matrix
[[0.93 0.04 0.03 0.   0.   0.  ]
 [0.18 0.27 0.48 0.04 0.   0.03]
 [0.   0.04 0.93 0.02 0.   0.01]
 [0.   0.01 0.98 0.01 0.   0.  ]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.47 0.16 0.06 0.   0.3 ]]
