TODO:
- Add confusion matrices and other statistics to appendix
- Add related work
- 

# Table of Contents

uses the most recent Fourier data from EJ (`sc-agg-f16.npz`) to classify the sleep state for each time interval. The models used are:

- a Kalman filter on the Fourier data, then using softmax classification on the hidden states
- a softmax classifier on the original Fourier data

The approaches are:

1. **done**: train Kalman and softmax on one person, training and validating softmax on random time segments
2. **done**: train Kalman and softmax on one person, then run Kalman and softmax on other person
3. **done**: repeat, training and testing on multiple people
4. **done**: train softmax on one person, training and validating on random time segments
5. **done**: train softmax on one person, then run softmax on other person
6. repeat, training and testing on multiple people
7. train Kalman and softmax on one person<br>
on new person, filter on first $n$ observations to get hidden state $n+1$<br>
sample hidden states and observations given hidden state $n+1$<br>
classify sampled hidden states and classify sampled observations
8. repeat, training and testing on multiple people
9. teacher forcing? sample next state instead of whole sequence?
10. for one person: train on one night; validate on next night<br>
compare likelihood of true observations (train night) to likelihood of predicted observations (val night)

In [4]:
from matplotlib import pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, \
  adjusted_rand_score, adjusted_mutual_info_score, fowlkes_mallows_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from pykalman import KalmanFilter

π = np.pi

In [217]:
def get_scores(y, pred):
  """Get same scores as other methods used in the project."""
  names = [
    "Fowlkes-Meadows",
    "Homogeneity",
    "Completeness",
    "V Measure",
    "Adjusted Rand",
    "Adjusted Mutual",
    "Accuracy"
  ]

  scores = [
    fowlkes_mallows_score(y, pred),
    homogeneity_score(y, pred),
    completeness_score(y, pred),
    v_measure_score(y, pred),
    adjusted_rand_score(y, pred),
    adjusted_mutual_info_score(y, pred),
    accuracy_score(y, pred)
  ]
  
  formatted = ""
  num_places = 4
  for name, score in zip(names, scores):
    formatted += f" & {str(round(score, num_places)).lstrip('0')}"
    print(f"{name}: {round(score, num_places)}")

  return formatted

def report(y, pred, nan_mask, which_report: str):
  print(f'{which_report} report')
  print(classification_report(y[~nan_mask], pred, zero_division=np.nan))

  print('proportions in classes')
  print(np.unique(y[~nan_mask], return_counts=True)[1] / len(pred))
  print()

  print('confusion matrix')
  print(np.round(confusion_matrix(y[~nan_mask], pred, normalize='true'), 2))
  print()

  return get_scores(y[~nan_mask], pred)

# Load data

In [6]:
data_path = 'sc-agg-f16.npz'

data = np.load(data_path)

data.files

['labels',
 'SC4711',
 'SC4712',
 'SC4401',
 'SC4402',
 'SC4661',
 'SC4662',
 'SC4371',
 'SC4372',
 'SC4231',
 'SC4232',
 'SC4571',
 'SC4572',
 'SC4001',
 'SC4002',
 'SC4431',
 'SC4432',
 'SC4511',
 'SC4512',
 'SC4561',
 'SC4562',
 'SC4141',
 'SC4142',
 'SC4731',
 'SC4732',
 'SC4161',
 'SC4162',
 'SC4771',
 'SC4772',
 'SC4501',
 'SC4502',
 'SC4491',
 'SC4492',
 'SC4021',
 'SC4022',
 'SC4271',
 'SC4272',
 'SC4081',
 'SC4082',
 'SC4481',
 'SC4482',
 'SC4531',
 'SC4532',
 'SC4101',
 'SC4102',
 'SC4041',
 'SC4042',
 'SC4541',
 'SC4542',
 'SC4111',
 'SC4112',
 'SC4421',
 'SC4422',
 'SC4121',
 'SC4122',
 'SC4281',
 'SC4282',
 'SC4581',
 'SC4582',
 'SC4741',
 'SC4742',
 'SC4641',
 'SC4642',
 'SC4191',
 'SC4192',
 'SC4591',
 'SC4592',
 'SC4291',
 'SC4292',
 'SC4381',
 'SC4382',
 'SC4611',
 'SC4612',
 'SC4351',
 'SC4352',
 'SC4331',
 'SC4332',
 'SC4411',
 'SC4412',
 'SC4051',
 'SC4052',
 'SC4451',
 'SC4452',
 'SC4551',
 'SC4552',
 'SC4071',
 'SC4072',
 'SC4761',
 'SC4762',
 'SC4522',
 'SC4171',

In [7]:
data['labels']

array(['EEG FPZ-CZ-2.5hz', 'EEG FPZ-CZ-5.0hz', 'EEG FPZ-CZ-7.5hz',
       'EEG FPZ-CZ-10.0hz', 'EEG FPZ-CZ-12.5hz', 'EEG FPZ-CZ-15.0hz',
       'EEG FPZ-CZ-17.5hz', 'EEG FPZ-CZ-20.0hz', 'EEG FPZ-CZ-22.5hz',
       'EEG FPZ-CZ-25.0hz', 'EEG FPZ-CZ-27.5hz', 'EEG FPZ-CZ-30.0hz',
       'EEG FPZ-CZ-32.5hz', 'EEG FPZ-CZ-35.0hz', 'EEG FPZ-CZ-37.5hz',
       'EEG FPZ-CZ-40.0hz', 'EEG FPZ-CZ-42.5hz', 'EEG FPZ-CZ-45.0hz',
       'EEG FPZ-CZ-47.5hz', 'EEG FPZ-CZ-50.0hz', 'EEG PZ-OZ-2.5hz',
       'EEG PZ-OZ-5.0hz', 'EEG PZ-OZ-7.5hz', 'EEG PZ-OZ-10.0hz',
       'EEG PZ-OZ-12.5hz', 'EEG PZ-OZ-15.0hz', 'EEG PZ-OZ-17.5hz',
       'EEG PZ-OZ-20.0hz', 'EEG PZ-OZ-22.5hz', 'EEG PZ-OZ-25.0hz',
       'EEG PZ-OZ-27.5hz', 'EEG PZ-OZ-30.0hz', 'EEG PZ-OZ-32.5hz',
       'EEG PZ-OZ-35.0hz', 'EEG PZ-OZ-37.5hz', 'EEG PZ-OZ-40.0hz',
       'EEG PZ-OZ-42.5hz', 'EEG PZ-OZ-45.0hz', 'EEG PZ-OZ-47.5hz',
       'EEG PZ-OZ-50.0hz', 'EOG HORIZONTAL-2.5hz', 'EOG HORIZONTAL-5.0hz',
       'EOG HORIZONTAL-7.5hz', 'EOG HORI

In [8]:
data['train_patients']

array(['SC4241', 'SC4242', 'SC4001', 'SC4002', 'SC4271', 'SC4272',
       'SC4761', 'SC4762', 'SC4021', 'SC4022', 'SC4341', 'SC4342',
       'SC4011', 'SC4012', 'SC4411', 'SC4412', 'SC4281', 'SC4282',
       'SC4431', 'SC4432', 'SC4041', 'SC4042', 'SC4081', 'SC4082',
       'SC4731', 'SC4732', 'SC4631', 'SC4632', 'SC4121', 'SC4122',
       'SC4051', 'SC4052', 'SC4061', 'SC4062', 'SC4031', 'SC4032',
       'SC4091', 'SC4092', 'SC4771', 'SC4772', 'SC4211', 'SC4212',
       'SC4451', 'SC4452', 'SC4251', 'SC4252', 'SC4111', 'SC4112',
       'SC4371', 'SC4372', 'SC4171', 'SC4172', 'SC4561', 'SC4562',
       'SC4471', 'SC4472', 'SC4581', 'SC4582', 'SC4741', 'SC4742',
       'SC4231', 'SC4232', 'SC4522', 'SC4751', 'SC4752', 'SC4651',
       'SC4652', 'SC4611', 'SC4612', 'SC4151', 'SC4152', 'SC4321',
       'SC4322', 'SC4701', 'SC4702'], dtype='<U6')

# Softmax classification on Kalman filter hidden states

In [48]:
patient0 = data['train_patients'][0]
print(patient0)
X0, y0 = data[patient0][:, :-1], data[patient0][:, -1]

num_features = X0.shape[1]

patient1 = data['train_patients'][1]
print(patient1)
X1, y1 = data[patient1][:, :-1], data[patient1][:, -1]

X0.shape

SC4241
SC4242


(2702, 64)

In [65]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
# dim_x doesn't necessarily need to be the same as num_stages
num_stages = len(sleep_stages)
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z, em_vars="all")

# Fit to fourier data with expectation maximization
kf.em(X0, n_iter=10)
kf.loglikelihood(X0)

-515327.62069814047

In [66]:
# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z)

# Fit to fourier data with expectation maximization
kf.em(X0, n_iter=10)
kf.loglikelihood(X0)

-535863.4248915685

## 1. train Kalman and softmax on one person, training and validating softmax on random time segments

In [67]:
patient = data['train_patients'][0]
print(patient)
X, y = data[patient][:, :-1], data[patient][:, -1]

num_features = X.shape[1]

X.shape

SC4241


(2702, 64)

In [68]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
# dim_x doesn't necessarily need to be the same as num_stages
num_stages = len(sleep_stages)
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z, em_vars="all")

# Fit to fourier data with expectation maximization
kf.em(X)

<pykalman.standard.KalmanFilter at 0x7fd38b147880>

In [69]:
# Predict the states for the fourier data
states, _ = kf.smooth(X)

In [70]:
# Train-validation split
val_size = 0.3
X_train, X_val, y_train, y_val = \
  train_test_split(states, y, test_size=val_size, random_state=42)

In [71]:
# Transform data to have mean zero and standard deviation one
# (otherwise, LogisticRegression says it doesn't converge though performs about the same)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [72]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression(max_iter=200)
model.fit(X_train[~train_nan_mask], y_train[~train_nan_mask])

In [73]:
train_pred = model.predict(X_train[~train_nan_mask])

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1314
         1.0       0.60      0.05      0.09        59
         2.0       0.81      0.94      0.87       360
         3.0       0.33      0.04      0.07        27
         5.0       0.74      0.86      0.80       131

    accuracy                           0.93      1891
   macro avg       0.70      0.58      0.56      1891
weighted avg       0.92      0.93      0.91      1891

proportions in classes
[0.69487044 0.03120042 0.19037546 0.01427816 0.06927552]

confusion matrix
[[0.99 0.   0.   0.   0.01]
 [0.14 0.05 0.59 0.   0.22]
 [0.01 0.   0.94 0.01 0.04]
 [0.   0.   0.96 0.04 0.  ]
 [0.   0.   0.14 0.   0.86]]

Fowlkes-Meadows: 0.9592
Homogeneity: 0.7115
CompletenessV Measure: 0.8037
Adjusted Rand: 0.7548
Adjusted Mutual: 0.9131
Accuracy: 0.7537


' & .9592 & .7115 & .8037 & .7548 & .9131 & .7537'

In [74]:
val_pred = model.predict(X_val[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99       559
         1.0        nan      0.00      0.00        26
         2.0       0.82      0.96      0.88       156
         3.0        nan      0.00      0.00        17
         5.0       0.76      0.94      0.84        53

    accuracy                           0.93       811
   macro avg       0.85      0.58      0.54       811
weighted avg       0.93      0.93      0.91       811

proportions in classes
[0.6892725  0.03205919 0.19235512 0.02096178 0.06535142]

confusion matrix
[[0.99 0.   0.   0.   0.01]
 [0.31 0.   0.5  0.   0.19]
 [0.   0.   0.96 0.   0.04]
 [0.   0.   1.   0.   0.  ]
 [0.02 0.   0.04 0.   0.94]]

Fowlkes-Meadows: 0.9561
Homogeneity: 0.7102
CompletenessV Measure: 0.8449
Adjusted Rand: 0.7717
Adjusted Mutual: 0.9067
Accuracy: 0.7703


' & .9561 & .7102 & .8449 & .7717 & .9067 & .7703'

## 2. train Kalman and softmax on one person, then run Kalman and softmax on other person

In [128]:
train_patient = data['train_patients'][0]
print('train patient:', train_patient)
X_train, y_train = data[train_patient][:, :-1], data[train_patient][:, -1]

val_patient = data['validate_patients'][0]
print('val patient:', val_patient)
X_val, y_val = data[val_patient][:, :-1], data[val_patient][:, -1]

num_features = X_train.shape[1]

X_train.shape, X_val.shape

train patient: SC4241
val patient: SC4261


((2702, 64), (2800, 64))

In [135]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
num_stages = len(sleep_stages)

# dim_x doesn't necessarily need to be the same as num_stages
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z, em_vars="all")

# Fit to fourier data with expectation maximization
kf.em(X_train)

<pykalman.standard.KalmanFilter at 0x7fd38b145a20>

In [136]:
# Predict the states for the fourier data
X_train_states, _ = kf.smooth(X_train)
X_val_states, _ = kf.smooth(X_val)

In [137]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train_states)
X_train_states = scaler.transform(X_train_states)
X_val_states = scaler.transform(X_val_states)

In [138]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression(max_iter=200)
model.fit(X_train_states[~train_nan_mask], y_train[~train_nan_mask])

In [139]:
train_pred = model.predict(X_train_states[~train_nan_mask])

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1873
         1.0       0.71      0.06      0.11        85
         2.0       0.84      0.93      0.88       516
         3.0       0.61      0.32      0.42        44
         5.0       0.76      0.90      0.83       184

    accuracy                           0.93      2702
   macro avg       0.78      0.64      0.64      2702
weighted avg       0.93      0.93      0.92      2702

proportions in classes
[0.69319023 0.03145818 0.19096965 0.01628423 0.06809771]

confusion matrix
[[0.99 0.   0.   0.   0.01]
 [0.2  0.06 0.53 0.   0.21]
 [0.01 0.   0.93 0.02 0.04]
 [0.   0.   0.68 0.32 0.  ]
 [0.01 0.   0.09 0.   0.9 ]]

Fowlkes-Meadows: 0.9612
Homogeneity: 0.726
CompletenessV Measure: 0.8016
Adjusted Rand: 0.7619
Adjusted Mutual: 0.9176
Accuracy: 0.7612


' & .9612 & .726 & .8016 & .7619 & .9176 & .7612'

In [140]:
val_pred = model.predict(X_val_states[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       1.00      0.07      0.13      1830
         1.0        nan      0.00      0.00       245
         2.0       0.24      1.00      0.38       405
         3.0       0.00      0.00      0.00        85
         4.0        nan      0.00      0.00         9
         5.0       0.00      0.00      0.00       226

    accuracy                           0.19      2800
   macro avg       0.31      0.18      0.09      2800
weighted avg       0.76      0.19      0.14      2800

proportions in classes
[0.65357143 0.0875     0.14464286 0.03035714 0.00321429 0.08071429]

confusion matrix
[[0.07 0.   0.42 0.02 0.   0.5 ]
 [0.   0.   0.89 0.   0.   0.11]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.  ]]

Fowlkes-Meadows: 0.4568
Homogeneity: 0.1782
CompletenessV Measure: 0.2288
Adjusted Rand: 0.2003
Adjusted Mutual: -0.0328
Accuracy:

' & .4568 & .1782 & .2288 & .2003 & -0.0328 & .1981'

## 3. train Kalman and softmax on multiple people, then run Kalman and softmax on other people

In [168]:
rng = np.random.default_rng(42)

num_train, num_val = 5, 5

print('number of train patients:', len(data['train_patients']))
train_patients = rng.choice(data['train_patients'], num_train, replace=False)
print('train patients:', train_patients)
X_trains, y_train = [data[train_patient][:, :-1] for train_patient in train_patients], \
  np.concatenate([data[train_patient][:, -1] for train_patient in train_patients])
print()

print('number of val patients:', len(data['validate_patients']))
val_patients = rng.choice(data['validate_patients'], num_val, replace=False)
print('val patients:', val_patients)
X_vals, y_val = [data[val_patient][:, :-1] for val_patient in val_patients], \
  np.concatenate([data[val_patient][:, -1] for val_patient in val_patients])

num_features = X_trains[0].shape[1]

number of train patients: 75
train patients: ['SC4761' 'SC4741' 'SC4372' 'SC4061' 'SC4061']

number of val patients: 31
val patients: ['SC4802' 'SC4491' 'SC4461' 'SC4301' 'SC4491']


In [169]:
num_expectation_maximization_cycles = 3

sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
num_stages = len(sleep_stages)

# dim_x doesn't necessarily need to be the same as num_stages
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z, em_vars="all")

rng = np.random.default_rng(42)

# Fit to fourier data with expectation maximization
for i in range(num_expectation_maximization_cycles):
  for idx in rng.permutation(len(X_trains)):
    kf.em(X_trains[idx], n_iter=1)

In [178]:
# Predict the states for the fourier data
X_trains_states = np.concatenate([kf.smooth(X_train)[0] for X_train in X_trains], axis=0)
X_vals_states = np.concatenate([kf.smooth(X_val)[0] for X_val in X_vals], axis=0)

In [180]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_trains_states)
X_trains_states = scaler.transform(X_trains_states) 
X_vals_states = scaler.transform(X_vals_states)

In [182]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression(max_iter=200)
model.fit(X_trains_states[~train_nan_mask], y_train[~train_nan_mask])

In [186]:
train_pred = model.predict(X_trains_states[~train_nan_mask])

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      9855
         1.0       0.30      0.04      0.07       679
         2.0       0.63      0.96      0.76      2282
         3.0       0.00      0.00      0.00       103
         4.0       0.92      0.97      0.95       202
         5.0       0.11      0.01      0.03       559

    accuracy                           0.88     13680
   macro avg       0.49      0.49      0.46     13680
weighted avg       0.84      0.88      0.85     13680

proportions in classes
[0.71934307 0.04956204 0.16656934 0.00751825 0.01474453 0.04080292
 0.00145985]

confusion matrix
[[0.97 0.01 0.02 0.   0.   0.01]
 [0.18 0.04 0.78 0.   0.   0.01]
 [0.04 0.   0.96 0.   0.   0.  ]
 [0.04 0.   0.88 0.   0.08 0.  ]
 [0.03 0.   0.   0.   0.97 0.  ]
 [0.07 0.01 0.91 0.   0.   0.01]]

Fowlkes-Meadows: 0.9151
Homogeneity: 0.5604
CompletenessV Measure: 0.7168
Adjusted Rand: 0.629
Adjusted Mutual: 0.8042
A

' & .9151 & .5604 & .7168 & .629 & .8042 & .6286'

In [198]:
val_pred = model.predict(X_vals_states[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       0.99      0.90      0.94      9575
         1.0       0.08      0.06      0.07       730
         2.0       0.57      0.99      0.73      2507
         3.0        nan      0.00      0.00        73
         4.0       0.00       nan      0.00         0
         5.0       0.06      0.00      0.00       867

    accuracy                           0.81     13752
   macro avg       0.34      0.39      0.29     13752
weighted avg       0.80      0.81      0.79     13752

proportions in classes
[0.69626236 0.05308319 0.18230076 0.00530832 0.06304538]

confusion matrix
[[0.9  0.05 0.04 0.   0.01 0.  ]
 [0.01 0.06 0.93 0.   0.   0.  ]
 [0.01 0.   0.99 0.   0.   0.  ]
 [0.05 0.   0.95 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.08 0.06 0.85 0.   0.   0.  ]]

Fowlkes-Meadows: 0.8457
Homogeneity: 0.5143
Completeness: 0.5791
V Measure: 0.5448
Adjusted Rand: 0.6812
Adjusted Mutual: 0.5445
Accuracy: 0.

' & .8457 & .5143 & .5791 & .5448 & .6812 & .5445 & .8136'

# Softmax classification on Fourier data

## 4. train softmax on one person, training and validating on random time segments

In [150]:
patient = data['train_patients'][0]
print(patient)
X, y = data[patient][:, :-1], data[patient][:, -1]
X.shape

SC4241


(2702, 64)

In [153]:
# Train-validation split
val_size = 0.3
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=42)

# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape, X_val.shape

((1891, 64), (811, 64))

In [155]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression()
model.fit(X_train[~train_nan_mask], y_train[~train_nan_mask])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [157]:
train_pred = model.predict(X_train[~train_nan_mask])

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      1314
         1.0       0.65      0.25      0.37        59
         2.0       0.85      0.93      0.88       360
         3.0       0.68      0.48      0.57        27
         5.0       0.88      0.85      0.86       131

    accuracy                           0.94      1891
   macro avg       0.81      0.70      0.73      1891
weighted avg       0.94      0.94      0.94      1891

proportions in classes
[0.69487044 0.03120042 0.19037546 0.01427816 0.06927552]

confusion matrix
[[1.   0.   0.   0.   0.  ]
 [0.1  0.25 0.54 0.   0.1 ]
 [0.02 0.01 0.92 0.02 0.03]
 [0.   0.   0.52 0.48 0.  ]
 [0.03 0.02 0.1  0.   0.85]]

Fowlkes-Meadows: 0.9665
Homogeneity: 0.7472
CompletenessV Measure: 0.8039
Adjusted Rand: 0.7745
Adjusted Mutual: 0.9282
Accuracy: 0.7734


' & .9665 & .7472 & .8039 & .7745 & .9282 & .7734'

In [158]:
val_pred = model.predict(X_val[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       559
         1.0       0.50      0.23      0.32        26
         2.0       0.87      0.95      0.91       156
         3.0       0.82      0.53      0.64        17
         5.0       0.84      0.87      0.85        53

    accuracy                           0.93       811
   macro avg       0.80      0.71      0.74       811
weighted avg       0.93      0.93      0.93       811

proportions in classes
[0.6892725  0.03205919 0.19235512 0.02096178 0.06535142]

confusion matrix
[[0.98 0.01 0.   0.   0.  ]
 [0.27 0.23 0.38 0.   0.12]
 [0.02 0.01 0.95 0.   0.03]
 [0.   0.   0.47 0.53 0.  ]
 [0.06 0.   0.08 0.   0.87]]

Fowlkes-Meadows: 0.9458
Homogeneity: 0.7128
CompletenessV Measure: 0.7592
Adjusted Rand: 0.7353
Adjusted Mutual: 0.8863
Accuracy: 0.7323


' & .9458 & .7128 & .7592 & .7353 & .8863 & .7323'

## 5. train softmax on one person, then run softmax on other person

In [19]:
train_patient = data['train_patients'][0]
print('train patient:', train_patient)
X_train, y_train = data[train_patient][:, :-1], data[train_patient][:, -1]

val_patient = data['validate_patients'][0]
print('val patient:', val_patient)
X_val, y_val = data[val_patient][:, :-1], data[val_patient][:, -1]

X_train.shape, X_val.shape

train patient: SC4241
val patient: SC4261


((2702, 64), (2800, 64))

In [20]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
train_X = scaler.transform(X_train)
val_X = scaler.transform(X_val)

X_train.shape, X_val.shape

((2702, 64), (2800, 64))

In [21]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression()
model.fit(X_train[~train_nan_mask], y_train[~train_nan_mask])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
train_pred = model.predict(X_train[~train_nan_mask])

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1873
         1.0       0.57      0.25      0.34        85
         2.0       0.86      0.92      0.89       516
         3.0       0.74      0.64      0.68        44
         5.0       0.80      0.83      0.81       184

    accuracy                           0.94      2702
   macro avg       0.79      0.72      0.74      2702
weighted avg       0.93      0.94      0.93      2702

proportions in classes
[0.69319023 0.03145818 0.19096965 0.01628423 0.06809771]

confusion matrix
[[0.99 0.   0.   0.   0.  ]
 [0.15 0.25 0.45 0.   0.15]
 [0.01 0.01 0.92 0.02 0.04]
 [0.   0.   0.36 0.64 0.  ]
 [0.03 0.02 0.12 0.   0.83]]

Fowlkes-Meadows: 0.9617
Homogeneity: 0.733
CompletenessV Measure: 0.7725
Adjusted Rand: 0.7522
Adjusted Mutual: 0.9188
Accuracy: 0.7514


' & .9617 & .733 & .7725 & .7522 & .9188 & .7514'

In [25]:
val_pred = model.predict(X_val[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95      1830
         1.0       0.26      0.27      0.26       245
         2.0       0.55      0.93      0.69       405
         3.0       0.03      0.01      0.02        85
         4.0        nan      0.00      0.00         9
         5.0       0.82      0.30      0.44       226

    accuracy                           0.79      2800
   macro avg       0.53      0.41      0.39      2800
weighted avg       0.81      0.79      0.78      2800

proportions in classes
[0.65357143 0.0875     0.14464286 0.03035714 0.00321429 0.08071429]

confusion matrix
[[0.93 0.04 0.03 0.   0.   0.  ]
 [0.18 0.27 0.48 0.04 0.   0.03]
 [0.   0.04 0.93 0.02 0.   0.01]
 [0.   0.01 0.98 0.01 0.   0.  ]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.47 0.16 0.06 0.   0.3 ]]

Fowlkes-Meadows: 0.8535
Homogeneity: 0.5008
CompletenessV Measure: 0.5416
Adjusted Rand: 0.5204
Adjusted Mutual: 0.7288
Accuracy: 

' & .8535 & .5008 & .5416 & .5204 & .7288 & .5188'

## 6. train softmax on multiple people, then run softmax on other people

In [209]:
print('number of train patients:', len(data['train_patients']))
train_patients = data['train_patients']
print('train patients:', train_patients)
X_trains, y_train = \
  np.concatenate([data[train_patient][:, :-1] for train_patient in train_patients], axis=0), \
  np.concatenate([data[train_patient][:, -1] for train_patient in train_patients])
print()

print('number of val patients:', len(data['validate_patients']))
val_patients = data['validate_patients']
print('val patients:', val_patients)
X_vals, y_val = \
  np.concatenate([data[val_patient][:, :-1] for val_patient in val_patients], axis=0), \
  np.concatenate([data[val_patient][:, -1] for val_patient in val_patients])

num_features = X_trains.shape[1]

number of train patients: 75
train patients: ['SC4241' 'SC4242' 'SC4001' 'SC4002' 'SC4271' 'SC4272' 'SC4761' 'SC4762'
 'SC4021' 'SC4022' 'SC4341' 'SC4342' 'SC4011' 'SC4012' 'SC4411' 'SC4412'
 'SC4281' 'SC4282' 'SC4431' 'SC4432' 'SC4041' 'SC4042' 'SC4081' 'SC4082'
 'SC4731' 'SC4732' 'SC4631' 'SC4632' 'SC4121' 'SC4122' 'SC4051' 'SC4052'
 'SC4061' 'SC4062' 'SC4031' 'SC4032' 'SC4091' 'SC4092' 'SC4771' 'SC4772'
 'SC4211' 'SC4212' 'SC4451' 'SC4452' 'SC4251' 'SC4252' 'SC4111' 'SC4112'
 'SC4371' 'SC4372' 'SC4171' 'SC4172' 'SC4561' 'SC4562' 'SC4471' 'SC4472'
 'SC4581' 'SC4582' 'SC4741' 'SC4742' 'SC4231' 'SC4232' 'SC4522' 'SC4751'
 'SC4752' 'SC4651' 'SC4652' 'SC4611' 'SC4612' 'SC4151' 'SC4152' 'SC4321'
 'SC4322' 'SC4701' 'SC4702']

number of val patients: 31
val patients: ['SC4261' 'SC4262' 'SC4491' 'SC4492' 'SC4161' 'SC4162' 'SC4301' 'SC4302'
 'SC4311' 'SC4312' 'SC4571' 'SC4572' 'SC4591' 'SC4592' 'SC4501' 'SC4502'
 'SC4362' 'SC4721' 'SC4722' 'SC4641' 'SC4642' 'SC4461' 'SC4462' 'SC4201'
 'SC4202

In [210]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_trains)
X_trains = scaler.transform(X_trains) 
X_vals = scaler.transform(X_vals)

In [213]:
train_nan_mask = np.isnan(y_train)
val_nan_mask = np.isnan(y_val)

model = LogisticRegression(max_iter=200)
model.fit(X_trains[~train_nan_mask], y_train[~train_nan_mask])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [216]:
train_pred = model.predict(X_trains[~train_nan_mask])

report(y_train, train_pred, train_nan_mask, "training")

print('training report')
print(classification_report(y_train[~train_nan_mask], train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train[~train_nan_mask], return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train[~train_nan_mask], train_pred, normalize='true'), 2))
print()

get_scores(y_train[~train_nan_mask], train_pred)

training report
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97    139982
         1.0       0.41      0.15      0.22      9958
         2.0       0.74      0.90      0.81     33507
         3.0       0.54      0.38      0.45      4618
         4.0       0.74      0.52      0.61      2609
         5.0       0.69      0.67      0.68     12809

    accuracy                           0.89    203483
   macro avg       0.68      0.60      0.62    203483
weighted avg       0.87      0.89      0.87    203483

proportions in classes
[0.68374624 0.04864015 0.16366594 0.02255676 0.01274374 0.06256594]

confusion matrix
[[0.98 0.01 0.01 0.   0.   0.01]
 [0.27 0.15 0.4  0.   0.   0.18]
 [0.03 0.01 0.9  0.01 0.   0.04]
 [0.02 0.   0.52 0.38 0.07 0.  ]
 [0.01 0.   0.09 0.38 0.52 0.  ]
 [0.07 0.05 0.22 0.   0.   0.67]]

Fowlkes-Meadows: 0.9233
Homogeneity: 0.5919
Completeness: 0.6498
V Measure: 0.6195
Adjusted Rand: 0.8406
Adjusted Mutual: 0.6195
Acc

' & .9233 & .5919 & .6498 & .6195 & .8406 & .6195 & .8861'

In [215]:
val_pred = model.predict(X_vals[~val_nan_mask])

print('validation report')
print(classification_report(y_val[~val_nan_mask], val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val[~val_nan_mask], return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val[~val_nan_mask], val_pred, normalize='true'), 2))
print()

get_scores(y_val[~val_nan_mask], val_pred)

validation report
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96     57161
         1.0       0.37      0.15      0.22      4775
         2.0       0.74      0.89      0.81     14654
         3.0       0.25      0.12      0.16      1147
         4.0       0.11      0.11      0.11       207
         5.0       0.60      0.55      0.58      5290

    accuracy                           0.87     83234
   macro avg       0.50      0.47      0.47     83234
weighted avg       0.85      0.87      0.85     83234

proportions in classes
[0.68663511 0.05735874 0.17602825 0.01377811 0.00248655 0.06354507]

confusion matrix
[[0.97 0.01 0.01 0.   0.   0.01]
 [0.22 0.15 0.43 0.   0.   0.19]
 [0.05 0.02 0.89 0.01 0.   0.03]
 [0.04 0.   0.76 0.12 0.09 0.  ]
 [0.   0.   0.43 0.46 0.11 0.  ]
 [0.19 0.05 0.2  0.   0.   0.55]]

Fowlkes-Meadows: 0.8951
Homogeneity: 0.5139
Completeness: 0.5683
V Measure: 0.5397
Adjusted Rand: 0.7801
Adjusted Mutual: 0.5396
A

' & .8951 & .5139 & .5683 & .5397 & .7801 & .5396 & .8659'

## 7. ...

In [None]:
sleep_stages=  ('m', 'w', '1', '2', '3', '4', 'r')
num_stages = len(sleep_stages)

# dim_x doesn't necessarily need to be the same as num_stages
dim_x = num_stages
dim_z = num_features

# Init
kf = KalmanFilter(n_dim_state=dim_x, n_dim_obs=dim_z)

# Fit to fourier data with expectation maximization
kf.em(X_train)

<pykalman.standard.KalmanFilter at 0x7f4e7bb62dd0>

In [None]:
# Predict the states for the fourier data
X_train_states, _ = kf.smooth(X_train)
X_val_states, _ = kf.smooth(X_val)

In [None]:
# Transform data to have mean zero and standard deviation one
scaler = StandardScaler().fit(X_train)
X_train_states = scaler.transform(X_train)
X_val_states = scaler.transform(X_val)

In [None]:
model = LogisticRegression(max_iter=200)
model.fit(X_train_states, y_train)

In [None]:
train_pred = model.predict(X_train_states)

print('training report')
print(classification_report(y_train, train_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_train, train_pred, normalize='true'), 2))

training report
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1873
         1.0       0.17      0.04      0.06        85
         2.0       0.84      0.95      0.89       516
         3.0       0.81      0.50      0.62        44
         5.0       0.82      0.90      0.86       184

    accuracy                           0.94      2702
   macro avg       0.73      0.68      0.68      2702
weighted avg       0.92      0.94      0.93      2702

proportions in classes
[0.69319023 0.03145818 0.19096965 0.01628423 0.06809771]

confusion matrix
[[0.99 0.01 0.   0.   0.  ]
 [0.19 0.04 0.62 0.   0.15]
 [0.01 0.   0.95 0.01 0.03]
 [0.   0.   0.5  0.5  0.  ]
 [0.01 0.02 0.08 0.   0.9 ]]


In [None]:
val_pred = model.predict(X_val_states)

print('validation report')
print(classification_report(y_val, val_pred, zero_division=np.nan))

print('proportions in classes')
print(np.unique(y_val, return_counts=True)[1] / len(y_val))
print()

print('confusion matrix')
print(np.round(confusion_matrix(y_val, val_pred, normalize='true'), 2))

validation report
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00      1830
         1.0        nan      0.00      0.00       245
         2.0       0.00      0.00      0.00       405
         3.0       0.05      1.00      0.10        85
         4.0        nan      0.00      0.00         9
         5.0       0.00      0.00      0.00       226

    accuracy                           0.03      2800
   macro avg       0.26      0.17      0.02      2800
weighted avg       0.72      0.03      0.00      2800

proportions in classes
[0.65357143 0.0875     0.14464286 0.03035714 0.00321429 0.08071429]

confusion matrix
[[0.   0.   0.01 0.35 0.   0.64]
 [0.   0.   0.   0.98 0.   0.02]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.  ]]
