# Forecasting Seizures From EEG Data: Models

#### Author: Burak Himmetoglu

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from utils.construct_save_features import *
from utils.get_prepare_data_full import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline

## Logistic Regression with L1 norm

### Dog 1

In [2]:
## Dog 1
# Load the arrays saved for this subject
X_1 = np.load('../data/Dog_1_X.npy')
Y_1 = np.load('../data/Dog_1_Y.npy')
clip_ids_1 = np.load('../data/Dog_1_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_1))
X_1, Y_1, clip_ids_1 = X_1[shuffle], Y_1[shuffle], clip_ids_1[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_1)
X_1 = X_scaler.transform(X_1)

In [3]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
Cvalues = np.linspace(0.01,100,10)

# Fit for Dog 1: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_1,Y_1)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  2.8min remaining:  4.2min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  2.8min finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7f4e8866afc0>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [6]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 11.1200
Best model has the CV scores:
[[ 0.93061542]
 [ 0.94642002]
 [ 0.91528975]
 [ 0.90235872]
 [ 0.93712798]]


In [7]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.926362


### Dog 2

In [2]:
## Dog 2
# Load the arrays saved for this subject
X_2 = np.load('../data/Dog_2_X.npy')
Y_2 = np.load('../data/Dog_2_Y.npy')
clip_ids_2 = np.load('../data/Dog_2_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_2))
X_2, Y_2, clip_ids_2 = X_2[shuffle], Y_2[shuffle], clip_ids_2[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_2)
X_2 = X_scaler.transform(X_2)

In [3]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
Cvalues = np.linspace(0.01,100,10)

# Fit for Dog 2: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_2,Y_2)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   18.6s remaining:   27.9s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   25.9s finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7fa5704ae150>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [4]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 55.5600
Best model has the CV scores:
[[ 0.99362745]
 [ 0.99346405]
 [ 0.99213333]
 [ 0.99636667]
 [ 0.99116667]]


In [5]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.993352


### Dog 3

In [6]:
## Dog 3
# Load the arrays saved for this subject
X_3 = np.load('../data/Dog_3_X.npy')
Y_3 = np.load('../data/Dog_3_Y.npy')
clip_ids_3 = np.load('../data/Dog_3_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_3))
X_3, Y_3, clip_ids_3 = X_3[shuffle], Y_3[shuffle], clip_ids_3[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_3)
X_3 = X_scaler.transform(X_3)

In [7]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
Cvalues = np.linspace(0.01,100,10)

# Fit for Dog 3: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_3,Y_3)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  2.1min remaining:  3.1min


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  2.3min finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7fa5704f5fc0>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [8]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 11.1200
Best model has the CV scores:
[[ 0.96881652]
 [ 0.97086526]
 [ 0.96658323]
 [ 0.98329161]
 [ 0.98276674]]


In [9]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.974465


### Dog 4

In [3]:
## Dog 4
# Load the arrays saved for this subject
X_4 = np.load('../data/Dog_4_X.npy')
Y_4 = np.load('../data/Dog_4_Y.npy')
clip_ids_4 = np.load('../data/Dog_4_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_4))
X_4, Y_4, clip_ids_4 = X_4[shuffle], Y_4[shuffle], clip_ids_4[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_4)
X_4 = X_scaler.transform(X_4)

In [4]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
# NOTE(review): the recorded run selected C = 100, the largest value in this grid —
# consider extending the grid upward to confirm the optimum is not beyond it.
Cvalues = np.linspace(0.01,100,10)

# Fit for Dog 4: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_4,Y_4)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   41.6s remaining:  1.0min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   53.7s finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7efd884d9410>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [5]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 100.0000
Best model has the CV scores:
[[ 0.9220318 ]
 [ 0.93283734]
 [ 0.91929605]
 [ 0.92872968]
 [ 0.93574725]]


In [6]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.927728


### Dog 5

In [7]:
## Dog 5
# Load the arrays saved for this subject
X_5 = np.load('../data/Dog_5_X.npy')
Y_5 = np.load('../data/Dog_5_Y.npy')
clip_ids_5 = np.load('../data/Dog_5_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_5))
X_5, Y_5, clip_ids_5 = X_5[shuffle], Y_5[shuffle], clip_ids_5[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_5)
X_5 = X_scaler.transform(X_5)

In [8]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
Cvalues = np.linspace(0.01,100,10)

# Fit for Dog 5: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_5,Y_5)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    4.2s remaining:    6.4s


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    6.7s finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7efd5767db48>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [9]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 11.1200
Best model has the CV scores:
[[ 0.9997428 ]
 [ 0.99459877]
 [ 0.98544239]
 [ 0.99645062]
 [ 0.9965535 ]]


In [10]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.994558


### Patient 1

In [2]:
## Patient 1
# Load the arrays saved for this subject
X_p1 = np.load('../data/Patient_1_X.npy')
Y_p1 = np.load('../data/Patient_1_Y.npy')
clip_ids_p1 = np.load('../data/Patient_1_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_p1))
X_p1, Y_p1, clip_ids_p1 = X_p1[shuffle], Y_p1[shuffle], clip_ids_p1[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_p1)
X_p1 = X_scaler.transform(X_p1)

In [3]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Grid of C values to search
Cvalues = np.linspace(0.01,100,10)

# Fit for Patient 1: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5, tol=0.001)

cls.fit(X_p1,Y_p1)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   11.7s remaining:   17.6s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   15.9s finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7f900c5f4150>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [4]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 11.1200
Best model has the CV scores:
[[ 0.98531249]
 [ 0.98496595]
 [ 0.99123988]
 [ 0.98067857]
 [ 0.98852592]]


In [5]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.986145


### Patient 2

In [2]:
## Patient 2
# Load the arrays saved for this subject
X_p2 = np.load('../data/Patient_2_X.npy')
Y_p2 = np.load('../data/Patient_2_Y.npy')
clip_ids_p2 = np.load('../data/Patient_2_clip_ids.npy')

# One seeded permutation of the row indices, applied to all three arrays so the
# rows stay aligned while the stored (time) order is broken up for the k-folds
np.random.seed(1)
shuffle = np.random.permutation(len(Y_p2))
X_p2, Y_p2, clip_ids_p2 = X_p2[shuffle], Y_p2[shuffle], clip_ids_p2[shuffle]

# Standardize the feature columns; the fitted scaler stays available as X_scaler.
# NOTE(review): the scaler is fit on every row before the cross-validation split in
# the next cell, so validation folds contribute to the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_p2)
X_p2 = X_scaler.transform(X_p2)

In [3]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed when
# the data was loaded, so no shuffling is requested here. `random_state` is omitted on
# purpose: it has no effect when shuffle=False, and newer scikit-learn releases raise
# a ValueError when it is set in that case.
skf = StratifiedKFold(n_splits=5)

# Range of C: a three-value grid is used for this subject; the ten-point grid used
# for the other subjects is kept commented out for reference.
# NOTE(review): the recorded run selected C = 1.0, the largest value in this grid —
# consider extending the grid upward to confirm the optimum is not beyond it.
#Cvalues = np.linspace(0.01,100,10)
Cvalues = np.array([0.01,0.1,1])

# Fit for Patient 2: L1-penalized logistic regression, C chosen by cross-validated ROC AUC.
# The splitter object itself is passed as `cv` (rather than the generator returned by
# skf.split(...), which can be consumed only once) so the estimator can be re-fit.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=200, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 4, tol=0.001)

cls.fit(X_p2,Y_p2)

[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   17.5s finished


LogisticRegressionCV(Cs=array([ 0.01,  0.1 ,  1.  ]), class_weight=None,
           cv=<generator object _BaseKFold.split at 0x10b667048>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=200, multi_class='ovr', n_jobs=4, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.001, verbose=1)

In [4]:
# Report the selected C and the per-fold CV scores recorded for it
best_C = cls.C_[0]
best_column = cls.Cs_ == best_C  # boolean mask picking the selected C out of the grid
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
print(cls.scores_[1][:, best_column])

Best model has C = 1.0000
Best model has the CV scores:
[[ 0.93173999]
 [ 0.94843644]
 [ 0.97325854]
 [ 0.95916086]
 [ 0.9453699 ]]


In [14]:
# Mean of the per-fold CV scores at the selected C.
# "{:.6f}" states the six-decimal precision explicitly; a bare "{:4f}" only sets a
# minimum field width of 4 and falls back to the same six decimals by default.
print("Average AUC = {:.6f}".format(np.mean(cls.scores_[1][:,cls.Cs_ == cls.C_[0]])))

Average AUC = 0.951593
