# Forecasting Seizures From EEG Data: Models for Dogs 1 to 4

#### Author: Burak Himmetoglu

In [13]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from utils.construct_save_features import *
from utils.get_prepare_data_full import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline

Read the engineered features for Dogs 1 to 4 and combine them into a single data set

In [3]:
# List the files present in the engineered-features directory
!ls ../data/engineered_features/

Dog_1_clip_ids.npy	Dog_2_Y.npy		Dog_4_X.npy
Dog_1_test_dict.pickle	Dog_3_clip_ids.npy	Dog_4_Y.npy
Dog_1_test.npy		Dog_3_test_dict.pickle	Dog_5_clip_ids.npy
Dog_1_X.npy		Dog_3_test.npy		Dog_5_test_dict.pickle
Dog_1_Y.npy		Dog_3_X.npy		Dog_5_test.npy
Dog_2_clip_ids.npy	Dog_3_Y.npy		Dog_5_X.npy
Dog_2_test_dict.pickle	Dog_4_clip_ids.npy	Dog_5_Y.npy
Dog_2_test.npy		Dog_4_test_dict.pickle
Dog_2_X.npy		Dog_4_test.npy


In [4]:
# Subjects to pool and the directory holding their saved feature arrays
dogs = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4']
data_path = "../data/engineered_features/"

# Per-dog accumulators: training features, labels and test arrays, plus a
# dog -> clip-id array lookup
array_list_X, array_list_Y, array_list_Xtest = [], [], []
clip_id_dict = {}

for dog in dogs:
    prefix = data_path + dog

    # Load this dog's arrays and collect them in the same order as `dogs`
    array_list_X.append(np.load(prefix + "_X.npy"))
    array_list_Y.append(np.load(prefix + "_Y.npy"))
    test = np.load(prefix + "_test.npy")
    array_list_Xtest.append(test)
    clip_id_dict[dog] = np.load(prefix + "_clip_ids.npy")

# Stack the per-dog arrays along the first axis into single arrays
X = np.concatenate(array_list_X, axis=0)
Y = np.concatenate(array_list_Y, axis=0)
X_test = np.concatenate(array_list_Xtest, axis=0)

The combined data has many more instances than the data of any single dog:

In [6]:
# Shapes of the combined feature matrix X and label vector Y
print(X.shape,Y.shape)

(20754, 166) (20754,)


In [8]:
# Draw a seeded random ordering of the row indices so that neither recording
# time nor dog order stays contiguous inside the k-folds used further down.
# NOTE(review): re-running this cell reshuffles and rescales the already
# processed X/Y; rerun the loading cell first.
np.random.seed(1)
n_samples = len(Y)
shuffle = np.random.choice(np.arange(n_samples), size=n_samples, replace=False)
X, Y = X[shuffle], Y[shuffle]

# Standardize the features with statistics estimated on the training rows.
# NOTE(review): X_test is not transformed here; presumably it must go through
# X_scaler.transform before any prediction -- confirm downstream.
X_scaler = StandardScaler()
X = X_scaler.fit(X).transform(X)

## Linear model

In [9]:
# Stratified 5-fold splitter. The rows were already shuffled with a fixed seed,
# so the folds themselves are left unshuffled. random_state is omitted on purpose:
# it has no effect without shuffle=True, and recent scikit-learn releases raise a
# ValueError for that combination.
skf = StratifiedKFold(n_splits=5)

# Candidate values for the inverse regularization strength C
Cvalues = np.linspace(0.01,100,10)

# L1-penalized logistic regression on the combined data (Dogs 1 to 4), with C
# selected by cross-validated ROC AUC. The splitter object is passed as `cv`
# instead of the one-shot generator returned by skf.split(X, Y), so the
# estimator can be fit again without finding an exhausted iterator.
cls = LogisticRegressionCV(Cs = Cvalues, cv = skf, max_iter=150, solver = 'liblinear',
                           penalty = "l1", scoring = 'roc_auc', verbose = 1, n_jobs = 5)

cls.fit(X,Y)

[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  7.5min remaining: 11.2min


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  8.4min finished


LogisticRegressionCV(Cs=array([  1.00000e-02,   1.11200e+01,   2.22300e+01,   3.33400e+01,
         4.44500e+01,   5.55600e+01,   6.66700e+01,   7.77800e+01,
         8.88900e+01,   1.00000e+02]),
           class_weight=None,
           cv=<generator object _BaseKFold.split at 0x7fc2bc44d0a0>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=150, multi_class='ovr', n_jobs=5, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc',
           solver='liblinear', tol=0.0001, verbose=1)

In [10]:
# Report the selected regularization strength and its per-fold CV scores
best_C = cls.C_[0]
print("Best model has C = {:.4f}".format(best_C))
print("Best model has the CV scores:")
# scores_[1] is indexed as (fold, C); keep only the column of the selected C
fold_scores = cls.scores_[1]
print(fold_scores[:, cls.Cs_ == best_C])

Best model has C = 33.3400
Best model has the CV scores:
[[ 0.8874487 ]
 [ 0.88381919]
 [ 0.89591021]
 [ 0.88780431]
 [ 0.9004021 ]]


## Random Forest

In [14]:
# Stratified 5-fold splitter (unshuffled). random_state is omitted because it has
# no effect without shuffle=True and recent scikit-learn releases reject the
# combination with a ValueError.
skf = StratifiedKFold(n_splits=5)

# Range of mtry: candidate values for max_features
mtry = [1,2,4,8,12,16,32]

# Classifier, seeded so that the grid-search scores are reproducible across runs
cls = RandomForestClassifier(n_estimators=600, random_state=1234)

# Grid search over max_features scored by ROC AUC. The splitter defined above is
# passed explicitly; for a classifier it yields the same folds as cv=5.
grid = GridSearchCV(cls, param_grid={'max_features' : mtry}, scoring = 'roc_auc', cv = skf, verbose = 1,
                    n_jobs = 5)


grid.fit(X,Y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=5)]: Done  35 out of  35 | elapsed: 19.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'max_features': [1, 2, 4, 8, 12, 16, 32]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [15]:
# Summarize the grid search: winning max_features value and its best CV score
best_mtry = grid.best_params_['max_features']
best_auc = grid.best_score_
print("Best mtry = {:d}".format(best_mtry))
print("Best AUC = {:.4f}".format(best_auc))

Best mtry = 8
Best AUC = 0.9534
