In [1]:
# Libraries
import numpy as np
import scipy.io as sio
import os
# import pickle
# import ieeg_funcs as ief
# import re
import dgFuncs as dg
from sklearn import preprocessing
# from scipy import stats
# from mpl_toolkits.axes_grid1 import make_axes_locatable
#rom sklearn import svm, linear_model
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def chan_labels_from_fname(in_file):
    just_fname=in_file.split('/')[-1]
    jf_splt=just_fname.split('_')
    chan_label=jf_splt[1]+'-'+jf_splt[2]
    return chan_label

In [3]:
# Get size of a subject's data (and filenames)
def data_size_and_fnames(sub_list,ftr_root):
    grand_non_fnames=list()
    grand_szr_fnames=list()
    grand_n_szr_wind=0
    grand_n_non_wind=0
    for sub in sub_list:
        print('Working on sub %d' % sub)
        non_fnames=list()
        szr_fnames=list()
        
        ftr_path=os.path.join(ftr_root,str(sub))
        for f in os.listdir(ftr_path):
            if f.endswith('non.mat'):
                non_fnames.append(os.path.join(ftr_root,str(sub),f))
            elif f.endswith('.mat') and f.startswith(str(sub)+'_'):
                szr_fnames.append(os.path.join(ftr_root,str(sub),f))

        print('%d non-szr files found' % len(non_fnames))
        print('%d szr files found' % len(szr_fnames))  

        # Loop over NON-szr files to get total # of windows
        n_non_wind=0
        ftr_dim=0
        for f in non_fnames:
            temp_ftrs=sio.loadmat(f)
            n_non_wind+=temp_ftrs['nonszr_se_ftrs'].shape[1]
            if ftr_dim==0:
                ftr_dim=temp_ftrs['nonszr_se_ftrs'].shape[0]
            elif ftr_dim!=temp_ftrs['nonszr_se_ftrs'].shape[0]:
                raise ValueError('# of features in file does match previous files')

        print('%d total # of NON-szr time windows for this sub' % n_non_wind)

        # Loop over SZR files to get total # of windows
        n_szr_wind=0
        for f in szr_fnames:
            temp_ftrs=sio.loadmat(f)
            n_szr_wind+=temp_ftrs['se_ftrs'].shape[1]
        print('%d total # of SZR time windows for this sub' % n_szr_wind)
        
        grand_non_fnames+=non_fnames
        grand_szr_fnames+=szr_fnames
        grand_n_szr_wind+=n_szr_wind
        grand_n_non_wind+=n_non_wind
        
    return grand_szr_fnames, grand_non_fnames, grand_n_szr_wind, grand_n_non_wind, ftr_dim

In [4]:
def import_nonszr_data(non_fnames, n_non_wind, ftr_dim):
    #ftr_path=os.path.join(ftr_root,str(sub))
        
    # Preallocate memory
    ftrs=np.zeros((ftr_dim,n_non_wind))

    # Import non-szr data
    ptr=0
    mns_dict=dict()
    sds_dict=dict()
    for f in non_fnames:
        chan_label=chan_labels_from_fname(f)

        temp_ftrs=sio.loadmat(f)
        temp_n_wind=temp_ftrs['nonszr_se_ftrs'].shape[1]
        raw_ftrs=temp_ftrs['nonszr_se_ftrs']
        # Z-score features
        temp_mns, temp_sds=dg.trimmed_normalize(raw_ftrs,0,zero_nans=False,verbose=False)
        mns_dict[chan_label]=temp_mns
        sds_dict[chan_label]=temp_sds
    
        ftrs[:,ptr:ptr+temp_n_wind]=raw_ftrs
        ptr+=temp_n_wind

    return ftrs

In [9]:
def import_szr_data(szr_fnames, non_fnames, n_szr_wind, ftr_dim):
        
    # Preallocate memory
    ftrs=np.zeros((ftr_dim,n_szr_wind))

    # Import non-szr data to compute mean and SD
    #ptr=0
    mns_dict=dict()
    sds_dict=dict()
    for f in non_fnames:
        chan_label=chan_labels_from_fname(f)

        temp_ftrs=sio.loadmat(f)
        temp_n_wind=temp_ftrs['nonszr_se_ftrs'].shape[1]
        raw_ftrs=temp_ftrs['nonszr_se_ftrs']
        # Z-score features
        temp_mns, temp_sds=dg.trimmed_normalize(raw_ftrs,0,zero_nans=False,verbose=False)
        mns_dict[chan_label]=temp_mns
        sds_dict[chan_label]=temp_sds
#         ptr+=temp_n_wind

    # Import szr data
    ptr=0
    for f in szr_fnames:
        chan_label=chan_labels_from_fname(f)

        temp_ftrs=sio.loadmat(f)
        temp_n_wind=temp_ftrs['se_ftrs'].shape[1]
        raw_ftrs=temp_ftrs['se_ftrs']
        # Z-score based on non-ictal means, SDs
        dg.applyNormalize(raw_ftrs,mns_dict[chan_label],sds_dict[chan_label])

        ftrs[:,ptr:ptr+temp_n_wind]=raw_ftrs
        ptr+=temp_n_wind

    return ftrs

In [10]:
train_subs=[620] 
#/Users/davidgroppe/PycharmProjects/SZR_ANT/EU_GENERAL/EU_GENERAL_FTRS/SE/1096_HL1_HL2_non.ma
ftr_root='/home/dgroppe/GIT/SZR_ANT/EU_GENERAL/EU_GENERAL_FTRS/SE/'
ftr_root='/Users/davidgroppe/PycharmProjects/SZR_ANT/EU_GENERAL/EU_GENERAL_FTRS/SE/'
szr_fnames_tr, non_fnames_tr, n_szr_wind_tr, n_non_wind_tr, ftr_dim=data_size_and_fnames(train_subs, ftr_root)
# ftrs_tr=import_nonszr_data(non_fnames_tr, n_non_wind_tr, ftr_dim)
ftrs_tr=import_szr_data(szr_fnames_tr, non_fnames_tr, n_szr_wind_tr, ftr_dim)

Working on sub 620
5 non-szr files found
14 szr files found
8960 total # of NON-szr time windows for this sub
8210 total # of SZR time windows for this sub


In [20]:
# Apply kmeans
# kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# kmeans.cluster_centers_
k=int(np.round(n_non_wind_tr)/10)
print('Trying %d clusters' % k)
kclusters = KMeans(n_clusters=k).fit(ftrs_tr.T)

Trying 896 clusters


In [17]:
# Apply kmeans
# kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# kmeans.cluster_centers_
dec_fact=6
print('Dec fact=%f' % dec_fact)
k=int(np.round(n_non_wind_tr)/dec_fact)
print('Trying %d clusters' % k)
kclust = KMeans(n_clusters=k,n_jobs=-1).fit(ftrs_tr.T)
print(kclust.inertia_)
sum_dist=np.sum(np.sqrt(np.sum(ftrs_tr**2,axis=0)))
print(sum_dist)
print('Pptn VA=%f ' % (1-np.divide(kclust.inertia_,sum_dist)))

Dec fact=6.000000
Trying 1493 clusters


Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Process ForkPoolWorker-11:
Process ForkPoolWorker-12:


KeyboardInterrupt: 

In [12]:
# Apply kmeans
# kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# kmeans.cluster_centers_
k=int(np.round(n_szr_wind_tr)/2)
print('Trying %d clusters' % k)
kclusters2 = KMeans(n_clusters=k,n_jobs=-1).fit(ftrs_tr.T)

Trying 4105 clusters


In [14]:
# Measure % variance accounted for
print('Dec fact=2')
kclust=kclusters2
print(kclust.inertia_)
sum_dist=np.sum(np.sqrt(np.sum(ftrs_tr**2,axis=0)))
print(sum_dist)
print('Pptn VA=%f ' % (1-np.divide(kclust.inertia_,sum_dist)))

Dec fact=2
3851.06008892
669758.795172
Pptn VA=0.994250 


In [36]:
# Measure % variance accounted for
print(kclusters2.inertia_)
sum_dist=np.sum(np.sqrt(np.sum(ftrs_tr**2,axis=0)))
print(sum_dist)
print('Pptn VA=%f ' % (1-np.divide(kclusters2.inertia_,sum_dist)))

10647.0532239
42848.8485957
Pptn VA=0.751521 


In [12]:
# Get Training Data
train_subs=[1096, 620, 590, 862, 253, 1125]
#train_subs=[565] # Data look mislabelled
#train_subs=[264] #KeyError: 'BLA1-BLA2', some non files missing
#train_subs=[273] # Data look mislabelled
train_subs=[253, 1125]
train_subs=[1077] 
train_subs=[620] 
#/Users/davidgroppe/PycharmProjects/SZR_ANT/EU_GENERAL/EU_GENERAL_FTRS/SE/1096_HL1_HL2_non.ma
ftr_root='/home/dgroppe/SZR_ANT/EU_GENERAL/EU_GENERAL_FTRS/SE/'
szr_fnames_tr, non_fnames_tr, n_szr_wind_tr, n_non_wind_tr, ftr_dim=data_size_and_fnames(train_subs, ftr_root)
ftrs_tr, targ_labels_tr=import_data(szr_fnames_tr, non_fnames_tr, n_szr_wind_tr, n_non_wind_tr, ftr_dim)

Working on sub 620
5 non-szr files found
14 szr files found
8960 total # of NON-szr time windows for this sub
8210 total # of SZR time windows for this sub


TypeError: import_nonszr_data() takes 3 positional arguments but 5 were given

In [None]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.cluster_centers_