In [1]:
import pandas as pd
import numpy as np
from source import data_import
from source.calculate_mean_psd import calculate_mean_psd
from source.constants import CHANNELS, FREQUENCY_RANGES

# Show which channel labels were imported from source.constants.
print(f"loaded channel names: {CHANNELS}")

loaded channel names: ['F4-C4', 'F3-C3', 'FT9-FT10', 'FZ-CZ', 'F7-T7', 'FP2-F4', 'T8-P8-1', 'T8-P8-0', 'FP1-F3', 'CZ-PZ']


In [2]:
# Load the preprocessed, segmented data set through the project loader
# (the loader reports the path of the .arrow file it read) and show its
# (rows, columns) shape.
df_pp = data_import.load_pyarrow(file_name="preprocessed_seg_all")
df_pp.shape

/home/weasel/reps/ai-seizure-detectives/source/../data/preprocessed_seg_all.arrow was loaded.


(16857600, 17)

In [3]:
# Count missing values per column of the raw frame.
df_pp.isna().sum()

channel
F4-C4             0
F3-C3             0
FT9-FT10          0
FZ-CZ             0
F7-T7             0
FP2-F4            0
T8-P8-1           0
T8-P8-0           0
FP1-F3            0
CZ-PZ             0
before_seizure    0
is_seizure        0
target            0
epoch             0
segment_id        0
seizure_start     0
seizure_id        0
dtype: int64

In [4]:
# Re-display the (rows, columns) shape of the loaded frame.
df_pp.shape

(16857600, 17)

In [5]:
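# Optional: uncomment to work on only the first 2,000,000 rows (faster iteration while developing).
# df_pp = df_pp.iloc[:2_000_000,:]
# df_pp.shape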

## Feature Extraction

In [6]:
# ignore for aggregation
# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed.
ignore_col = []

# target definition
# Upper bound used by target_foo, where it is compared against
# x.dt.total_seconds(); the value is therefore in seconds.
PRED_INTERVAL = 60
# Name of the label column: aggregated alongside the channels and kept
# un-suffixed when the column header is flattened.
target_colname = 'target'

In [7]:
### Aggregation callables for the mean PSD of a single frequency band
def _band_power(band):
    """Build an aggregation callable for one entry of FREQUENCY_RANGES.

    The returned callable receives a Series ``x``, hands it to
    ``calculate_mean_psd`` restricted to ``band`` and picks the value stored
    under ``x.name`` and ``band`` from the result.
    """
    return lambda x: calculate_mean_psd(
        x, frequency_ranges={band: FREQUENCY_RANGES[band]}
    )[x.name][band]


delta = _band_power('Delta')
theta = _band_power('Theta')
gamma = _band_power('Gamma')

# Each NamedAgg pairs the output column label with its callable.
delta_agg, theta_agg, gamma_agg = (
    pd.NamedAgg(column=label, aggfunc=func)
    for label, func in (('delta', delta), ('theta', theta), ('gamma', gamma))
)

### Aggregation callable for the target
# True when the smallest value of the group, expressed in seconds, lies
# strictly between 0 and pred_interval.
target_foo = lambda x, pred_interval=PRED_INTERVAL: (
    0 < x.dt.total_seconds().min() < pred_interval
)


In [8]:
def abs_mean(x):
    """Return the mean of the absolute values of a pandas Series.

    Uses the vectorized ``Series.abs()`` instead of ``apply(abs)``, which
    invokes Python's ``abs`` once per element; the result is the same.
    """
    return x.abs().mean()


# Pairs the output column label 'abs_mean' with the callable above.
abs_mean_agg = pd.NamedAgg(column='abs_mean', aggfunc=abs_mean)

In [9]:
# Statistics computed for every channel within each (epoch, segment_id) group.
channel_aggs = [
    'std',
    'var',
    'mean',
    abs_mean_agg,
    delta_agg,
    theta_agg,
    gamma_agg,
]

# One list of aggregations per channel, plus the label aggregation.
agg_spec = {channel: list(channel_aggs) for channel in CHANNELS}
agg_spec[target_colname] = [target_foo]

df_features = df_pp.groupby(['epoch', 'segment_id']).agg(agg_spec)
df_features.head()

Unnamed: 0_level_0,channel,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F3-C3,F3-C3,F3-C3,...,FP1-F3,FP1-F3,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,target
Unnamed: 0_level_1,Unnamed: 1_level_1,std,var,mean,abs_mean,delta,theta,gamma,std,var,mean,...,theta,gamma,std,var,mean,abs_mean,delta,theta,gamma,<lambda>
epoch,segment_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,0,19.215291,369.227395,1.67776,14.539727,39.509423,15.649456,1.143016,21.315638,454.356435,1.053339,...,92.822474,22.441816,16.368013,267.91186,0.692305,13.053429,33.181709,18.606278,0.272641,False
0,1,19.292719,372.208995,-0.391839,15.081409,57.166882,13.481116,1.21843,25.477037,649.079429,0.811962,...,76.80825,33.427544,18.012047,324.43384,-0.487618,14.154003,40.078259,21.722061,0.268936,False
0,2,26.026211,677.363658,1.356797,18.727514,112.603482,22.111449,0.943441,27.729263,768.912015,1.282874,...,176.024973,8.914849,22.813508,520.456131,1.294472,18.522101,78.066994,19.695134,0.256728,False
0,3,26.99949,728.972443,-1.657887,18.8776,148.551723,15.161321,1.178056,29.614908,877.042781,-2.174102,...,75.730101,21.215121,19.942794,397.715029,-0.47604,15.324033,60.340596,18.82107,0.258679,False
0,4,23.273241,541.643731,0.776272,17.412866,77.372989,25.561364,1.423834,23.413506,548.192254,0.848064,...,284.475569,16.303152,18.725786,350.655046,1.382429,15.186356,46.083693,24.182717,0.271254,False


## Flatten Dataframe

In [10]:
# Flatten the two-level (source column, aggregation) header into
# "<column>_<aggregation>" names; the target column keeps its plain name.
# Names are mapped position by position, so the result is correct wherever
# the target column sits. The MultiIndex guard makes the cell safe to re-run:
# once the header is flat, nothing is joined or reset a second time.
if isinstance(df_features.columns, pd.MultiIndex):
    df_features.columns = [
        target_colname if col[0] == target_colname else '_'.join(col).strip()
        for col in df_features.columns
    ]
    # Turn the (epoch, segment_id) group keys back into ordinary columns.
    df_features = df_features.reset_index()
df_features.head(10)


Unnamed: 0,epoch,segment_id,F4-C4_std,F4-C4_var,F4-C4_mean,F4-C4_abs_mean,F4-C4_delta,F4-C4_theta,F4-C4_gamma,F3-C3_std,...,FP1-F3_theta,FP1-F3_gamma,CZ-PZ_std,CZ-PZ_var,CZ-PZ_mean,CZ-PZ_abs_mean,CZ-PZ_delta,CZ-PZ_theta,CZ-PZ_gamma,target
0,0,0,19.215291,369.227395,1.67776,14.539727,39.509423,15.649456,1.143016,21.315638,...,92.822474,22.441816,16.368013,267.91186,0.692305,13.053429,33.181709,18.606278,0.272641,False
1,0,1,19.292719,372.208995,-0.391839,15.081409,57.166882,13.481116,1.21843,25.477037,...,76.80825,33.427544,18.012047,324.43384,-0.487618,14.154003,40.078259,21.722061,0.268936,False
2,0,2,26.026211,677.363658,1.356797,18.727514,112.603482,22.111449,0.943441,27.729263,...,176.024973,8.914849,22.813508,520.456131,1.294472,18.522101,78.066994,19.695134,0.256728,False
3,0,3,26.99949,728.972443,-1.657887,18.8776,148.551723,15.161321,1.178056,29.614908,...,75.730101,21.215121,19.942794,397.715029,-0.47604,15.324033,60.340596,18.82107,0.258679,False
4,0,4,23.273241,541.643731,0.776272,17.412866,77.372989,25.561364,1.423834,23.413506,...,284.475569,16.303152,18.725786,350.655046,1.382429,15.186356,46.083693,24.182717,0.271254,False
5,0,5,22.655431,513.268557,-0.036669,16.969776,76.212479,24.818786,1.681127,27.859298,...,249.780076,36.113652,19.74236,389.760781,-0.752261,15.359382,47.760727,27.81837,0.31438,False
6,0,6,23.433931,549.149139,0.8283,17.152483,76.743032,11.805732,0.505023,25.189341,...,71.469402,5.602255,19.722776,388.987881,-1.199098,15.671624,70.917918,21.647252,0.286571,False
7,0,7,18.370345,337.469568,0.423259,14.567939,48.173382,16.238093,0.567528,20.170082,...,11.973851,11.339858,17.475646,305.398194,1.369375,14.01237,41.424539,19.537928,0.229929,False
8,0,8,21.01378,441.578959,-0.204437,16.212443,65.756756,19.679089,0.560435,20.608746,...,123.848518,11.259756,19.824534,393.012134,-0.127999,15.558344,44.65695,22.937501,0.205348,False
9,0,9,20.651241,426.473748,-0.541733,14.850569,63.858712,22.94871,0.843431,25.653559,...,165.497411,13.510767,19.307319,372.772585,0.023715,15.797946,53.68535,20.680972,0.175309,False


In [11]:
# Count missing values per column of the aggregated feature table.
df_features.isna().sum()

epoch             0
segment_id        0
F4-C4_std         0
F4-C4_var         0
F4-C4_mean        0
                 ..
CZ-PZ_abs_mean    0
CZ-PZ_delta       0
CZ-PZ_theta       0
CZ-PZ_gamma       0
target            0
Length: 73, dtype: int64

In [12]:
# Rescale the feature columns with MinMaxScaler (default settings).
from sklearn.preprocessing import StandardScaler, minmax_scale, MinMaxScaler

# Group keys and the label are not features, so they are left out of scaling.
num_features = df_features.drop(columns=['epoch', 'segment_id', 'target'])

# NOTE(review): the scaler is fitted on every row; if a train/test split
# happens downstream, confirm that this is intended.
scaler = MinMaxScaler()
num_features_scaled = scaler.fit(num_features).transform(num_features)
num_features_scaled.shape  # (epoch, segment) rows x features


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(13170, 70)

In [13]:
# Peek at the first 30 rows and first 3 columns of the scaled matrix.
num_features_scaled[:30, :3]


array([[0.05761334, 0.0033193 , 0.55064641],
       [0.0578455 , 0.0033461 , 0.53107324],
       [0.07803458, 0.00608939, 0.5476109 ],
       [0.08095276, 0.00655335, 0.51909964],
       [0.06978032, 0.00486929, 0.54212062],
       [0.06792794, 0.00461421, 0.53443225],
       [0.07026212, 0.00493677, 0.54261266],
       [0.05507994, 0.0030338 , 0.538782  ],
       [0.06300577, 0.00396973, 0.53284559],
       [0.06191876, 0.00383393, 0.52965563],
       [0.06824278, 0.00465708, 0.52122081],
       [0.06324878, 0.00400041, 0.54289776],
       [0.07417467, 0.00550188, 0.543403  ],
       [0.06797294, 0.00462032, 0.53918446],
       [0.06534363, 0.00426979, 0.51596708],
       [0.08571476, 0.00734702, 0.56811915],
       [0.07158659, 0.00512464, 0.53445059],
       [0.0709821 , 0.00503846, 0.53789423],
       [0.06709531, 0.00450178, 0.53645845],
       [0.0649162 , 0.00421411, 0.52363324],
       [0.07208649, 0.00519646, 0.53046008],
       [0.09087587, 0.00825842, 0.54922236],
       [0.

In [14]:
# Grid dimensions of the feature table (one row per epoch/segment pair).
num_epochs = df_features['epoch'].nunique()
num_segments = df_features['segment_id'].nunique()
num_features = num_features_scaled.shape[1]

# The reshape below is purely positional: it is only correct when every epoch
# holds the same number of segments and the rows are ordered by
# (epoch, segment_id). Verify both instead of assuming them.
segments_per_epoch = df_features.groupby('epoch')['segment_id'].nunique()
if (
    len(df_features) != num_epochs * num_segments
    or not segments_per_epoch.eq(num_segments).all()
):
    raise ValueError(
        "Cannot reshape features: expected every epoch to contain exactly "
        f"{num_segments} segments ({num_epochs} epochs, {len(df_features)} rows)."
    )

row_keys = df_features[['epoch', 'segment_id']]
if not row_keys.sort_values(['epoch', 'segment_id']).index.equals(row_keys.index):
    raise ValueError(
        "Cannot reshape features: rows must be sorted by (epoch, segment_id)."
    )

reshaped_features = num_features_scaled.reshape(num_epochs, num_segments, num_features)
reshaped_features.shape # epochs x segments x features

(439, 30, 70)

In [15]:
# Per-segment labels arranged as one row per epoch, cast to integers.
# An epoch-level label (1 if any segment of the epoch is positive) could be
# derived instead with:
# df_features.groupby('epoch')['target'].sum().gt(0).astype(int).values
segment_labels = df_features['target'].values
target_array = np.reshape(segment_labels, (num_epochs, -1)).astype(int)
target_array.shape

(439, 30)

In [16]:
# Give the labels a trailing axis of length 1 and append them to the feature
# axis, so the label becomes the last entry of every (epoch, segment) vector.
labels_3d = target_array[..., np.newaxis]
array_all_scaled = np.concatenate([reshaped_features, labels_3d], axis=-1)
array_all_scaled.shape

(439, 30, 71)

In [17]:
# Persist the combined features-plus-label array as a .npy file
# (path is relative to the current working directory).
np.save(file='data/feature_extract_reshaped_all.npy', arr=array_all_scaled)

In [18]:
# Total number of NaN entries in the combined array; 0 means none.
np.isnan(array_all_scaled).sum()

0