In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, minmax_scale

from source import data_import
from source.calculate_mean_psd import calculate_mean_psd
from source.constants import CHANNELS, FREQUENCY_RANGES

# Echo the channel list imported from source.constants so the run records which channels are used.
print(f"loaded channel names: {CHANNELS}")

loaded channel names: ['F4-C4', 'F3-C3', 'FT9-FT10', 'FZ-CZ', 'F7-T7', 'FP2-F4', 'T8-P8-1', 'T8-P8-0', 'FP1-F3', 'CZ-PZ']


In [2]:
# Load the preprocessed, segmented table through the project's loader
# and show its (rows, columns) size as the cell output.
preprocessed_name = "preprocessed_seg"
df_pp = data_import.load_pyarrow(file_name=preprocessed_name)
df_pp.shape

/home/weasel/reps/ai-seizure-detectives/source/../data/preprocessed_seg.arrow was loaded.


(16972800, 17)

In [3]:
# Optional (disabled): uncomment to keep only the first 2,000,000 rows of df_pp.
# df_pp = df_pp.iloc[:2_000_000,:]
# df_pp.shape

## Feature Extraction

In [4]:
# ignore for aggregation
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
ignore_col = []

# target definition
# Upper bound, in seconds, of the window tested by `target_foo`
# (it is compared against `.dt.total_seconds()` of the target column).
PRED_INTERVAL = 60
# Name of the label column: aggregated with `target_foo` and kept without a
# suffix when the aggregated column header is flattened.
target_colname = 'target'

In [5]:
### Aggregation callables for the mean PSD of individual frequency bands
def _band_power(band):
    """Create an aggregation callable for a single frequency band.

    The returned callable takes a Series ``x``, calls ``calculate_mean_psd``
    with ``frequency_ranges`` limited to ``band`` and returns the entry found
    under ``x.name`` and then under ``band`` in that result.
    """
    def _aggregate(x):
        # The band limits are looked up on every call, as in a plain lambda.
        psd = calculate_mean_psd(x, frequency_ranges={band: FREQUENCY_RANGES[band]})
        return psd[x.name][band]

    return _aggregate


delta = _band_power('Delta')
theta = _band_power('Theta')
gamma = _band_power('Gamma')

# Pair each callable with the label it gets in the aggregated output.
delta_agg = pd.NamedAgg(column='delta', aggfunc=delta)
theta_agg = pd.NamedAgg(column='theta', aggfunc=theta)
gamma_agg = pd.NamedAgg(column='gamma', aggfunc=gamma)

### Aggregation callable for the target column
# True when the smallest value of the group, expressed in seconds, lies strictly
# between 0 and `pred_interval`. Kept as a lambda so the label it produces in
# the aggregated header stays '<lambda>'.
target_foo = lambda x, pred_interval=PRED_INTERVAL: (
    0 < x.dt.total_seconds().min() < pred_interval
)


In [6]:
def abs_mean(x):
    """Return the mean of the absolute values of ``x``.

    Uses the vectorised ``abs()`` method instead of applying Python's ``abs``
    to every element, which gives the same result without a per-element
    Python call.

    Parameters
    ----------
    x : pandas.Series
        Numeric values of one column within one group.

    Returns
    -------
    float
        Mean of ``|x|`` (NaN for an empty input).
    """
    return x.abs().mean()


# Pair the callable with the label 'abs_mean' it gets in the aggregated output.
abs_mean_agg = pd.NamedAgg(column='abs_mean', aggfunc=abs_mean)

In [7]:
# Aggregations computed for every EEG channel within each (epoch, segment_id) group.
channel_aggs = [
    'std',
    'var',
    'mean',
    abs_mean_agg,
    delta_agg,
    theta_agg,
    gamma_agg,
]

# One entry per channel, followed by the label column with its own aggregation.
agg_spec = {channel: list(channel_aggs) for channel in CHANNELS}
agg_spec[target_colname] = [target_foo]

# The result has a two-level column header: (column, aggregation label).
segments = df_pp.groupby(['epoch', 'segment_id'])
df_features = segments.agg(agg_spec)
df_features.head()

Unnamed: 0_level_0,channel,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F3-C3,F3-C3,F3-C3,...,FP1-F3,FP1-F3,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,target
Unnamed: 0_level_1,Unnamed: 1_level_1,std,var,mean,abs_mean,delta,theta,gamma,std,var,mean,...,theta,gamma,std,var,mean,abs_mean,delta,theta,gamma,<lambda>
epoch,segment_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,0,19.215291,369.227395,1.67776,14.539727,39.509423,15.649456,1.143016,21.315638,454.356435,1.053339,...,92.822474,22.441816,16.368013,267.91186,0.692305,13.053429,33.181709,18.606278,0.272641,False
0,1,19.292719,372.208995,-0.391839,15.081409,57.166882,13.481116,1.21843,25.477037,649.079429,0.811962,...,76.80825,33.427544,18.012047,324.43384,-0.487618,14.154003,40.078259,21.722061,0.268936,False
0,2,26.026211,677.363658,1.356797,18.727514,112.603482,22.111449,0.943441,27.729263,768.912015,1.282874,...,176.024973,8.914849,22.813508,520.456131,1.294472,18.522101,78.066994,19.695134,0.256728,False
0,3,26.99949,728.972443,-1.657887,18.8776,148.551723,15.161321,1.178056,29.614908,877.042781,-2.174102,...,75.730101,21.215121,19.942794,397.715029,-0.47604,15.324033,60.340596,18.82107,0.258679,False
0,4,23.273241,541.643731,0.776272,17.412866,77.372989,25.561364,1.423834,23.413506,548.192254,0.848064,...,284.475569,16.303152,18.725786,350.655046,1.382429,15.186356,46.083693,24.182717,0.271254,False


## Flatten Dataframe

In [8]:
# Flatten the two-level (column, aggregation) header into "<column>_<aggregation>"
# names. The target column keeps its plain name and is renamed in place, so the
# result does not depend on the target being the last column.
# The MultiIndex guard makes the cell safe to re-run: once flattened, the
# columns are plain strings and the block is skipped.
if isinstance(df_features.columns, pd.MultiIndex):
    df_features.columns = [
        target_colname if col[0] == target_colname else '_'.join(col).strip()
        for col in df_features.columns.values
    ]
    # Turn the (epoch, segment_id) group keys back into ordinary columns.
    df_features = df_features.reset_index()
df_features.tail(10)


Unnamed: 0,epoch,segment_id,F4-C4_std,F4-C4_var,F4-C4_mean,F4-C4_abs_mean,F4-C4_delta,F4-C4_theta,F4-C4_gamma,F3-C3_std,...,FP1-F3_theta,FP1-F3_gamma,CZ-PZ_std,CZ-PZ_var,CZ-PZ_mean,CZ-PZ_abs_mean,CZ-PZ_delta,CZ-PZ_theta,CZ-PZ_gamma,target
13250,441,20,56.319463,3171.881897,-1.486205,43.694133,715.362477,80.32042,0.09537,52.077373,...,165.369068,0.136447,57.400473,3294.814344,0.897942,45.527385,625.681009,128.672025,0.141898,False
13251,441,21,49.329472,2433.39679,0.601365,38.213047,340.578715,81.399241,0.1273,58.731758,...,228.128294,0.19306,66.623744,4438.723268,-0.832903,52.210783,855.368406,133.88506,0.111974,False
13252,441,22,60.999838,3720.980268,3.29883,47.512114,717.760711,94.13315,0.078255,58.943169,...,185.471813,0.133344,59.187737,3503.188215,7.024759,45.714461,769.972346,123.193307,0.118564,False
13253,441,23,73.927097,5465.215724,-1.677242,58.312686,906.963237,148.016697,0.113451,69.15753,...,301.442328,0.14325,65.6457,4309.357902,-1.29598,53.853938,886.329485,139.457708,0.118907,False
13254,441,24,56.004131,3136.462676,-1.082853,42.174246,626.99705,66.044131,0.104605,65.358588,...,151.41255,0.159559,67.584048,4567.603599,-7.436802,54.204462,890.685696,96.332307,0.152464,False
13255,441,25,90.518591,8193.615287,1.872374,72.00398,1638.956241,301.855061,0.115082,85.191436,...,714.368608,0.132972,97.85686,9575.965072,9.140534,70.493062,2238.314139,246.211458,0.134664,False
13256,441,26,81.981497,6720.965844,-0.657955,66.855945,1436.00335,287.742696,0.107075,92.049722,...,325.353849,0.122267,69.699999,4858.089908,-5.527779,57.225355,812.955111,293.656952,0.129952,False
13257,441,27,81.985539,6721.628678,-0.794683,64.198591,1204.752371,552.137925,0.092687,85.294166,...,667.285121,0.149957,94.201093,8873.845993,3.611168,66.3117,1795.157965,366.792211,0.095242,False
13258,441,28,52.093886,2713.772983,3.839184,39.699645,536.57821,105.914862,0.115153,62.138331,...,158.078978,0.156917,63.836885,4075.147868,-1.288541,51.386112,834.946251,182.762834,0.116466,False
13259,441,29,76.07636,5787.612587,-3.793365,57.063751,1127.24339,77.150492,0.120792,82.359319,...,151.956576,0.138377,49.540358,2454.247031,1.636609,39.293956,409.801775,114.829371,0.119493,False


In [32]:
# Preparing data to be scaled: keep only the feature columns by dropping the
# group keys and the label column.
num_features = df_features.drop(columns=['epoch', 'segment_id', target_colname])

# Min-max scale every feature column. The scaler is fitted on all rows.
# NOTE(review): if a train/test split is made later, fitting on the full table
# lets test statistics influence the scaling — confirm this is intended.
scaler = MinMaxScaler()
num_features_scaled = scaler.fit_transform(num_features)
num_features_scaled.shape  # (epoch, segment) rows x features

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(13260, 70)

In [58]:
# Determine the number of epochs
num_epochs = df_features['epoch'].nunique()
# Determine the number of distinct segment ids; the reshape below requires
# every epoch to contain exactly this many rows.
num_segments = df_features['segment_id'].nunique()
# NOTE: this rebinds `num_features` (the unscaled feature frame built in the
# scaling cell) to an integer column count.
num_features = num_features_scaled.shape[1]

# The reshape reads rows in their current order, so fail loudly if the table
# is not a complete, sorted (epoch, segment_id) grid instead of silently
# mixing segments of different epochs.
segments_per_epoch = df_features.groupby('epoch').size()
if not segments_per_epoch.eq(num_segments).all():
    raise ValueError(
        f"every epoch must contain exactly {num_segments} segments "
        "to reshape into (epochs, segments, features)"
    )
group_keys = pd.MultiIndex.from_frame(df_features[['epoch', 'segment_id']])
if not group_keys.is_monotonic_increasing:
    raise ValueError(
        "df_features must be sorted by (epoch, segment_id) before reshaping"
    )

reshaped_features = num_features_scaled.reshape(num_epochs, num_segments, num_features)
reshaped_features.shape

(442, 30, 70)

In [61]:
# Alternative (disabled): a single label per epoch, 1 if any of its segments is positive.
# target_array = df_features.groupby('epoch')['target'].sum().gt(0).astype(int).values

# One label per segment, laid out as (epochs, segments per epoch).
segment_labels = df_features['target'].values
target_array = segment_labels.reshape(num_epochs, -1)
target_array.shape

(442, 30)

In [72]:
# Give the labels a trailing axis of length 1 and append them after the
# scaled features along the last axis.
target_channel = np.expand_dims(target_array, axis=-1)
array_all_scaled = np.concatenate([reshaped_features, target_channel], axis=-1)
array_all_scaled.shape

(442, 30, 71)

In [None]:
# Persist the combined feature/label array in NumPy's .npy format.
# The path is relative to the notebook's working directory.
output_file = 'data/feature_extract_reshaped.npy'
np.save(output_file, array_all_scaled)