In [1]:
import pandas as pd
import numpy as np
from source import data_import
from source.calculate_mean_psd import calculate_mean_psd
from source.constants import CHANNELS, FREQUENCY_RANGES

# Show the channel names imported from source.constants.
print(f"loaded channel names: {CHANNELS}")

loaded channel names: ['F4-C4', 'F3-C3', 'FT9-FT10', 'FZ-CZ', 'F7-T7', 'FP2-F4', 'T8-P8-1', 'T8-P8-0', 'FP1-F3', 'CZ-PZ']


In [2]:
# One base name is used for both the Arrow input file and the NumPy export.
py_arrow_segment_file_name = np_file_name = "segmented_data_300"

# Read the segmented recording and show its dimensions (rows, columns).
df_pp = data_import.load_pyarrow(
    file_name=py_arrow_segment_file_name,
)
df_pp.shape

/home/weasel/reps/ai-seizure-detectives/source/../data/segmented_data_300.arrow was loaded.


(33561600, 17)

In [3]:
# Count missing values per column of the loaded frame.
df_pp.isna().sum()

channel
F7-T7             0
FP1-F3            0
F3-C3             0
FP2-F4            0
F4-C4             0
T8-P8-0           0
FZ-CZ             0
CZ-PZ             0
FT9-FT10          0
T8-P8-1           0
is_seizure        0
before_seizure    0
file              0
seizure_start     0
target            0
epoch             0
segment_id        0
dtype: int64

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df_pp.shape

(33561600, 17)

In [11]:
# Optional: work on a subset while experimenting.
# df_pp = df_pp.iloc[:2_000_000,:]
# df_pp.shape

from scipy.signal import welch

# Welch power spectral density estimate for the whole 'F7-T7' column.
# `f` holds the frequency bins and `Pxx` the matching 1-D density values.
f, Pxx = welch(
    df_pp['F7-T7'],
    fs=256,
    nperseg=256,
    noverlap=128,
    detrend="constant",
    scaling="density",
)

# NOTE(review): the plot below is a time/frequency mesh, but `Pxx` is 1-D
# (see the output), so it would need a spectrogram rather than this result.
# import matplotlib.pyplot as plt
# plt.pcolormesh(df_pp.index, f, 10 * np.log10(Pxx))  # Converting to dB scale
# plt.ylabel('Frequency [Hz]')
# plt.xlabel('Time [sec]')
# plt.colorbar(label='Power Spectral Density (dB)')
# plt.show()

Pxx

array([1.33617031e+02, 6.79883614e+02, 5.60459699e+02, 2.84056740e+02,
       1.56091960e+02, 9.09543689e+01, 5.37764680e+01, 3.57499764e+01,
       2.82162949e+01, 1.80549309e+01, 1.29651234e+01, 1.51794595e+01,
       2.18835322e+01, 1.17059932e+01, 8.59736083e+00, 1.19459915e+01,
       2.11182341e+01, 1.22855652e+01, 9.47673116e+00, 1.06786219e+01,
       1.39763280e+01, 1.16029194e+01, 1.05082293e+01, 1.19290316e+01,
       1.53512919e+01, 1.26771206e+01, 1.16187725e+01, 2.25264684e+01,
       4.57138126e+01, 1.93670206e+01, 1.27333580e+01, 1.41298215e+01,
       1.71990056e+01, 1.44662043e+01, 1.35388270e+01, 1.46519769e+01,
       1.78544737e+01, 1.46552974e+01, 1.26931270e+01, 1.25815882e+01,
       1.31905263e+01, 1.15302345e+01, 9.88827951e+00, 3.90704635e+01,
       1.15096926e+02, 3.13122095e+01, 6.08300294e+00, 5.57003500e+00,
       5.81081207e+00, 3.85894282e+00, 2.71452041e+00, 2.19051921e+00,
       2.04218784e+00, 1.37057212e+00, 9.51742118e-01, 7.70347292e-01,
      

## Feature Extraction

In [8]:
# Target definition: name of the label column and the prediction horizon.
# PRED_INTERVAL is compared against `total_seconds()` when the target is
# aggregated, so it is expressed in seconds.
target_colname = 'target'
PRED_INTERVAL = 60

# Columns to leave out of the aggregation.
ignore_col = ['psd']

In [9]:
### aggregate Functions for mean psd:
def _band_mean_psd(band):
    """Build an aggregation callable for one key of FREQUENCY_RANGES.

    The returned callable receives a grouped Series `x`, passes it to
    `calculate_mean_psd` restricted to `band`, and picks the value stored
    under the series name and the band name in the result.
    """
    def _aggregate(x):
        # Look the range up at call time, exactly as a lambda body would.
        band_range = {band: FREQUENCY_RANGES[band]}
        return calculate_mean_psd(x, frequency_ranges=band_range)[x.name][band]

    return _aggregate


delta = _band_mean_psd('Delta')
theta = _band_mean_psd('Theta')
gamma = _band_mean_psd('Gamma')

# NamedAgg is used here as a (label, function) pair: inside a list of
# aggregations the first field becomes the name of the output column.
delta_agg = pd.NamedAgg(column='delta', aggfunc=delta)
theta_agg = pd.NamedAgg(column='theta', aggfunc=theta)
gamma_agg = pd.NamedAgg(column='gamma', aggfunc=gamma)

### aggregate Functions for target:
# True when the smallest value of the group, in seconds, lies strictly
# between 0 and `pred_interval`.
target_foo = lambda x, pred_interval=PRED_INTERVAL: (
    0 < x.dt.total_seconds().min() < pred_interval
)


In [10]:
def abs_mean(x):
    """Return the mean of the absolute values of a Series.

    Uses the vectorized `Series.abs()` instead of `Series.apply(abs)`, which
    calls Python's `abs` once per element and is very slow on long columns.

    Parameters
    ----------
    x : pandas.Series
        Numeric values of one group.

    Returns
    -------
    float
        Mean absolute value (NaN for an empty Series).
    """
    return x.abs().mean()


# (label, function) pair; the label becomes the output column name when the
# pair is placed in a list of aggregations.
abs_mean_agg = pd.NamedAgg(column='abs_mean', aggfunc=abs_mean)

In [None]:
# nice to add as further features:
    # maximum
    # 80% of the maximum
    # number of hits above 80% of the maximum


In [11]:
# Statistics computed for every channel: built-in reducers are given by name,
# custom ones as (label, function) pairs.
channel_aggs = {
    channel: [
        'std',
        'var',
        'mean',
        abs_mean_agg,
        delta_agg,
        theta_agg,
        gamma_agg,
    ]
    for channel in CHANNELS
}
# The label column gets its own reducer.
target_aggs = {target_colname: [target_foo]}

# One output row per (epoch, segment_id) pair.
df_features = (
    df_pp
    .groupby(['epoch', 'segment_id'])
    .agg({**channel_aggs, **target_aggs})
)
df_features.head()

Unnamed: 0_level_0,channel,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F3-C3,F3-C3,F3-C3,...,FP1-F3,FP1-F3,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,target
Unnamed: 0_level_1,Unnamed: 1_level_1,std,var,mean,abs_mean,delta,theta,gamma,std,var,mean,...,theta,gamma,std,var,mean,abs_mean,delta,theta,gamma,<lambda>
epoch,segment_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,0,28.591644,817.482095,5.700549,23.484432,90.75208,22.017557,1.358653,26.327333,693.128467,10.578449,...,131.355683,24.845281,15.917382,253.363046,-2.052808,12.764042,16.860269,9.121391,0.392502,False
0,1,15.703171,246.589589,1.385836,12.17033,38.092166,5.071763,2.198591,17.862862,319.081827,4.810745,...,18.431833,26.286171,19.462216,378.777839,-2.9884,15.975275,77.629197,10.970848,0.323111,False
0,2,15.430893,238.112444,-0.270147,12.554945,37.880881,17.995183,1.250724,17.298442,299.236084,-3.39591,...,12.84502,19.786654,10.90992,119.026364,12.789988,14.157509,12.582095,10.424822,0.183952,False
0,3,19.565103,382.793261,-0.79823,14.752747,17.753778,14.348884,0.886124,23.817424,567.269685,-7.261905,...,11.210357,28.438759,16.653185,277.328577,-3.002137,13.487485,17.957038,20.901462,0.236749,False
0,4,11.894778,141.485738,2.368742,9.62149,11.566655,5.349397,0.662431,14.410295,207.65659,0.511294,...,14.802946,15.672533,11.435759,130.776582,-1.312576,8.84768,15.21939,8.397903,0.372599,False


## Flatten Dataframe

In [12]:
# Join the two column levels into flat "<channel>_<aggregation>" names, but
# leave the target column with its plain name.
#
# Each column is renamed in place, so the target keeps its position wherever
# it sits (filtering it out and appending the name at the end would mislabel
# columns if it were not last).
#
# The MultiIndex guard makes this cell safe to re-run: once the columns are
# flat strings, joining them again would split the names into characters and
# a second reset_index would add a stray "index" column.
if isinstance(df_features.columns, pd.MultiIndex):
    df_features.columns = [
        target_colname if col[0] == target_colname else '_'.join(col).strip()
        for col in df_features.columns.values
    ]
    # Turn the (epoch, segment_id) group keys back into regular columns.
    df_features = df_features.reset_index()
df_features.head(10)


Unnamed: 0,epoch,segment_id,F4-C4_std,F4-C4_var,F4-C4_mean,F4-C4_abs_mean,F4-C4_delta,F4-C4_theta,F4-C4_gamma,F3-C3_std,...,FP1-F3_theta,FP1-F3_gamma,CZ-PZ_std,CZ-PZ_var,CZ-PZ_mean,CZ-PZ_abs_mean,CZ-PZ_delta,CZ-PZ_theta,CZ-PZ_gamma,target
0,0,0,28.591644,817.482095,5.700549,23.484432,90.75208,22.017557,1.358653,26.327333,...,131.355683,24.845281,15.917382,253.363046,-2.052808,12.764042,16.860269,9.121391,0.392502,False
1,0,1,15.703171,246.589589,1.385836,12.17033,38.092166,5.071763,2.198591,17.862862,...,18.431833,26.286171,19.462216,378.777839,-2.9884,15.975275,77.629197,10.970848,0.323111,False
2,0,2,15.430893,238.112444,-0.270147,12.554945,37.880881,17.995183,1.250724,17.298442,...,12.84502,19.786654,10.90992,119.026364,12.789988,14.157509,12.582095,10.424822,0.183952,False
3,0,3,19.565103,382.793261,-0.79823,14.752747,17.753778,14.348884,0.886124,23.817424,...,11.210357,28.438759,16.653185,277.328577,-3.002137,13.487485,17.957038,20.901462,0.236749,False
4,0,4,11.894778,141.485738,2.368742,9.62149,11.566655,5.349397,0.662431,14.410295,...,14.802946,15.672533,11.435759,130.776582,-1.312576,8.84768,15.21939,8.397903,0.372599,False
5,0,5,20.45625,418.458171,-1.413309,15.947802,52.580755,25.085792,2.709056,32.195808,...,544.425437,45.142188,21.627095,467.731229,3.047924,17.338217,25.949766,29.852899,0.244165,False
6,0,6,26.688526,712.277423,-3.447802,21.144689,164.351623,13.122467,0.693333,27.911897,...,18.646527,21.81543,17.049196,290.675092,-2.356532,13.824786,65.058987,15.26704,0.230536,False
7,0,7,15.269193,233.148259,-1.939866,12.332112,17.984758,6.736646,1.220931,21.066595,...,22.841471,34.591868,14.515155,210.689727,-4.021673,11.527778,13.573702,13.476781,0.236498,False
8,0,8,14.091817,198.579312,2.806777,11.536935,14.936508,12.702729,1.284247,23.404054,...,27.785088,47.398717,17.921096,321.165669,1.355311,14.064408,49.629803,13.335399,0.331788,False
9,0,9,16.63245,276.638378,2.052808,14.285714,31.961215,5.704794,0.734911,20.025988,...,13.757021,11.188177,17.46845,305.146732,-0.436508,13.904151,37.930194,30.099507,0.336281,False


In [13]:
df_features.isna().sum()

epoch             0
segment_id        0
F4-C4_std         0
F4-C4_var         0
F4-C4_mean        0
                 ..
CZ-PZ_abs_mean    0
CZ-PZ_delta       0
CZ-PZ_theta       0
CZ-PZ_gamma       0
target            0
Length: 73, dtype: int64

In [14]:
from sklearn.preprocessing import StandardScaler, minmax_scale, MinMaxScaler

# Preparing data to be scaled: keep only the feature columns by dropping the
# two identifier columns and the label.
num_features = df_features.drop(columns=['epoch', 'segment_id', 'target'])

# Min-max scale each feature column.
# NOTE(review): the scaler is fitted on every row; if the data is split into
# train and test sets later, fit on the training rows only to avoid leakage.
scaler = MinMaxScaler()
num_features_scaled = scaler.fit_transform(num_features)
num_features_scaled.shape # rows (one per epoch/segment pair) x features


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(131100, 70)

In [15]:
# Peek at the first 30 rows and first 3 columns of the scaled matrix.
num_features_scaled[:30, :3]


array([[3.80959219e-02, 1.45129926e-03, 5.35631024e-01],
       [2.09231337e-02, 4.37777526e-04, 5.28046873e-01],
       [2.05603455e-02, 4.22727809e-04, 5.25136083e-01],
       [2.60688279e-02, 6.79583787e-04, 5.24207849e-01],
       [1.58487748e-02, 2.51183664e-04, 5.29774568e-01],
       [2.72562051e-02, 7.42900718e-04, 5.23126699e-01],
       [3.55601801e-02, 1.26452641e-03, 5.19550585e-01],
       [2.03448949e-02, 4.13914749e-04, 5.22201148e-01],
       [1.87761421e-02, 3.52543513e-04, 5.30544519e-01],
       [2.21613176e-02, 4.91123998e-04, 5.29219238e-01],
       [5.80504234e-02, 3.36985166e-03, 5.20975128e-01],
       [2.11349044e-02, 4.46684182e-04, 5.16028126e-01],
       [2.53588901e-02, 6.43073307e-04, 5.41074336e-01],
       [3.03856789e-02, 9.23289480e-04, 5.24972435e-01],
       [1.97301637e-02, 3.89279358e-04, 5.36899968e-01],
       [2.63668893e-02, 6.95212851e-04, 5.25844330e-01],
       [2.41270289e-02, 5.82113522e-04, 5.20371508e-01],
       [3.52576758e-02, 1.24310

In [16]:
# Determine the number of epochs
num_epochs = df_features.epoch.nunique()
# Number of distinct segment ids; the reshape below requires every epoch to
# contain exactly this many rows.
num_segments = df_features.segment_id.nunique()
num_features = num_features_scaled.shape[1]

# A plain reshape only lines up with (epoch, segment) if the rows are grouped
# by epoch and every epoch has the same number of segments. Check both
# explicitly instead of risking silently misaligned data.
rows_per_epoch = df_features.groupby('epoch').size()
if not (rows_per_epoch == num_segments).all():
    raise ValueError(
        "Cannot reshape: epochs do not all contain "
        f"{num_segments} segments (found between {rows_per_epoch.min()} "
        f"and {rows_per_epoch.max()})."
    )
if not df_features.epoch.is_monotonic_increasing:
    raise ValueError("Cannot reshape: rows are not ordered by epoch.")
if num_features_scaled.shape[0] != len(df_features):
    raise ValueError(
        "Cannot reshape: scaled matrix and df_features differ in row count."
    )

reshaped_features = num_features_scaled.reshape(num_epochs, num_segments, num_features)
reshaped_features.shape # epochs x segments x features

(437, 300, 70)

In [17]:
# Alternative kept for reference (one label per epoch instead of per segment):
# target_array = df_features.groupby('epoch')['target'].sum().gt(0).astype(int).values

# One label per segment, arranged as (epochs, segments) and cast to integers.
target_array = (
    df_features['target']
    .to_numpy()
    .reshape(num_epochs, -1)
    .astype(int)
)
target_array.shape

(437, 300)

In [18]:
# Give the labels a trailing axis so they can be appended as one extra
# column after the features along the last dimension.
target_channel = np.expand_dims(target_array, axis=2)
array_all_scaled = np.concatenate([reshaped_features, target_channel], axis=2)
array_all_scaled.shape

(437, 300, 71)

In [25]:
# Total number of NaN entries in the combined feature/target array.
np.isnan(array_all_scaled).sum()

0

In [19]:
# Write the combined array to data/<np_file_name>.npy (path is relative to
# the current working directory).
np.save(f"data/{np_file_name}.npy", array_all_scaled)

In [23]:
# Count NaN entries in array_all_scaled.
np.isnan(array_all_scaled).sum()

0