In [10]:
import pandas as pd
import numpy as np
from source import data_import
from source.calculate_mean_psd import calculate_mean_psd
from source.constants import CHANNELS, FREQUENCY_RANGES

print(f"loaded channel names: {CHANNELS}")

loaded channel names: ['F4-C4', 'F3-C3', 'FT9-FT10', 'FZ-CZ', 'F7-T7', 'FP2-F4', 'T8-P8-1', 'T8-P8-0', 'FP1-F3', 'CZ-PZ']


In [2]:
# load data
df_pp = data_import.load_pyarrow(file_name="preprocessed_seg")
df_pp.shape

/Users/anabroggini/Documents/bootcamp/ai2/source/../data/preprocessed_seg.arrow was loaded.


(2956800, 17)

In [145]:
# df_pp = df_pp.iloc[:2_000_000,:]
# df_pp.shape

## Feature Extraction

In [3]:
# ignore for aggregation
ignore_col = []

# target definition
PRED_INTERVAL = 60
target_colname = 'target'

In [4]:
### aggregate Functions for mean psd:
delta = lambda x: calculate_mean_psd(x, frequency_ranges={'Delta' : FREQUENCY_RANGES['Delta']})[x.name]['Delta']
theta = lambda x: calculate_mean_psd(x, frequency_ranges={'Theta' : FREQUENCY_RANGES['Theta']})[x.name]['Theta']
gamma = lambda x: calculate_mean_psd(x, frequency_ranges={'Gamma': FREQUENCY_RANGES['Gamma']})[x.name]['Gamma']

delta_agg = pd.NamedAgg(column='delta', aggfunc=delta)
theta_agg = pd.NamedAgg(column='theta', aggfunc=theta)
gamma_agg = pd.NamedAgg(column='gamma', aggfunc=gamma)

### aggregate Functions for target:
target_foo = lambda x, pred_interval=PRED_INTERVAL: 0 < x.dt.total_seconds().min() < pred_interval


In [5]:
abs_mean = lambda x: x.apply(abs).mean()
abs_mean_agg = pd.NamedAgg(column='abs_mean', aggfunc=abs_mean)

In [6]:
df_features = df_pp.groupby(['epoch', 'segment_id']).agg(
    {C:[
        # 'mean', 
        'std',
        'var',
        'mean',
        abs_mean_agg,
        delta_agg,
        theta_agg,
        gamma_agg
        ] for C in CHANNELS} | 
    {target_colname: [target_foo]} 
    ) 
df_features.head()

Unnamed: 0_level_0,channel,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F4-C4,F3-C3,F3-C3,F3-C3,...,FP1-F3,FP1-F3,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,CZ-PZ,target
Unnamed: 0_level_1,Unnamed: 1_level_1,std,var,mean,abs_mean,delta,theta,gamma,std,var,mean,...,theta,gamma,std,var,mean,abs_mean,delta,theta,gamma,<lambda>
epoch,segment_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,0,19.215291,369.227396,1.67776,14.539727,39.509423,15.649456,1.143016,21.315638,454.356435,1.053339,...,92.822474,22.441816,16.368013,267.91186,0.692305,13.053429,33.181709,18.606278,0.272641,False
0,1,19.292719,372.208995,-0.391839,15.081409,57.166882,13.481116,1.21843,25.477037,649.079429,0.811962,...,76.80825,33.427544,18.012047,324.43384,-0.487618,14.154003,40.078259,21.722061,0.268936,False
0,2,26.026211,677.363658,1.356797,18.727514,112.603482,22.111449,0.943441,27.729263,768.912015,1.282874,...,176.024973,8.914849,22.813508,520.456131,1.294472,18.522101,78.066994,19.695134,0.256728,False
0,3,26.99949,728.972443,-1.657887,18.8776,148.551723,15.161321,1.178056,29.614908,877.042781,-2.174102,...,75.730101,21.215121,19.942794,397.715029,-0.47604,15.324033,60.340596,18.82107,0.258679,False
0,4,23.273241,541.643731,0.776272,17.412866,77.372989,25.561364,1.423834,23.413506,548.192254,0.848064,...,284.475569,16.303152,18.725786,350.655046,1.382429,15.186356,46.083693,24.182717,0.271254,False


## Flatten Dataframe

In [7]:
# joining column names with agg functions, but leaving target column
df_features.columns = ['_'.join(col).strip() for col in df_features.columns.values if target_colname != col[0]] + [target_colname]
df_features.reset_index(inplace=True)
df_features.tail(10)


Unnamed: 0,epoch,segment_id,F4-C4_std,F4-C4_var,F4-C4_mean,F4-C4_abs_mean,F4-C4_delta,F4-C4_theta,F4-C4_gamma,F3-C3_std,...,FP1-F3_theta,FP1-F3_gamma,CZ-PZ_std,CZ-PZ_var,CZ-PZ_mean,CZ-PZ_abs_mean,CZ-PZ_delta,CZ-PZ_theta,CZ-PZ_gamma,target
2300,76,20,25.313932,640.795171,0.116834,17.794552,95.188019,42.647861,0.355765,33.487294,...,58.566418,0.761174,41.838344,1750.447057,0.325307,32.718365,347.624908,76.242583,0.193304,False
2301,76,21,23.838413,568.269912,2.425347,17.859053,55.532815,19.605176,1.335814,27.707548,...,64.785295,1.748735,33.972575,1154.135831,4.00943,26.103515,147.602594,54.769195,0.214655,False
2302,76,22,25.358219,643.039265,-1.44722,19.145417,55.704507,67.775172,0.196855,33.348169,...,71.951827,1.989523,35.964994,1293.480824,-2.112973,27.632532,164.684577,104.517317,0.127468,False
2303,76,23,31.348098,982.703239,0.478884,23.549697,175.333261,58.683852,0.386264,31.65482,...,98.0916,4.216003,49.747952,2474.858737,3.501682,40.715416,392.662181,111.171316,0.151815,False
2304,76,24,30.475637,928.764437,-0.182505,22.960227,160.385288,45.94834,0.143208,32.198535,...,97.692236,1.425963,49.032972,2404.232318,-3.726843,37.620701,311.714642,140.703379,0.145472,False
2305,76,25,24.887406,619.382963,0.147445,18.675508,84.832606,55.071043,0.125773,34.644845,...,102.395324,1.70631,43.259684,1871.400289,1.75482,34.26328,242.641297,136.565182,0.126398,False
2306,76,26,32.407469,1050.244041,-2.459588,23.532431,140.649221,52.274438,2.182925,33.407916,...,43.88136,1.209324,46.96934,2206.118943,-2.125237,37.305039,312.843153,104.359274,1.094737,False
2307,76,27,46.298131,2143.516964,1.692386,35.203013,120.710201,43.583592,17.760229,34.591939,...,91.690846,10.353716,40.078959,1606.322994,-1.060939,31.0115,324.890178,87.214394,0.698645,False
2308,76,28,91.319748,8339.296375,2.425243,52.042581,518.665885,747.761645,45.119246,47.029354,...,120.607728,11.348335,62.271117,3877.691998,6.711859,43.428942,457.229112,152.98741,19.808283,False
2309,76,29,81.189389,6591.716833,4.083427,56.392925,550.601147,374.764322,25.830989,40.085534,...,149.065111,13.897802,166.90515,27857.329154,-3.216964,91.962766,3789.88849,1453.974714,31.917868,False


In [9]:
#Preparing data to be scaled

num_features= df_features.drop(['epoch','segment_id','target'],axis =1)

from sklearn.preprocessing import StandardScaler, minmax_scale, MinMaxScaler

scaler = MinMaxScaler()
num_features_scaled = scaler.fit_transform(num_features)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [22]:
original_array =np.array(df_features)
target=original_array[:,-1]
target = target[:, np.newaxis]
segseiz_column = original_array[:, 0:2] ## epoch and segment_id
array_all_scaled = np.concatenate((segseiz_column, num_features_scaled, target), axis=1)
array_all_scaled.shape

(2310, 73)

In [24]:
import numpy as np


original_array =np.array(df_features)

# Extract the epoch column
epoch_column = original_array[:, 0]

# Determine the number of epochs (assuming epochs are from 1 to number of segments)
num_epochs = df_features.epoch.unique()[-1]

# Determine the number of segments for each epoch
num_segments = len(df_features.segment_id.unique())  # Assuming there are 30 segments for each epoch

# Initialize an empty 3D array
reshaped_array = np.empty((num_epochs,num_segments, array_all_scaled.shape[1]))

# Reshape the data for each epoch and insert it into the 3D array
for epoch in range(num_epochs):
    start_idx = epoch * num_segments
    end_idx = (epoch + 1) * num_segments
    reshaped_array[epoch,:, :] = array_all_scaled[start_idx:end_idx,:]

# Now, 'reshaped_array' is a 3D NumPy array where each 2D slice (nsegmenst x numb of features) contains data for each epoch.

In [25]:
reshaped_array.shape

(76, 30, 73)

In [None]:

np.save('data/feature_extract_reshaped.npy', reshaped_array)

In [26]:
# from source.data_import import save_pyarrow

# save_pyarrow(df_features, file_name="feature_extracted")

/Users/anabroggini/Documents/bootcamp/ai2/source/../data/feature_extracted.arrow was successfully written.
