# Make Dataset

## Imports

In [1]:
from PlantReactivityAnalysis.data.wav_data_reader import WavDataReader
from PlantReactivityAnalysis.data.signal_dataset import SignalDataset
from PlantReactivityAnalysis.features.wav_feature_extractor import WavFeatureExtractor
from PlantReactivityAnalysis.features.features_dataset import FeaturesDataset
import PlantReactivityAnalysis.data.preparation_eurythmy_data as ped

In [2]:
import pandas as pd
# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None

## Reader

In [3]:
# Relative path of the folder that holds the raw WAV files.
# Alternative folder used for testing (swap the assignment below to use it):
# wav_folder = r"..\data\interim\testing"
wav_folder = r"..\data\raw\wav_files"

In [3]:
# Point the reader at the WAV folder, using a sample_rate of 10000
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# The reader returns the signals together with the key of each file
signals, ids = reader.get_values_and_keys()

# Derive the measurement-label table from those file keys
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [4]:
# Preview the first rows of the measurement labels
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [5]:
# Bundle the signals with their measurement labels in a SignalDataset
signal_dataset = SignalDataset(signals=signals, features=meas_df)

In [6]:
# Size of the feature table (the recorded run shows 625 rows, matching the 625 WAV files read)
signal_dataset.features.shape

(625, 6)

In [7]:
# Preview the feature table stored in the dataset
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [8]:
# Persist the raw dataset, then read it straight back from disk
raw_signal_dataset_path = r"..\data\raw\raw_signal_dataset.pkl"
signal_dataset.save(raw_signal_dataset_path)
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [9]:
# Standardize the signals using the "zscore" method
# The return value is not used and the same `signal_dataset` object is saved afterwards,
# so the standardization presumably happens in place — TODO confirm.
signal_dataset.standardize_signals("zscore")

In [10]:
# Feature table shape after standardization (recorded run: still (625, 6))
signal_dataset.features.shape

(625, 6)

In [4]:
# Save the standardized dataset
# NOTE(review): this cell needs `signal_dataset` from the cells above. The recorded NameError
# comes from an execution in which that name did not exist (the execution count restarts here);
# the path assignment on the first line runs before the failing call, so the path is still defined.
norm_signal_dataset_path= r"..\data\interim\norm_signal_dataset.pkl"
signal_dataset.save(norm_signal_dataset_path)

NameError: name 'signal_dataset' is not defined

### Segmented by Letters (raw)

In [12]:
# Reload the raw (unstandardized) dataset saved earlier, replacing the standardized one in memory
signal_dataset= SignalDataset.load(raw_signal_dataset_path)

In [13]:
# Build the letter dictionary for the measurement ids present in the raw dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [14]:
# Segment the dataset by letters
# In the recorded run this grows the feature table from 625 to 8390 rows and
# adds the 'eurythmy_letter' column.
signal_dataset.segment_signals_by_dict('id_measurement', letter_dictionary, 'eurythmy_letter')

In [15]:
# Feature table shape after segmenting by letters
signal_dataset.features.shape

(8390, 7)

In [16]:
# Preview the segmented feature table, including the new 'eurythmy_letter' column
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [5]:
# Output file for the raw dataset segmented by letters
raw_letters_signal_dataset_path = r"..\data\raw\raw_letters_signal_dataset.pkl"

In [18]:
# Save the letter-segmented raw dataset
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [19]:
# Load the normalized (standardized) dataset saved earlier
signal_dataset= SignalDataset.load(norm_signal_dataset_path)

In [20]:
# Get the letter segments for the measurement ids present in the normalized dataset
letter_dictionary= ped.return_letter_dictionary(indexes= signal_dataset.features['id_measurement'].tolist())

In [21]:
# Segment the normalized dataset by letters, labelling each segment in 'eurythmy_letter'
signal_dataset.segment_signals_by_dict('id_measurement', letter_dictionary, 'eurythmy_letter')

In [6]:
# Output file for the normalized dataset segmented by letters
norm_letters_signal_dataset_path = r"..\data\interim\norm_letters_signal_dataset.pkl"

In [23]:
# Save the letter-segmented normalized dataset
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [24]:
# Reload the raw dataset so the 1 s segmentation starts from unsegmented signals
signal_dataset= SignalDataset.load(raw_signal_dataset_path)

In [25]:
# Segment the signals into segments of duration 1 (seconds, per the section title)
# The 'initial_second' column seen in the recorded output presumably comes from this call — TODO confirm.
signal_dataset.segment_signals_by_duration(segment_duration= 1)

In [26]:
# Add eurythmy letter data to the features
# NOTE(review): `df` is not used again in the visible cells; the following cells inspect and
# save `signal_dataset.features`, so this appears to rely on add_meas_letters modifying the
# frame in place — TODO confirm, otherwise the result must be assigned back to the dataset.
df= ped.add_meas_letters(signal_dataset.features)

In [27]:
# Feature table shape after the 1 s segmentation and letter labelling
signal_dataset.features.shape

(148682, 8)

In [28]:
# Inspect rows 15-19; in the recorded output this is where the first letter label (A1) starts
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [14]:
# Write the raw 1 s segmented dataset to disk
raw_1s_signal_dataset_path = r"..\data\raw\raw_1s_signal_dataset.pkl"
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [30]:
# Load the normalized (standardized) dataset saved earlier
signal_dataset= SignalDataset.load(norm_signal_dataset_path)

In [31]:
# Segment the normalized signals into segments of duration 1 (seconds, per the section title)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [32]:
# Add eurythmy letter data to the features
# NOTE(review): `df` is not used again in the visible cells; the next cell saves
# `signal_dataset`, so this appears to rely on add_meas_letters modifying the frame in place —
# TODO confirm, otherwise the saved dataset will lack the letter labels.
df= ped.add_meas_letters(signal_dataset.features)

In [33]:
# Write the normalized 1 s segmented dataset to disk
norm_1s_signal_dataset_path = r"..\data\interim\norm_1s_signal_dataset.pkl"
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [7]:
# Windowing parameters; they are also embedded in the output file names further down
# (units are not stated here — presumably seconds, TODO confirm)
window_size = 2
hop_length = 1

# Extractor with the lib_mfccs, pyau_mfccs, temporal and statistical options switched on;
# sample_rate matches the value passed to the WAV reader
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [8]:
# Load the normalized, letter-segmented signal dataset
signal_dataset= SignalDataset.load(norm_letters_signal_dataset_path)

In [9]:
%%time

# Extract features and store in dataset
# NOTE: slow step — the recorded run reports a wall time of about 3h 21min for this cell.
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 4h 35min 43s
Wall time: 3h 21min


In [10]:
# Shape of the extracted feature table (recorded run: 8390 rows, 187 columns)
feat_dataset.features.shape

(8390, 187)

In [11]:
# Preview the extracted features (all columns are shown because display.max_columns is None)
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,lib_mfcc_4_avg,lib_mfcc_5_avg,lib_mfcc_6_avg,lib_mfcc_7_avg,lib_mfcc_8_avg,lib_mfcc_9_avg,lib_mfcc_10_avg,lib_mfcc_11_avg,lib_mfcc_12_avg,lib_mfcc_13_avg,lib_mfcc_1_std,lib_mfcc_2_std,lib_mfcc_3_std,lib_mfcc_4_std,lib_mfcc_5_std,lib_mfcc_6_std,lib_mfcc_7_std,lib_mfcc_8_std,lib_mfcc_9_std,lib_mfcc_10_std,lib_mfcc_11_std,lib_mfcc_12_std,lib_mfcc_13_std,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,mfcc_13_mean,chroma_1_mean,chroma_2_mean,chroma_3_mean,chroma_4_mean,chroma_5_mean,chroma_6_mean,chroma_7_mean,chroma_8_mean,chroma_9_mean,chroma_10_mean,chroma_11_mean,chroma_12_mean,chroma_std_mean,delta zcr_mean,delta energy_mean,delta energy_entropy_mean,delta spectral_centroid_mean,delta spectral_spread_mean,delta spectral_entropy_mean,delta spectral_flux_mean,delta spectral_rolloff_mean,delta mfcc_1_mean,delta mfcc_2_mean,delta mfcc_3_mean,delta mfcc_4_mean,delta mfcc_5_mean,delta mfcc_6_mean,delta mfcc_7_mean,delta mfcc_8_mean,delta mfcc_9_mean,delta mfcc_10_mean,delta mfcc_11_mean,delta mfcc_12_mean,delta mfcc_13_mean,delta chroma_1_mean,delta chroma_2_mean,delta chroma_3_mean,delta chroma_4_mean,delta chroma_5_mean,delta chroma_6_mean,delta chroma_7_mean,delta chroma_8_mean,delta chroma_9_mean,delta chroma_10_mean,delta chroma_11_mean,delta chroma_12_mean,delta 
chroma_std_mean,zcr_std,energy_std,energy_entropy_std,spectral_centroid_std,spectral_spread_std,spectral_entropy_std,spectral_flux_std,spectral_rolloff_std,mfcc_1_std,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,mfcc_10_std,mfcc_11_std,mfcc_12_std,mfcc_13_std,chroma_1_std,chroma_2_std,chroma_3_std,chroma_4_std,chroma_5_std,chroma_6_std,chroma_7_std,chroma_8_std,chroma_9_std,chroma_10_std,chroma_11_std,chroma_12_std,chroma_std_std,delta zcr_std,delta energy_std,delta energy_entropy_std,delta spectral_centroid_std,delta spectral_spread_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_rolloff_std,delta mfcc_1_std,delta mfcc_2_std,delta mfcc_3_std,delta mfcc_4_std,delta mfcc_5_std,delta mfcc_6_std,delta mfcc_7_std,delta mfcc_8_std,delta mfcc_9_std,delta mfcc_10_std,delta mfcc_11_std,delta mfcc_12_std,delta mfcc_13_std,delta chroma_1_std,delta chroma_2_std,delta chroma_3_std,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,zero_crossing_rate,root_mean_square_energy,slope_sign_changes_ratio,duration_seconds,flatness_ratio_10000,flatness_ratio_5000,flatness_ratio_1000,flatness_ratio_500,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-16.96806,90.087669,26.899481,24.194113,17.151419,14.240753,11.290216,9.826055,8.650502,8.07935,7.286806,6.587972,6.071565,66.670685,0.680445,0.709683,0.69815,0.689253,0.71593,0.877229,1.018075,0.871036,0.662058,0.664061,0.795815,0.718052,0.000128,0.167745,3.262508,0.037384,0.088853,0.014813,0.043979,0.001289,-58.572811,1.317898,0.15585,0.24343,0.106877,0.120635,0.064013,0.061221,0.025595,0.036339,0.02877,0.039881,0.033821,0.000156,2.4e-05,0.004477,0.000141,0.191084,8.6e-05,0.01459,1.6e-05,5e-05,0.000132,0.000731,3.8e-05,0.055679,0.0,-0.000887,-0.000271,0.000553,0.001472,4.4e-05,0.000276,2.409338e-20,0.583388,0.018914,0.002263,0.003191,0.001899,0.002272,0.001184,0.001858,0.001786,0.002242,0.002507,0.001986,0.000766,2.632583e-07,4.109202e-08,7e-06,4.167794e-08,-1.1e-05,1.225482e-07,1e-05,2.259808e-08,1.875885e-07,5.319431e-07,3.634603e-06,8.780187e-08,-4e-06,0.000337,0.124951,0.106644,0.035332,0.072924,0.035863,0.074596,0.003923,32.318233,1.064469,0.201958,0.206716,0.134111,0.120819,0.115699,0.100873,0.089252,0.133393,0.109719,0.100023,0.086061,0.000451,7.5e-05,0.012839,0.0004,0.023806,0.000234,0.039621,4.3e-05,0.000146,0.000364,0.002162,0.000118,0.003631,0.00045,0.034493,0.125547,0.035283,0.083237,0.045532,0.08538,0.005764,41.23504,1.3685,0.279319,0.270835,0.183203,0.161093,0.161593,0.129834,0.131561,0.189406,0.160592,0.139598,0.117087,0.000626,0.000107,0.017821,0.000579,0.032843,0.000317,0.055035,5.9e-05,0.000206,0.00051,0.002974,0.000163,0.005356,3.3e-05,1.021693,0.0032,9.0,0.0,0.063533,0.473978,0.7487,0.998622,2e-05,59933.361624,0.766171,0.456839,0.675898,1.182676,-0.391712,-0.974329,1.553715
1,1,1,2023-04-29,salad,1,1,G1,-19.277445,89.521439,26.375952,23.633984,16.586861,13.726748,10.920086,9.534808,8.284781,7.517893,6.723184,6.13703,5.597915,66.973846,0.263484,0.263755,0.262599,0.254363,0.249903,0.240857,0.237597,0.245338,0.266747,0.273929,0.268865,0.264854,0.0001,0.119247,3.231678,0.042233,0.092726,0.013197,0.089567,0.000989,-61.884994,1.222501,0.160577,0.216129,0.106043,0.111016,0.070964,0.062873,0.007619,0.031325,0.026238,0.025751,0.030118,0.000181,1.5e-05,0.00365,0.000108,0.192713,0.0001,0.011169,1.3e-05,5e-05,0.000198,0.000858,2.1e-05,0.054683,0.0,0.004876,0.000262,-9.6e-05,8e-06,-1.4e-05,-4.9e-05,1.927471e-20,0.187885,0.006214,0.000119,0.000449,-0.000324,-0.000546,-0.000313,0.000794,0.002105,0.002015,0.000354,-0.000789,-0.000176,-6.483114e-08,-2.436456e-08,-5e-06,-1.26866e-07,1e-05,-2.66569e-08,-1.7e-05,-2.264775e-08,-2.431965e-08,-1.415465e-07,-6.819462e-07,-9.27765e-09,3e-06,0.000316,0.157601,0.189011,0.043935,0.089479,0.038276,0.165839,0.004083,33.49551,1.105848,0.175194,0.218954,0.148064,0.136833,0.11666,0.121877,0.126277,0.111334,0.123313,0.091418,0.078962,0.000708,4.7e-05,0.011905,0.000386,0.021346,0.000342,0.03343,4.9e-05,0.000202,0.000689,0.002735,5.3e-05,0.003464,0.000441,0.054531,0.225242,0.053744,0.106492,0.049917,0.220227,0.005332,39.108367,1.297519,0.21904,0.275184,0.21146,0.190363,0.158141,0.173801,0.170225,0.153951,0.172197,0.129385,0.112274,0.000973,6.4e-05,0.015491,0.000544,0.026763,0.000453,0.043424,6.6e-05,0.000281,0.000882,0.003606,6.3e-05,0.005382,6.1e-05,0.578902,0.003467,9.0,0.0,0.177978,0.545711,0.691344,0.992111,3.5e-05,33676.824714,0.320248,0.232568,0.482253,0.272925,0.298048,1.306129,1.63642
2,1,1,2023-04-29,salad,1,1,D1,-37.458382,89.903816,26.839069,24.092028,17.070927,14.16201,11.355535,9.97653,8.766505,7.958597,7.128137,6.506106,6.034042,25.633757,0.639767,0.597869,0.632565,0.597008,0.642655,0.607788,0.620355,0.579742,0.602118,0.610222,0.669156,0.610437,3.9e-05,0.485594,3.257664,0.030042,0.071572,0.007235,0.045636,0.000544,-66.781971,1.039756,0.133514,0.183744,0.081755,0.080067,0.039118,0.055555,0.029227,0.026589,0.019218,0.017594,0.014035,5.2e-05,1e-05,0.002379,6.6e-05,0.194783,4.1e-05,0.008916,6e-06,1.9e-05,7.6e-05,0.000524,1.9e-05,0.055169,0.0,-0.001164,-0.000271,0.000458,0.001356,2.8e-05,0.00073,0.0,0.602073,0.019452,0.00265,0.0035,0.002256,0.002349,0.002075,0.002939,0.002086,0.00106,-0.001293,-0.001791,-0.000446,1.215333e-07,2.683734e-08,2e-06,7.278122e-08,-1e-05,2.364913e-08,2e-05,9.905201e-09,2.043249e-08,1.482657e-07,7.498478e-07,4.169417e-08,-3e-06,0.000222,0.239421,0.176993,0.033177,0.076449,0.022933,0.085698,0.002253,33.506518,1.084038,0.162053,0.206292,0.104223,0.120591,0.098353,0.111142,0.077953,0.080654,0.085079,0.077661,0.063264,0.000163,3.7e-05,0.00827,0.000232,0.018183,0.000166,0.031825,2.2e-05,6.2e-05,0.000269,0.002172,7.2e-05,0.002514,0.000315,0.099146,0.20017,0.036419,0.093084,0.028341,0.098705,0.002924,44.126391,1.435268,0.215428,0.265359,0.138052,0.170294,0.137253,0.158087,0.105826,0.113183,0.118941,0.105937,0.091127,0.000218,4.6e-05,0.010269,0.000296,0.022469,0.000225,0.03959,3e-05,7.4e-05,0.000328,0.003055,9.7e-05,0.003817,0.0,0.93464,0.002844,9.0,0.0,0.074811,0.618556,0.794667,0.997756,1.3e-05,102873.344464,0.787085,0.25405,0.504034,1.091701,0.317148,-1.614749,1.616557
3,1,1,2023-04-29,salad,1,1,A2,-18.129278,89.619514,26.385571,23.596951,16.690722,13.939219,10.827085,9.400678,8.313966,7.570932,6.841765,5.967853,5.736463,27.599552,0.371661,0.501501,0.672059,0.423575,0.505776,0.491764,0.561433,0.483301,0.446979,0.427201,0.542246,0.371556,0.000178,0.15992,3.13482,0.058227,0.124244,0.018161,0.085443,0.001311,-51.057582,1.540157,0.185878,0.277555,0.123274,0.136119,0.082429,0.083458,0.030998,0.029258,0.023158,0.038771,0.031913,0.000192,2.7e-05,0.006764,0.00011,0.187908,0.000108,0.019785,1.5e-05,5.9e-05,0.00022,0.000859,3.6e-05,0.053729,0.0,-0.001931,-0.007569,0.00111,0.001593,0.000981,0.000676,0.0001,0.39673,0.012017,0.001494,0.002058,0.000698,0.000878,0.000641,0.000997,0.000823,0.000122,-0.000821,-0.00054,-0.000269,2.134975e-06,1.688325e-06,0.000694,4.026872e-06,-0.000555,1.552766e-05,0.000493,4.360922e-07,9.833871e-07,1.176395e-05,4.891033e-05,1.390498e-06,-0.000107,0.000577,0.158297,0.309144,0.048459,0.089707,0.034461,0.1212,0.003307,32.495419,1.049497,0.175624,0.198305,0.124001,0.137182,0.097633,0.094867,0.080669,0.093725,0.102551,0.08259,0.07373,0.000477,5.7e-05,0.01684,0.00026,0.024259,0.000252,0.040373,3.4e-05,0.00013,0.000509,0.001853,7.8e-05,0.003089,0.000827,0.079123,0.360949,0.053016,0.10501,0.044117,0.145586,0.004492,40.901903,1.316471,0.237636,0.261241,0.173874,0.192164,0.137107,0.128759,0.115928,0.134027,0.146684,0.117719,0.102797,0.000648,7.1e-05,0.022243,0.000359,0.031452,0.000319,0.052707,4.5e-05,0.000173,0.000697,0.002348,0.000104,0.003868,0.0,0.712616,0.004578,9.0,0.0,0.059411,0.346533,0.608044,0.988978,2.1e-05,67718.649555,0.612003,0.133273,0.365066,0.54585,0.745154,-0.625135,1.623486
4,1,1,2023-04-29,salad,1,1,G2,-5.649859,89.638298,26.364552,23.625319,16.918325,13.814619,10.872191,9.92057,8.517897,7.806834,6.841885,6.451043,5.772421,45.387932,0.550625,1.099175,1.086241,0.373823,0.3854,0.636668,0.442019,0.816318,0.322139,0.48185,0.520333,0.42789,8.9e-05,0.197431,3.24274,0.034842,0.093569,0.011152,0.056166,0.001067,-53.031806,1.519695,0.184345,0.260347,0.120823,0.129256,0.077194,0.066027,0.03849,0.054446,0.027223,0.027441,0.023046,0.000154,1.5e-05,0.002468,0.000102,0.193608,9e-05,0.010538,1.6e-05,4.1e-05,0.00017,0.000667,2.2e-05,0.05511,0.0,0.00066,0.000103,-0.000111,-0.000229,-7e-06,-1e-05,-7.709882e-20,-0.003799,0.000813,0.001709,-0.00047,0.000136,0.000299,-0.000375,0.000255,-0.000671,8.9e-05,-0.000414,0.000108,-0.000422,-2.06085e-08,-1.204822e-08,-2e-06,-5.465197e-08,4e-06,-9.979501e-09,-6e-06,-8.239709e-09,-1.243283e-08,-9.359624e-08,-9.218943e-08,-2.952753e-09,1e-06,0.000336,0.14903,0.219442,0.032723,0.071448,0.042012,0.124334,0.004836,32.123254,1.075602,0.17494,0.217125,0.138608,0.135423,0.124756,0.115346,0.108065,0.11984,0.120998,0.108438,0.084919,0.000744,5e-05,0.008903,0.000479,0.020648,0.000404,0.031937,7.4e-05,0.000183,0.000773,0.002593,7.2e-05,0.002585,0.000492,0.056772,0.27042,0.04153,0.095504,0.058038,0.126017,0.006887,43.251157,1.441351,0.232245,0.293852,0.198902,0.181459,0.179917,0.159479,0.158322,0.174555,0.176218,0.160072,0.124853,0.001063,6.8e-05,0.012431,0.000676,0.027505,0.000569,0.04156,0.000104,0.000255,0.001096,0.003466,0.0001,0.003801,3.3e-05,0.820057,0.004067,9.0,0.0,0.0,0.338067,0.635756,0.9949,2.4e-05,51573.974516,-0.305329,0.579268,0.761097,1.273651,-0.153654,-0.858505,1.643827


In [12]:
# Persist the extracted features; the file name encodes window_size and hop_length
feat_norm_letters_dataset_path = rf"..\data\processed\feat_norm_letters_{window_size}_{hop_length}_dataset.pkl"
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [13]:
# Load the raw, letter-segmented signal dataset
signal_dataset= SignalDataset.load(raw_letters_signal_dataset_path)

In [14]:
%%time

# Extract features and store in dataset
# NOTE: slow step — the recorded run reports a wall time of about 3h 8min for this cell.
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 4h 52min 49s
Wall time: 3h 8min 6s


In [15]:
# Persist the extracted features; the file name encodes window_size and hop_length
feat_raw_letters_dataset_path = rf"..\data\processed\feat_raw_letters_{window_size}_{hop_length}_dataset.pkl"
feat_dataset.save(feat_raw_letters_dataset_path)