# Make Dataset

## Imports

In [1]:
import os

from PlantReactivityAnalysis.data.wav_data_reader import WavDataReader
from PlantReactivityAnalysis.data.signal_dataset import SignalDataset
from PlantReactivityAnalysis.features.wav_feature_extractor import WavFeatureExtractor
from PlantReactivityAnalysis.features.features_dataset import FeaturesDataset
import PlantReactivityAnalysis.data.preparation_eurythmy_data as ped

In [2]:
import pandas as pd

# Show every column when a DataFrame is displayed (no truncation of wide tables)
pd.options.display.max_columns = None

## Reader

In [3]:
# Folder holding the WAV files handed to the reader.
# Built with os.path.join so the notebook is not tied to Windows path
# separators (on Windows the resulting string is the same as before).
wav_folder = os.path.join("..", "data", "raw", "wav_files")
# Alternative folder (apparently a smaller set for testing):
# wav_folder = os.path.join("..", "data", "interim", "testing")

In [4]:
# Create the reader over the WAV folder (sample_rate=10000 is passed through as-is)
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# Ask the reader for the signals together with their matching keys
signals, ids = reader.get_ordered_signals_and_keys()

# Derive the measurement labels from the file keys
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [5]:
# Preview the first rows of the measurement labels
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [6]:
# Bundle the signals with their measurement labels into a single SignalDataset
signal_dataset = SignalDataset(signals=signals, features=meas_df)

In [7]:
# Size of the features table as (rows, columns)
signal_dataset.features.shape

(625, 6)

In [8]:
# Preview the features table held by the dataset
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [9]:
# Path of the .pkl file used to save and reload the raw dataset.
# os.path.join keeps the path portable (same string as before on Windows).
raw_signal_dataset_path = os.path.join("..", "data", "raw", "raw_signal_dataset.pkl")

In [10]:
# Write the raw dataset to disk, then read it straight back from the same file
signal_dataset.save(raw_signal_dataset_path)
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [11]:
# Standardize the signals with the "zscore" method.
# NOTE(review): the return value is not used, so this presumably modifies
# signal_dataset in place — confirm against SignalDataset.standardize_signals.
signal_dataset.standardize_signals("zscore")

In [12]:
# Size of the features table after standardization
# (the recorded output shows the same shape as before standardizing)
signal_dataset.features.shape

(625, 6)

In [13]:
# Save the standardized dataset under data/interim.
# os.path.join keeps the path portable (same string as before on Windows).
norm_signal_dataset_path = os.path.join("..", "data", "interim", "norm_signal_dataset.pkl")
signal_dataset.save(norm_signal_dataset_path)

### Segmented by Letters (raw)

In [14]:
# Start again from the raw dataset stored at raw_signal_dataset_path
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [15]:
# Look up the letter segments for the measurement ids present in the raw dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features["id_measurement"].tolist()
)

In [16]:
# Split the signals according to the letter dictionary, matching on id_measurement;
# the segment label is stored under "eurythmy_letter"
signal_dataset.segment_signals_by_dict(
    "id_measurement", letter_dictionary, "eurythmy_letter"
)

In [17]:
# Size of the features table after letter segmentation.
# NOTE(review): the recorded output is (8878, 7); the earlier note of "8390, 7"
# no longer matches — confirm which row count is expected.
signal_dataset.features.shape

(8878, 7)

In [18]:
# Preview the segmented features table (it now carries the eurythmy_letter column)
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [4]:
# Path of the .pkl file for the letter-segmented raw dataset.
# os.path.join keeps the path portable (same string as before on Windows).
raw_letters_signal_dataset_path = os.path.join("..", "data", "raw", "raw_letters_signal_dataset.pkl")

In [20]:
# Save the letter-segmented raw dataset to raw_letters_signal_dataset_path
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [21]:
# Start again from the normalized dataset stored at norm_signal_dataset_path
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [22]:
# Look up the letter segments for the measurement ids present in the normalized dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features["id_measurement"].tolist()
)

In [23]:
# Split the normalized signals according to the letter dictionary, matching on
# id_measurement; the segment label is stored under "eurythmy_letter"
signal_dataset.segment_signals_by_dict(
    "id_measurement", letter_dictionary, "eurythmy_letter"
)

In [3]:
# Path of the .pkl file for the letter-segmented normalized dataset.
# os.path.join keeps the path portable (same string as before on Windows).
norm_letters_signal_dataset_path = os.path.join("..", "data", "interim", "norm_letters_signal_dataset.pkl")

In [25]:
# Save the letter-segmented normalized dataset to norm_letters_signal_dataset_path
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [26]:
# Start again from the raw dataset stored at raw_signal_dataset_path
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [27]:
# Cut the signals into fixed-length segments of duration 1 (one-second segments)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [28]:
# Add the eurythmy letter information to the features table.
# NOTE(review): `df` is not used again in this notebook, while the recorded
# output below shows an eurythmy_letter column on signal_dataset.features —
# this suggests add_meas_letters modifies its argument in place; confirm, and
# drop the `df` binding or assign the result back as appropriate.
df= ped.add_meas_letters(signal_dataset.features)

In [29]:
# Size of the features table after the 1-second segmentation
signal_dataset.features.shape

(148682, 8)

In [30]:
# Inspect rows 15-19: in the recorded output eurythmy_letter is empty up to
# row 16 and reads "A1" from row 17 onwards
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [31]:
# Save the 1-second-segmented raw dataset.
# os.path.join keeps the path portable (same string as before on Windows).
raw_1s_signal_dataset_path = os.path.join("..", "data", "raw", "raw_1s_signal_dataset.pkl")
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [32]:
# Start again from the normalized dataset stored at norm_signal_dataset_path
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [33]:
# Cut the normalized signals into fixed-length segments of duration 1
# (one-second segments, per the section title)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [34]:
# Add the eurythmy letter information to the features table.
# NOTE(review): `df` is not used again in this notebook; the dataset saved in
# the next cell only contains the letters if add_meas_letters modifies its
# argument in place — confirm, and drop the `df` binding or assign the result
# back as appropriate.
df= ped.add_meas_letters(signal_dataset.features)

In [35]:
# Save the 1-second-segmented normalized dataset.
# os.path.join keeps the path portable (same string as before on Windows).
norm_1s_signal_dataset_path = os.path.join("..", "data", "interim", "norm_1s_signal_dataset.pkl")
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [5]:
# Windowing parameters; they are also written into the output file names below
window_size = 2
hop_length = 1

# Extractor with the lib_mfccs, pyau_mfccs, temporal and statistical flags all switched on
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [6]:
# Load the letter-segmented normalized signals
signal_dataset = SignalDataset.load(norm_letters_signal_dataset_path)

In [7]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 11min 53s
Wall time: 3h 54min 24s


In [8]:
# Size of the extracted features table as (rows, columns)
feat_dataset.features.shape

(8878, 187)

In [9]:
# Preview the extracted features (label columns followed by the feature columns)
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,lib_mfcc_4_avg,lib_mfcc_5_avg,lib_mfcc_6_avg,lib_mfcc_7_avg,lib_mfcc_8_avg,lib_mfcc_9_avg,lib_mfcc_10_avg,lib_mfcc_11_avg,lib_mfcc_12_avg,lib_mfcc_13_avg,lib_mfcc_1_std,lib_mfcc_2_std,lib_mfcc_3_std,lib_mfcc_4_std,lib_mfcc_5_std,lib_mfcc_6_std,lib_mfcc_7_std,lib_mfcc_8_std,lib_mfcc_9_std,lib_mfcc_10_std,lib_mfcc_11_std,lib_mfcc_12_std,lib_mfcc_13_std,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,mfcc_13_mean,chroma_1_mean,chroma_2_mean,chroma_3_mean,chroma_4_mean,chroma_5_mean,chroma_6_mean,chroma_7_mean,chroma_8_mean,chroma_9_mean,chroma_10_mean,chroma_11_mean,chroma_12_mean,chroma_std_mean,delta zcr_mean,delta energy_mean,delta energy_entropy_mean,delta spectral_centroid_mean,delta spectral_spread_mean,delta spectral_entropy_mean,delta spectral_flux_mean,delta spectral_rolloff_mean,delta mfcc_1_mean,delta mfcc_2_mean,delta mfcc_3_mean,delta mfcc_4_mean,delta mfcc_5_mean,delta mfcc_6_mean,delta mfcc_7_mean,delta mfcc_8_mean,delta mfcc_9_mean,delta mfcc_10_mean,delta mfcc_11_mean,delta mfcc_12_mean,delta mfcc_13_mean,delta chroma_1_mean,delta chroma_2_mean,delta chroma_3_mean,delta chroma_4_mean,delta chroma_5_mean,delta chroma_6_mean,delta chroma_7_mean,delta chroma_8_mean,delta chroma_9_mean,delta chroma_10_mean,delta chroma_11_mean,delta chroma_12_mean,delta 
chroma_std_mean,zcr_std,energy_std,energy_entropy_std,spectral_centroid_std,spectral_spread_std,spectral_entropy_std,spectral_flux_std,spectral_rolloff_std,mfcc_1_std,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,mfcc_10_std,mfcc_11_std,mfcc_12_std,mfcc_13_std,chroma_1_std,chroma_2_std,chroma_3_std,chroma_4_std,chroma_5_std,chroma_6_std,chroma_7_std,chroma_8_std,chroma_9_std,chroma_10_std,chroma_11_std,chroma_12_std,chroma_std_std,delta zcr_std,delta energy_std,delta energy_entropy_std,delta spectral_centroid_std,delta spectral_spread_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_rolloff_std,delta mfcc_1_std,delta mfcc_2_std,delta mfcc_3_std,delta mfcc_4_std,delta mfcc_5_std,delta mfcc_6_std,delta mfcc_7_std,delta mfcc_8_std,delta mfcc_9_std,delta mfcc_10_std,delta mfcc_11_std,delta mfcc_12_std,delta mfcc_13_std,delta chroma_1_std,delta chroma_2_std,delta chroma_3_std,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,zero_crossing_rate,root_mean_square_energy,slope_sign_changes_ratio,duration_seconds,flatness_ratio_10000,flatness_ratio_5000,flatness_ratio_1000,flatness_ratio_500,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-23.534912,83.198769,25.629099,19.752411,14.39227,11.136339,9.130905,9.132786,9.201151,8.549523,7.044505,6.178267,5.965608,94.381187,5.893996,7.979805,4.722373,5.581942,4.857383,3.813931,2.075639,1.897661,1.554459,1.101802,1.844,1.201164,3.9e-05,0.088293,3.261156,0.032339,0.081496,0.004831,0.018852,0.000211,-62.10925,1.242231,0.156427,0.212559,0.093449,0.103613,0.062251,0.059889,0.036835,0.030198,0.018111,0.026666,0.01578,4.4e-05,8e-06,0.001371,4.3e-05,0.196795,2.4e-05,0.005584,3e-06,1e-05,3.4e-05,0.000326,1.2e-05,0.054775,0.0,-8.3e-05,-0.001171,0.000653,0.001579,6.8e-05,0.000363,0.0,0.580328,0.019567,0.00261,0.00352,0.001203,0.000764,-0.000157,0.000929,0.000994,0.000307,-0.000145,0.000688,1.3e-05,9.071631e-07,1.27499e-07,1.5e-05,1.650146e-07,-4.5e-05,2.736884e-07,8.6e-05,6.399836e-08,2.250915e-07,6.739145e-07,2.442303e-06,1.371131e-07,-1.453088e-05,0.000222,0.077577,0.13634,0.026048,0.059164,0.015137,0.034235,0.001276,24.29723,0.838808,0.126133,0.165385,0.092794,0.109901,0.072774,0.09384,0.065705,0.072746,0.071245,0.068011,0.067845,0.000174,3.1e-05,0.005554,0.000178,0.010334,9.7e-05,0.01618,8e-06,3e-05,9e-05,0.001466,4.9e-05,0.000916,0.000319,0.057773,0.161315,0.023138,0.050035,0.019565,0.03968,0.00183,22.312091,0.801896,0.134204,0.16563,0.106375,0.139996,0.09683,0.12103,0.087582,0.099202,0.101545,0.092924,0.097668,0.000231,4.3e-05,0.007664,0.000234,0.013436,0.000132,0.020331,1.1e-05,3.7e-05,0.00011,0.002048,6.5e-05,0.001063,1.7e-05,1.443897,0.018133,9.0,0.352622,0.352622,0.536244,0.616367,0.774433,1.7e-05,80464.14567,-0.069488,2.080011,1.442224,1.082166,-1.65149,1.428384,1.652581
1,1,1,2023-04-29,salad,1,1,G1,-81.110771,88.325844,28.627588,24.234732,17.512836,14.781477,11.906365,10.43669,9.275619,8.338777,7.319038,6.78876,6.659635,107.915901,5.592562,4.527091,0.87311,0.960102,1.345174,1.155145,0.981901,1.173628,1.252299,1.431759,1.505018,1.125069,6.7e-05,0.036979,3.247725,0.031531,0.065249,0.008275,0.060365,0.000511,-74.022181,0.832606,0.1017,0.14279,0.060989,0.07681,0.042051,0.041954,0.031592,0.042674,0.024715,0.020495,0.005546,7.6e-05,1.4e-05,0.003797,5.4e-05,0.19443,3.9e-05,0.008414,8e-06,2e-05,0.000117,0.000399,2e-05,0.054907,0.0,-0.002814,-0.000501,0.000308,0.000752,2.6e-05,0.001228,0.0,0.180828,0.006731,0.000876,0.001314,0.000844,0.000926,0.000632,0.000717,0.000514,0.00024,0.000155,0.000245,0.000198,4.550032e-07,-8.178217e-09,1.7e-05,8.616054e-08,-1.7e-05,6.400987e-08,2.5e-05,5.656602e-08,2.393033e-08,-1.954098e-07,-2.146093e-07,2.483693e-08,-5.714218e-06,0.000278,0.037703,0.163475,0.035331,0.077949,0.02032,0.096558,0.001798,29.464559,0.993798,0.140181,0.190954,0.108714,0.101863,0.088546,0.097048,0.07729,0.09924,0.094032,0.080031,0.077644,0.000245,3.6e-05,0.012263,0.000156,0.014731,0.00014,0.02065,2.3e-05,5.4e-05,0.000308,0.001101,5.9e-05,0.001867,0.000406,0.011726,0.2191,0.040969,0.090714,0.027827,0.128605,0.002586,33.777046,1.148502,0.170788,0.221685,0.13644,0.128271,0.127941,0.129519,0.10888,0.140506,0.132047,0.107975,0.10899,0.000349,4.8e-05,0.017193,0.000213,0.020766,0.000201,0.029278,3.2e-05,7.7e-05,0.000422,0.001522,8.4e-05,0.002694,6e-06,0.732309,0.002644,9.0,0.328122,0.328122,0.688267,0.812756,0.9828,2.1e-05,54194.021625,0.579218,0.200783,0.448089,0.144289,-2.380866,4.743071,1.640983
2,1,1,2023-04-29,salad,1,1,D1,24.882153,89.385994,26.346279,23.596043,16.54591,13.589697,10.682384,9.265663,8.128407,7.406046,6.642943,6.148542,5.691988,59.993217,0.354342,0.350243,0.370835,0.302728,0.327575,0.431741,0.538075,0.620608,0.43625,0.336217,0.356718,0.557945,0.0001,0.202628,3.248218,0.04563,0.128851,0.008015,0.026937,0.000578,-35.315901,2.10327,0.265058,0.371516,0.17443,0.190836,0.091169,0.087125,0.052937,0.074836,0.043423,0.050004,0.044027,5.7e-05,1.3e-05,0.002173,6.8e-05,0.194083,5.6e-05,0.01079,7e-06,2e-05,5.6e-05,0.000567,1.5e-05,0.055196,0.0,0.001532,0.000494,-0.000354,-0.000408,-8.8e-05,-0.000211,0.0,0.011734,-0.000325,-0.000596,0.0003,0.000138,0.000972,0.002798,0.002181,-0.001121,-0.002047,-0.001878,0.001422,0.000613,-8.747123e-07,-7.184711e-08,-6e-06,-6.5718e-07,2.2e-05,-4.95371e-07,-3.6e-05,-9.587337e-08,-3.009933e-07,-7.334357e-07,-1.241162e-06,-1.663212e-07,6.846981e-06,0.000401,0.177835,0.176057,0.030787,0.049501,0.022487,0.044192,0.002322,15.14479,0.536593,0.159475,0.168653,0.126814,0.107653,0.130092,0.138037,0.135007,0.133202,0.125984,0.110877,0.105254,0.00018,4.9e-05,0.006867,0.000254,0.018759,0.000205,0.034613,2.6e-05,5.9e-05,0.000155,0.001983,5e-05,0.003913,0.000528,0.079925,0.190558,0.027738,0.054177,0.025826,0.055053,0.002919,22.322038,0.79294,0.235095,0.251321,0.186034,0.155704,0.187763,0.193832,0.188132,0.191871,0.188995,0.162429,0.149996,0.000219,6.4e-05,0.008311,0.000323,0.021651,0.000266,0.040057,3.4e-05,7.2e-05,0.00019,0.002374,6e-05,0.005729,3.3e-05,1.401224,0.010444,9.0,0.0,0.0,0.039,0.279922,0.871789,2e-05,59588.992615,-1.019622,0.923799,0.961145,1.515033,-0.104159,-0.900896,1.679972
3,1,1,2023-04-29,salad,1,1,A2,24.231644,88.621597,25.669394,22.936642,15.933237,13.037097,10.171865,8.814935,7.88807,7.525625,6.835879,6.08257,5.667099,49.872616,2.093105,1.765709,1.699441,1.610318,1.539667,1.553005,1.327099,0.530647,0.567084,0.853385,0.550213,0.55398,6.7e-05,0.130958,3.246399,0.042626,0.120713,0.006644,0.02305,0.000389,-37.58827,2.108827,0.263524,0.373077,0.164793,0.171458,0.093967,0.09722,0.054322,0.058412,0.042166,0.051357,0.036084,6.4e-05,1.1e-05,0.001602,4e-05,0.195005,3e-05,0.009612,6e-06,1.7e-05,6e-05,0.000309,1.4e-05,0.054725,0.0,-0.001233,0.000568,-0.000345,-0.001109,-2.9e-05,4.5e-05,0.0,-0.562003,-0.019236,-0.003142,-0.005041,-0.002442,-0.001331,-0.000573,-0.000474,0.000158,-0.001223,3.7e-05,-0.000513,-0.000303,-1.050935e-07,-4.277174e-08,-5e-06,-3.054473e-07,2.2e-05,-8.3938e-08,-4.7e-05,-1.860274e-08,-4.955292e-08,-2.239235e-07,-1.072793e-06,-4.986739e-08,7.047755e-06,0.000226,0.11094,0.133441,0.029652,0.052113,0.014303,0.041577,0.001422,17.326778,0.597061,0.123058,0.134881,0.098827,0.106716,0.090826,0.10503,0.101068,0.116089,0.101743,0.094099,0.084965,0.000187,2.8e-05,0.004392,0.00011,0.01252,8.4e-05,0.024307,1.5e-05,4.5e-05,0.000153,0.000809,3.4e-05,0.002307,0.000277,0.041242,0.144102,0.024714,0.048307,0.017029,0.047357,0.002055,22.534897,0.786018,0.164293,0.177738,0.141664,0.165959,0.126755,0.147267,0.146208,0.152,0.151599,0.127924,0.116298,0.000247,3.8e-05,0.005974,0.000148,0.01607,0.00011,0.031242,1.8e-05,5.8e-05,0.000187,0.001058,4.2e-05,0.002879,6.7e-05,1.631849,0.012356,9.0,0.0,0.0,0.053656,0.217267,0.8483,1.5e-05,87704.209952,-0.912563,1.83016,1.352834,1.442888,-1.178505,0.668149,1.672634
4,1,1,2023-04-29,salad,1,1,G2,-55.030022,90.277245,27.136541,24.391529,17.351828,14.491565,11.674594,10.279722,9.026291,8.241884,7.445746,6.870279,6.332922,85.320297,0.771419,0.725948,0.723136,0.72742,0.716087,0.715486,0.723956,0.737132,0.731813,0.724335,0.721453,0.728042,2.8e-05,0.13586,3.299588,0.021953,0.057486,0.003281,0.019302,0.000233,-69.93337,0.968435,0.105019,0.168915,0.075515,0.08357,0.041228,0.040927,0.031488,0.026295,0.017951,0.023164,0.021055,4.4e-05,4e-06,0.001049,2e-05,0.197668,1.2e-05,0.004087,3e-06,9e-06,2.6e-05,0.000189,4e-06,0.055324,0.0,-0.001958,5.8e-05,-0.000156,-0.000655,-5e-06,-0.000124,0.0,-0.38034,-0.012666,-0.00163,-0.002312,-0.000983,-0.000489,0.000185,-0.000142,-0.000612,-0.000225,0.000228,-0.000228,-0.000296,-1.173815e-08,-7.263613e-09,-1e-06,-3.558277e-08,2e-06,-7.83072e-09,-3e-06,-4.158123e-09,-8.823139e-09,-5.195987e-08,-8.967604e-08,-2.177257e-09,6.608713e-07,0.000153,0.093842,0.056375,0.024274,0.059307,0.011812,0.037042,0.00112,26.467828,0.886959,0.141665,0.170428,0.083581,0.093652,0.0771,0.061025,0.059212,0.058035,0.055242,0.051118,0.046122,0.000216,1.4e-05,0.00428,7.5e-05,0.010616,4.7e-05,0.019414,1.3e-05,3.9e-05,9.2e-05,0.000895,1.5e-05,0.002376,0.00022,0.029749,0.056299,0.022645,0.067458,0.014334,0.038483,0.001456,33.716587,1.143247,0.199484,0.222639,0.10869,0.119317,0.104247,0.082896,0.083608,0.081615,0.075996,0.072731,0.065632,0.000296,1.6e-05,0.005429,9.1e-05,0.01431,6.2e-05,0.026611,1.7e-05,5e-05,0.000112,0.001188,1.9e-05,0.003305,1.7e-05,0.77145,0.003511,9.0,0.111556,0.334178,0.604156,0.740889,0.9786,2.2e-05,52998.819272,0.371457,0.457155,0.676132,0.577155,-1.348706,0.188683,1.734061


In [10]:
# Save the extracted features; the file name records the window size and hop length used.
# os.path.join keeps the path portable (same string as before on Windows).
feat_norm_letters_dataset_path = os.path.join(
    "..", "data", "processed",
    f"feat_norm_letters_{window_size}_{hop_length}_dataset.pkl",
)
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [11]:
# Load the letter-segmented raw signals
signal_dataset = SignalDataset.load(raw_letters_signal_dataset_path)

In [12]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 23min 59s
Wall time: 9h 54min 16s


In [13]:
# Save the extracted features; the file name records the window size and hop length used.
# os.path.join keeps the path portable (same string as before on Windows).
feat_raw_letters_dataset_path = os.path.join(
    "..", "data", "processed",
    f"feat_raw_letters_{window_size}_{hop_length}_dataset.pkl",
)
feat_dataset.save(feat_raw_letters_dataset_path)