# Make Dataset

## Imports

In [5]:
import os

from PlantReactivityAnalysis.data.wav_data_reader import WavDataReader
from PlantReactivityAnalysis.data.signal_dataset import SignalDataset
from PlantReactivityAnalysis.features.wav_feature_extractor import WavFeatureExtractor
from PlantReactivityAnalysis.features.features_dataset import FeaturesDataset
import PlantReactivityAnalysis.data.preparation_eurythmy_data as ped

In [6]:
import pandas as pd
# Do not truncate columns when a DataFrame is displayed (the feature tables below are very wide)
pd.set_option('display.max_columns', None)

## Reader

In [7]:
# Relative path of the folder holding the raw WAV recordings.
# Built with os.path.join so it resolves on every OS; the previous backslash
# literal was only a valid path on Windows (where the resulting string is unchanged).
wav_folder = os.path.join("..", "data", "raw", "wav_files")
# Alternative input folder that was kept here commented out:
# wav_folder = os.path.join("..", "data", "interim", "testing")

In [4]:
# Point the reader at the WAV folder (sample_rate=10000)
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# Fetch the signals together with their file keys, in the reader's ordering
signals, ids = reader.get_ordered_signals_and_keys()

# Derive the measurement-label table from those file keys
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [5]:
# Preview the first rows of the measurement-label table
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [6]:
# Bundle the signals with their label table (passed as `features`) into one SignalDataset
signal_dataset = SignalDataset(signals=signals, features=meas_df)

In [7]:
# Sanity check: the recorded run shows 625 rows, matching the 625 WAV files read
signal_dataset.features.shape

(625, 6)

In [8]:
# Preview the label table now stored on the dataset
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [9]:
# Pickle location of the raw (unsegmented, unnormalized) signal dataset.
# os.path.join keeps the path valid on every OS; on Windows the string is the same as before.
raw_signal_dataset_path = os.path.join("..", "data", "raw", "raw_signal_dataset.pkl")

In [10]:
# Write the raw dataset to disk, then read it straight back so the
# following cells work on the persisted copy
signal_dataset.save(raw_signal_dataset_path)
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [11]:
# Standardize the signals using the "zscore" method.
# NOTE(review): the return value is not captured, so this presumably updates
# signal_dataset in place — confirm in SignalDataset.standardize_signals.
signal_dataset.standardize_signals("zscore")

In [12]:
# The label table should be unaffected by standardization (recorded run: still (625, 6))
signal_dataset.features.shape

(625, 6)

In [13]:
# Save the standardized dataset under data/interim.
# The path is built with os.path.join so it resolves on every OS
# (the backslash literal used previously was only a valid path on Windows).
norm_signal_dataset_path = os.path.join("..", "data", "interim", "norm_signal_dataset.pkl")
signal_dataset.save(norm_signal_dataset_path)

### Segmented by Letters (raw)

In [4]:
# Start again from the raw dataset saved on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [15]:
# Build the letter-segment dictionary for the measurement ids present in the raw dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [16]:
# Segment the dataset: signals are split using the letter dictionary, looked up via
# 'id_measurement'; the recorded output below shows a new 'eurythmy_letter' column
signal_dataset.segment_signals_by_dict('id_measurement', letter_dictionary, 'eurythmy_letter')

In [17]:
# Row count after letter segmentation, plus the added 'eurythmy_letter' column.
# NOTE(review): the recorded output is (8878, 7); the previous inline note said
# (8390, 7) — confirm which row count is expected.
signal_dataset.features.shape

(8878, 7)

In [18]:
# Preview: in the recorded run one measurement now spans several rows, one per letter label (A1, G1, D1, ...)
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [10]:
# Pickle location of the raw dataset segmented by letters.
# os.path.join keeps the path valid on every OS; on Windows the string is the same as before.
raw_letters_signal_dataset_path = os.path.join("..", "data", "raw", "raw_letters_signal_dataset.pkl")

In [20]:
# Save the letter-segmented raw dataset
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [21]:
# Start again from the standardized dataset saved on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [22]:
# Build the letter-segment dictionary for the measurement ids present in the
# loaded (normalized) dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [23]:
# Segment the normalized dataset with the letter dictionary, looked up via
# 'id_measurement'; the segment label is stored under 'eurythmy_letter'
signal_dataset.segment_signals_by_dict('id_measurement', letter_dictionary, 'eurythmy_letter')

In [11]:
# Pickle location of the normalized dataset segmented by letters.
# os.path.join keeps the path valid on every OS; on Windows the string is the same as before.
norm_letters_signal_dataset_path = os.path.join("..", "data", "interim", "norm_letters_signal_dataset.pkl")

In [25]:
# Save the letter-segmented normalized dataset
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [26]:
# Start again from the raw dataset saved on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [27]:
# Cut every signal into fixed-length pieces (segment_duration=1)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [28]:
# Attach the eurythmy letter label to each segment row.
# NOTE(review): the returned value is bound to `df` but not used in the cells shown;
# the recorded output below already shows 'eurythmy_letter' on signal_dataset.features,
# so add_meas_letters appears to modify the frame in place — confirm.
df = ped.add_meas_letters(signal_dataset.features)

In [29]:
# Row/column count after 1s segmentation and letter labelling (recorded run: (148682, 8))
signal_dataset.features.shape

(148682, 8)

In [30]:
# Inspect rows 15-19: in the recorded output 'eurythmy_letter' is empty up to
# initial_second 16.0 and reads 'A1' from 17.0 onwards
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [31]:
# Save the 1s-segmented raw dataset under data/raw.
# The path is built with os.path.join so it resolves on every OS
# (the backslash literal used previously was only a valid path on Windows).
raw_1s_signal_dataset_path = os.path.join("..", "data", "raw", "raw_1s_signal_dataset.pkl")
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [32]:
# Start again from the standardized dataset saved on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [33]:
# Cut every normalized signal into fixed-length pieces (segment_duration=1)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [34]:
# Attach the eurythmy letter label to each segment row.
# NOTE(review): the returned value is bound to `df` but not used in the cells shown;
# the dataset saved next therefore relies on signal_dataset.features having been
# updated in place — TODO confirm against add_meas_letters.
df = ped.add_meas_letters(signal_dataset.features)

In [35]:
# Save the 1s-segmented normalized dataset under data/interim.
# The path is built with os.path.join so it resolves on every OS
# (the backslash literal used previously was only a valid path on Windows).
norm_1s_signal_dataset_path = os.path.join("..", "data", "interim", "norm_1s_signal_dataset.pkl")
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [12]:
# Windowing parameters handed to the extractor; the same values are embedded in
# the file names of the saved feature datasets.
# NOTE(review): units are presumably seconds — confirm in WavFeatureExtractor.
window_size = 1
hop_length = 0.5

# Extractor with the lib_mfccs, pyau_mfccs, temporal and statistical groups all enabled
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [13]:
# Load the normalized, letter-segmented signal dataset
signal_dataset = SignalDataset.load(norm_letters_signal_dataset_path)

In [14]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 17min 19s
Wall time: 4h 33s


In [15]:
# Row/column count of the extracted feature table (recorded run: (8878, 187))
feat_dataset.features.shape

(8878, 187)

In [16]:
# Preview the feature table: label columns followed by the extracted feature columns
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,lib_mfcc_4_avg,lib_mfcc_5_avg,lib_mfcc_6_avg,lib_mfcc_7_avg,lib_mfcc_8_avg,lib_mfcc_9_avg,lib_mfcc_10_avg,lib_mfcc_11_avg,lib_mfcc_12_avg,lib_mfcc_13_avg,lib_mfcc_1_std,lib_mfcc_2_std,lib_mfcc_3_std,lib_mfcc_4_std,lib_mfcc_5_std,lib_mfcc_6_std,lib_mfcc_7_std,lib_mfcc_8_std,lib_mfcc_9_std,lib_mfcc_10_std,lib_mfcc_11_std,lib_mfcc_12_std,lib_mfcc_13_std,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,mfcc_13_mean,chroma_1_mean,chroma_2_mean,chroma_3_mean,chroma_4_mean,chroma_5_mean,chroma_6_mean,chroma_7_mean,chroma_8_mean,chroma_9_mean,chroma_10_mean,chroma_11_mean,chroma_12_mean,chroma_std_mean,delta zcr_mean,delta energy_mean,delta energy_entropy_mean,delta spectral_centroid_mean,delta spectral_spread_mean,delta spectral_entropy_mean,delta spectral_flux_mean,delta spectral_rolloff_mean,delta mfcc_1_mean,delta mfcc_2_mean,delta mfcc_3_mean,delta mfcc_4_mean,delta mfcc_5_mean,delta mfcc_6_mean,delta mfcc_7_mean,delta mfcc_8_mean,delta mfcc_9_mean,delta mfcc_10_mean,delta mfcc_11_mean,delta mfcc_12_mean,delta mfcc_13_mean,delta chroma_1_mean,delta chroma_2_mean,delta chroma_3_mean,delta chroma_4_mean,delta chroma_5_mean,delta chroma_6_mean,delta chroma_7_mean,delta chroma_8_mean,delta chroma_9_mean,delta chroma_10_mean,delta chroma_11_mean,delta chroma_12_mean,delta 
chroma_std_mean,zcr_std,energy_std,energy_entropy_std,spectral_centroid_std,spectral_spread_std,spectral_entropy_std,spectral_flux_std,spectral_rolloff_std,mfcc_1_std,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,mfcc_10_std,mfcc_11_std,mfcc_12_std,mfcc_13_std,chroma_1_std,chroma_2_std,chroma_3_std,chroma_4_std,chroma_5_std,chroma_6_std,chroma_7_std,chroma_8_std,chroma_9_std,chroma_10_std,chroma_11_std,chroma_12_std,chroma_std_std,delta zcr_std,delta energy_std,delta energy_entropy_std,delta spectral_centroid_std,delta spectral_spread_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_rolloff_std,delta mfcc_1_std,delta mfcc_2_std,delta mfcc_3_std,delta mfcc_4_std,delta mfcc_5_std,delta mfcc_6_std,delta mfcc_7_std,delta mfcc_8_std,delta mfcc_9_std,delta mfcc_10_std,delta mfcc_11_std,delta mfcc_12_std,delta mfcc_13_std,delta chroma_1_std,delta chroma_2_std,delta chroma_3_std,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,zero_crossing_rate,root_mean_square_energy,slope_sign_changes_ratio,duration_seconds,flatness_ratio_10000,flatness_ratio_5000,flatness_ratio_1000,flatness_ratio_500,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-112.034447,73.410385,25.487236,20.726713,14.367032,11.460776,9.627193,9.346633,9.139145,8.575263,7.333161,6.673249,6.549365,106.814957,22.046612,9.565619,6.329313,5.816319,5.128364,4.339742,2.688915,1.737823,1.247607,1.829911,2.272723,1.612619,3.9e-05,0.088304,3.261203,0.032223,0.081098,0.004826,0.018708,0.000211,-62.296443,1.235709,0.155466,0.2112,0.092647,0.102716,0.061738,0.059721,0.037312,0.03064,0.018006,0.026201,0.015822,4.4e-05,8e-06,0.00137,4.3e-05,0.196797,2.4e-05,0.00558,3e-06,1e-05,3.4e-05,0.000326,1.2e-05,0.054775,0.0,-7.2e-05,-0.001124,0.000538,0.001182,6.3e-05,0.0002189057,0.0,0.393135,0.013044,0.001649,0.002161,0.000401,-0.000133,-0.000671,0.000761,0.00147,0.000749,-0.00025,0.000223,5.4e-05,8.808522e-07,1.245783e-07,1.5e-05,1.390759e-07,-4.3e-05,2.6653e-07,8.3e-05,6.089822e-08,2.136081e-07,6.515224e-07,2.399732e-06,1.328375e-07,-1.396324e-05,0.000155,0.053899,0.09915,0.019595,0.044021,0.010882,0.027826,0.00089,17.71012,0.626208,0.098801,0.128861,0.079956,0.096823,0.064831,0.082795,0.058029,0.062318,0.064395,0.061512,0.060276,0.000123,2.2e-05,0.00395,0.000125,0.007404,7e-05,0.011644,6e-06,2.1e-05,6.6e-05,0.001041,3.5e-05,0.00069,0.000226,0.05179,0.124222,0.021412,0.048003,0.014509,0.033412,0.001294,20.364791,0.738132,0.124926,0.155772,0.101129,0.130712,0.09025,0.114492,0.08103,0.087288,0.094701,0.086756,0.087719,0.000167,3.1e-05,0.005555,0.000169,0.009925,9.6e-05,0.015216,8e-06,2.8e-05,8.5e-05,0.00148,4.7e-05,0.000866,1.7e-05,1.443897,0.018133,9.0,0.352622,0.352622,0.536244,0.616367,0.774433,1.7e-05,80464.14567,-0.069488,2.080011,1.442224,1.082166,-1.65149,1.428384,1.652581
1,1,1,2023-04-29,salad,1,1,G1,-170.879852,78.603798,25.99671,23.020082,17.198301,14.345879,11.870129,10.58734,9.401297,8.60995,7.763268,7.27469,7.08471,116.036736,26.898577,6.06699,4.872044,3.292523,2.2532,1.625616,1.251814,1.038708,1.118514,1.400293,1.549913,1.3512,6.1e-05,0.044348,3.25253,0.03074,0.064558,0.007691,0.056778,0.000467,-73.804381,0.840326,0.103055,0.143203,0.059229,0.075125,0.042734,0.043807,0.030341,0.041312,0.024302,0.0179,0.003372,7.2e-05,1.3e-05,0.003591,5.1e-05,0.19494,3.7e-05,0.007489,8e-06,1.9e-05,0.000105,0.000353,1.8e-05,0.054922,0.0,-0.003808,-0.000826,0.000482,0.001036,4.9e-05,0.00107481,0.0,0.187979,0.006684,0.000802,0.001168,0.000668,0.000815,0.000672,0.000716,0.000428,0.000174,3.8e-05,0.000196,0.000204,5.680567e-07,3.409592e-08,2.2e-05,1.67236e-07,-3e-05,2.507267e-07,4.7e-05,6.422247e-08,7.683687e-08,1.556722e-07,5.281775e-07,7.262784e-08,-9.759584e-06,0.000222,0.027613,0.137673,0.030222,0.066607,0.01693,0.081965,0.001472,24.863888,0.840115,0.1227,0.161991,0.096781,0.091749,0.075389,0.085462,0.070977,0.089754,0.085306,0.066612,0.06941,0.000189,2.9e-05,0.01009,0.000129,0.012504,0.000106,0.017602,1.9e-05,4.4e-05,0.00026,0.000861,4.8e-05,0.001542,0.000325,0.011718,0.187065,0.037112,0.083608,0.02334,0.112736,0.002114,31.386856,1.06662,0.159812,0.204403,0.129304,0.124058,0.110494,0.118117,0.102211,0.130722,0.123358,0.092624,0.099943,0.000273,4.1e-05,0.014192,0.00018,0.017603,0.000152,0.024818,2.7e-05,6.3e-05,0.000355,0.001203,6.9e-05,0.002257,6e-06,0.732309,0.002644,9.0,0.328122,0.328122,0.688267,0.812756,0.9828,2.1e-05,54194.021625,0.579218,0.200783,0.448089,0.144289,-2.380866,4.743071,1.640983
2,1,1,2023-04-29,salad,1,1,D1,-57.091522,89.337288,26.508764,23.764023,16.68268,13.718043,10.865041,9.510201,8.364995,7.58216,6.759281,6.306276,5.865403,60.71896,0.975504,0.911059,0.930427,0.826592,0.853468,0.918153,0.880269,1.006184,0.930436,0.748558,0.630205,0.764092,8.9e-05,0.199841,3.250993,0.045377,0.128382,0.007561,0.026341,0.000511,-35.565006,2.097352,0.266039,0.372387,0.174122,0.187017,0.090341,0.084718,0.049184,0.074682,0.044365,0.049061,0.041187,5.3e-05,1.2e-05,0.001889,6.2e-05,0.194729,5.1e-05,0.009594,7e-06,1.9e-05,5.5e-05,0.000522,1.4e-05,0.055066,0.0,0.000749,0.000142,-4.6e-05,-6.7e-05,-6e-06,-0.0001128802,0.0,0.007481,3e-06,7.7e-05,0.000391,0.000959,0.001465,0.001078,-0.000214,-0.001,-0.001106,-0.000771,0.000344,0.000403,-1.352994e-08,-1.526116e-08,-3e-06,-9.231683e-08,6e-06,-1.673815e-08,-1.1e-05,-6.088499e-09,-3.72732e-08,1.353775e-08,4.718429e-07,-1.064933e-08,1.856277e-06,0.000298,0.134757,0.135781,0.026961,0.046496,0.016714,0.036898,0.001637,14.466541,0.514074,0.158313,0.168099,0.125613,0.10857,0.129274,0.139768,0.129965,0.131533,0.123465,0.106441,0.099328,0.00013,3.5e-05,0.004742,0.000179,0.013179,0.000144,0.024359,1.9e-05,4.5e-05,0.00012,0.001393,3.4e-05,0.002813,0.000407,0.070984,0.154807,0.026999,0.054341,0.020091,0.048197,0.002127,21.665968,0.774335,0.236108,0.251097,0.185558,0.159615,0.186253,0.195774,0.184927,0.188494,0.186097,0.155339,0.14225,0.000161,4.7e-05,0.005858,0.000232,0.015779,0.00019,0.029291,2.5e-05,5.6e-05,0.00015,0.001731,4.2e-05,0.004106,3.3e-05,1.401224,0.010444,9.0,0.0,0.0,0.039,0.279922,0.871789,2e-05,59588.992615,-1.019622,0.923799,0.961145,1.515033,-0.104159,-0.900896,1.679972
3,1,1,2023-04-29,salad,1,1,A2,-54.112663,88.635345,25.925098,23.210552,16.179735,13.258812,10.393497,9.009682,8.006456,7.578205,6.890536,6.181704,5.76251,51.991676,2.152629,1.712131,1.62494,1.561132,1.546459,1.582774,1.385537,0.72617,0.960766,1.028287,0.793056,1.044516,8.4e-05,0.125743,3.234316,0.044631,0.123611,0.007564,0.025688,0.000478,-37.377053,2.115316,0.2636,0.373572,0.164805,0.171138,0.0933,0.096777,0.057405,0.058771,0.042997,0.050144,0.034386,7.7e-05,1.3e-05,0.001865,4.6e-05,0.194253,3.3e-05,0.011012,6e-06,2e-05,7e-05,0.000374,1.6e-05,0.054661,0.0,-0.000213,0.001048,-0.000499,-0.001331,-5.9e-05,5.5838e-06,0.0,-0.564227,-0.019478,-0.002359,-0.003431,-0.001445,-0.001665,-0.001109,-0.000952,-0.00064,-0.000619,-0.000209,-0.000306,-0.000566,-2.282429e-07,-7.451886e-08,-1.2e-05,-5.145155e-07,4.2e-05,-1.270376e-07,-8.5e-05,-2.679488e-08,-1.250775e-07,-3.217276e-07,-1.742538e-06,-1.257309e-07,1.331985e-05,0.000211,0.074523,0.117373,0.024525,0.044786,0.012692,0.03681,0.001291,15.108996,0.524597,0.119276,0.128743,0.094552,0.10091,0.083469,0.09824,0.092863,0.104456,0.088897,0.086755,0.079926,0.000177,2.6e-05,0.003996,9.2e-05,0.011103,7.1e-05,0.02123,1.2e-05,3.9e-05,0.00014,0.000773,3e-05,0.001936,0.000255,0.034025,0.141802,0.024051,0.047024,0.01586,0.046305,0.001885,20.131753,0.70729,0.159955,0.17176,0.135574,0.156839,0.11887,0.138968,0.133419,0.139178,0.133851,0.119447,0.111485,0.000245,3.7e-05,0.005623,0.000129,0.014914,9.6e-05,0.028512,1.5e-05,5.1e-05,0.000183,0.001044,3.9e-05,0.0025,6.7e-05,1.631849,0.012356,9.0,0.0,0.0,0.053656,0.217267,0.8483,1.5e-05,87704.209952,-0.912563,1.83016,1.352834,1.442888,-1.178505,0.668149,1.672634
4,1,1,2023-04-29,salad,1,1,G2,-140.213089,89.335587,28.212805,24.651039,17.705042,14.935773,12.120633,10.671359,9.451908,8.676296,7.84805,7.260213,6.750432,89.765442,4.779769,3.468286,0.826041,0.841072,1.036144,1.000561,0.853096,0.945601,0.997191,0.937441,0.899895,0.951077,2.8e-05,0.135619,3.299527,0.022182,0.058285,0.003291,0.019599,0.000233,-69.559411,0.981367,0.104998,0.168913,0.076558,0.085247,0.039533,0.042674,0.031397,0.025925,0.01963,0.023688,0.022486,4.4e-05,4e-06,0.00105,2e-05,0.197666,1.2e-05,0.00409,3e-06,9e-06,2.6e-05,0.000189,4e-06,0.055324,0.0,-0.001916,7.1e-05,-0.000176,-0.000689,-7e-06,-6.884562e-07,0.0,-0.380254,-0.012674,-0.00181,-0.002465,-0.00099,-0.000528,0.000197,-0.000207,-0.000662,-0.000256,0.000175,-0.000286,-0.000345,-5.427917e-08,-4.536108e-09,-2e-06,-1.631209e-08,3e-06,-2.955832e-08,-4e-06,-6.603539e-09,-1.294648e-08,-6.699816e-08,-3.129957e-07,-1.187722e-08,8.928822e-07,0.000121,0.071046,0.04664,0.021223,0.054367,0.009365,0.031474,0.000873,24.009154,0.808268,0.132273,0.158725,0.077649,0.087787,0.075609,0.059313,0.053535,0.05543,0.052982,0.051416,0.04791,0.000161,1.1e-05,0.00339,6e-05,0.007996,3.8e-05,0.014355,1.1e-05,3e-05,7.4e-05,0.000655,1.2e-05,0.001801,0.000176,0.02803,0.048219,0.02173,0.065004,0.011498,0.034853,0.001145,32.133532,1.093845,0.190261,0.218257,0.105405,0.117502,0.106197,0.083109,0.077216,0.079361,0.07351,0.075481,0.069756,0.000223,1.3e-05,0.004282,7.4e-05,0.010778,5.2e-05,0.019752,1.4e-05,3.8e-05,9.1e-05,0.000874,1.5e-05,0.002508,1.7e-05,0.77145,0.003511,9.0,0.111556,0.334178,0.604156,0.740889,0.9786,2.2e-05,52998.819272,0.371457,0.457155,0.676132,0.577155,-1.348706,0.188683,1.734061


In [17]:
# Save the extracted features under data/processed; the file name records the
# window_size and hop_length used for extraction.
# The path is built with os.path.join so it resolves on every OS
# (the backslash literal used previously was only a valid path on Windows).
feat_norm_letters_dataset_path = os.path.join(
    "..", "data", "processed",
    f"feat_norm_letters_{window_size}_{hop_length}_dataset.pkl",
)
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [18]:
# Load the raw, letter-segmented signal dataset
signal_dataset = SignalDataset.load(raw_letters_signal_dataset_path)

In [19]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 28min 56s
Wall time: 9h 58min 46s


In [20]:
# Save the extracted features under data/processed; the file name records the
# window_size and hop_length used for extraction.
# The path is built with os.path.join so it resolves on every OS
# (the backslash literal used previously was only a valid path on Windows).
feat_raw_letters_dataset_path = os.path.join(
    "..", "data", "processed",
    f"feat_raw_letters_{window_size}_{hop_length}_dataset.pkl",
)
feat_dataset.save(feat_raw_letters_dataset_path)