# Make Dataset

## Imports

In [1]:
import os

from PlantReactivityAnalysis.data.wav_data_reader import WavDataReader
from PlantReactivityAnalysis.data.signal_dataset import SignalDataset
from PlantReactivityAnalysis.features.wav_feature_extractor import WavFeatureExtractor
from PlantReactivityAnalysis.features.features_dataset import FeaturesDataset
import PlantReactivityAnalysis.data.preparation_eurythmy_data as ped

In [2]:
import pandas as pd

# Lift the column display limit so wide DataFrames are shown without truncation
pd.set_option(
    'display.max_columns',
    None,
)

## Reader

In [3]:
# Folder holding the raw WAV recordings, relative to the notebook's working directory.
# Built with os.path.join so the separator matches the host OS: a literal
# backslash path only resolves on Windows.
wav_folder = os.path.join("..", "data", "raw", "wav_files")
# Alternative input folder kept from the original notebook
# (presumably a smaller set used for testing -- confirm before switching):
# wav_folder = os.path.join("..", "data", "interim", "testing")

In [4]:
# Create the reader over the WAV folder, configured with sample_rate=10000
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# Ask the reader for the ordered signals together with their keys
signals, ids = reader.get_ordered_signals_and_keys()

# Look up the measurement labels that belong to those file keys
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [5]:
# Preview the first rows of the measurement labels
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [6]:
# Bundle the signals with their measurement labels in a SignalDataset
signal_dataset = SignalDataset(
    signals=signals,
    features=meas_df,
)

In [7]:
# Dimensions of the label table stored in the dataset
signal_dataset.features.shape

(625, 6)

In [8]:
# Preview the first rows of the label table stored in the dataset
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [9]:
# File path of the raw signal dataset (.pkl); later cells save to and load from it.
# os.path.join keeps the path valid on non-Windows systems as well.
raw_signal_dataset_path = os.path.join("..", "data", "raw", "raw_signal_dataset.pkl")

In [10]:
# Write the raw dataset to disk ...
signal_dataset.save(raw_signal_dataset_path)
# ... and read it straight back from the same file
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [11]:
# Standardize the signals with the "zscore" method.
# The return value is not used; the same dataset object is saved in a later cell.
signal_dataset.standardize_signals("zscore")

In [12]:
# Dimensions of the label table after standardizing the signals
signal_dataset.features.shape

(625, 6)

In [13]:
# Save the standardized dataset.
# os.path.join keeps the path valid on non-Windows systems as well.
norm_signal_dataset_path = os.path.join("..", "data", "interim", "norm_signal_dataset.pkl")
signal_dataset.save(norm_signal_dataset_path)

### Segmented by Letters (raw)

In [14]:
# Start again from the raw dataset stored on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [15]:
# Build the letter dictionary for the measurement ids present in the raw dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [16]:
# Segment the signals using the letter dictionary; afterwards the features
# carry an 'eurythmy_letter' column (see the preview further down)
signal_dataset.segment_signals_by_dict(
    'id_measurement',
    letter_dictionary,
    'eurythmy_letter',
)

In [17]:
signal_dataset.features.shape  # recorded run: (8878, 7); the earlier note of 8390 rows was out of date

(8878, 7)

In [18]:
# Preview the first rows of the label table after segmenting by letters
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [19]:
# File path of the raw, letter-segmented signal dataset.
# os.path.join keeps the path valid on non-Windows systems as well.
raw_letters_signal_dataset_path = os.path.join("..", "data", "raw", "raw_letters_signal_dataset.pkl")

In [20]:
# Save the letter-segmented raw dataset
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [21]:
# Start from the standardized dataset stored on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [22]:
# Get letter segments for the measurement ids in the loaded (normalized) dataset
letter_dictionary= ped.return_letter_dictionary(indexes= signal_dataset.features['id_measurement'].tolist())

In [23]:
# Segment the normalized signals using the letter dictionary
signal_dataset.segment_signals_by_dict(
    'id_measurement',
    letter_dictionary,
    'eurythmy_letter',
)

In [24]:
# File path of the normalized, letter-segmented signal dataset.
# os.path.join keeps the path valid on non-Windows systems as well.
norm_letters_signal_dataset_path = os.path.join("..", "data", "interim", "norm_letters_signal_dataset.pkl")

In [25]:
# Save the letter-segmented normalized dataset
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [26]:
# Start again from the raw dataset stored on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [27]:
# Cut the signals into segments of duration 1 (1-second segments)
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [28]:
# Add eurythmy letter data to the features.
# The object saved afterwards is `signal_dataset`, not the returned frame, so this
# cell relies on add_meas_letters updating signal_dataset.features in place.
# Check that explicitly so a missing column cannot slip into the saved dataset.
df = ped.add_meas_letters(signal_dataset.features)
assert 'eurythmy_letter' in signal_dataset.features.columns, (
    "add_meas_letters did not add 'eurythmy_letter' to signal_dataset.features"
)

In [29]:
# Dimensions of the label table after 1-second segmentation and adding the letters
signal_dataset.features.shape

(148682, 8)

In [30]:
# Inspect rows 15-19: in the recorded output, rows without a letter are followed by rows labelled A1
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [31]:
# Save the 1-second segmented raw dataset.
# os.path.join keeps the path valid on non-Windows systems as well.
raw_1s_signal_dataset_path = os.path.join("..", "data", "raw", "raw_1s_signal_dataset.pkl")
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [32]:
# Start from the standardized dataset stored on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [33]:
# Cut the normalized signals into segments of duration 1 (1-second segments)
signal_dataset.segment_signals_by_duration(
    segment_duration=1,
)

In [34]:
# Add eurythmy letter data to the features.
# The object saved afterwards is `signal_dataset`, not the returned frame, so this
# cell relies on add_meas_letters updating signal_dataset.features in place.
# Check that explicitly so a missing column cannot slip into the saved dataset.
df = ped.add_meas_letters(signal_dataset.features)
assert 'eurythmy_letter' in signal_dataset.features.columns, (
    "add_meas_letters did not add 'eurythmy_letter' to signal_dataset.features"
)

In [35]:
# Save the 1-second segmented normalized dataset.
# os.path.join keeps the path valid on non-Windows systems as well.
norm_1s_signal_dataset_path = os.path.join("..", "data", "interim", "norm_1s_signal_dataset.pkl")
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [36]:
# Window and hop settings passed to the extractor; the feature file names below
# embed both values (units are not stated here -- presumably seconds, confirm)
window_size = 1
hop_length = 1

# Extractor with every feature family switched on, using sample_rate=10000
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [37]:
# Load the normalized, letter-segmented signal dataset
signal_dataset = SignalDataset.load(norm_letters_signal_dataset_path)

In [38]:
%%time

# Extract features and store in dataset
# NOTE: long-running step -- the recorded run reports a wall time of about 4 hours
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 4h 57min 5s
Wall time: 4h 1min 24s


In [39]:
# Dimensions of the extracted feature table
feat_dataset.features.shape

(8878, 187)

In [40]:
# Preview the first rows of the extracted feature table (label columns followed by feature columns)
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,lib_mfcc_4_avg,lib_mfcc_5_avg,lib_mfcc_6_avg,lib_mfcc_7_avg,lib_mfcc_8_avg,lib_mfcc_9_avg,lib_mfcc_10_avg,lib_mfcc_11_avg,lib_mfcc_12_avg,lib_mfcc_13_avg,lib_mfcc_1_std,lib_mfcc_2_std,lib_mfcc_3_std,lib_mfcc_4_std,lib_mfcc_5_std,lib_mfcc_6_std,lib_mfcc_7_std,lib_mfcc_8_std,lib_mfcc_9_std,lib_mfcc_10_std,lib_mfcc_11_std,lib_mfcc_12_std,lib_mfcc_13_std,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,mfcc_13_mean,chroma_1_mean,chroma_2_mean,chroma_3_mean,chroma_4_mean,chroma_5_mean,chroma_6_mean,chroma_7_mean,chroma_8_mean,chroma_9_mean,chroma_10_mean,chroma_11_mean,chroma_12_mean,chroma_std_mean,delta zcr_mean,delta energy_mean,delta energy_entropy_mean,delta spectral_centroid_mean,delta spectral_spread_mean,delta spectral_entropy_mean,delta spectral_flux_mean,delta spectral_rolloff_mean,delta mfcc_1_mean,delta mfcc_2_mean,delta mfcc_3_mean,delta mfcc_4_mean,delta mfcc_5_mean,delta mfcc_6_mean,delta mfcc_7_mean,delta mfcc_8_mean,delta mfcc_9_mean,delta mfcc_10_mean,delta mfcc_11_mean,delta mfcc_12_mean,delta mfcc_13_mean,delta chroma_1_mean,delta chroma_2_mean,delta chroma_3_mean,delta chroma_4_mean,delta chroma_5_mean,delta chroma_6_mean,delta chroma_7_mean,delta chroma_8_mean,delta chroma_9_mean,delta chroma_10_mean,delta chroma_11_mean,delta chroma_12_mean,delta 
chroma_std_mean,zcr_std,energy_std,energy_entropy_std,spectral_centroid_std,spectral_spread_std,spectral_entropy_std,spectral_flux_std,spectral_rolloff_std,mfcc_1_std,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,mfcc_10_std,mfcc_11_std,mfcc_12_std,mfcc_13_std,chroma_1_std,chroma_2_std,chroma_3_std,chroma_4_std,chroma_5_std,chroma_6_std,chroma_7_std,chroma_8_std,chroma_9_std,chroma_10_std,chroma_11_std,chroma_12_std,chroma_std_std,delta zcr_std,delta energy_std,delta energy_entropy_std,delta spectral_centroid_std,delta spectral_spread_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_rolloff_std,delta mfcc_1_std,delta mfcc_2_std,delta mfcc_3_std,delta mfcc_4_std,delta mfcc_5_std,delta mfcc_6_std,delta mfcc_7_std,delta mfcc_8_std,delta mfcc_9_std,delta mfcc_10_std,delta mfcc_11_std,delta mfcc_12_std,delta mfcc_13_std,delta chroma_1_std,delta chroma_2_std,delta chroma_3_std,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,zero_crossing_rate,root_mean_square_energy,slope_sign_changes_ratio,duration_seconds,flatness_ratio_10000,flatness_ratio_5000,flatness_ratio_1000,flatness_ratio_500,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-97.838631,69.27356,23.362997,19.670675,13.726768,11.218821,9.208938,8.885289,8.858075,8.523602,7.353079,6.539922,6.364826,118.226295,26.50104,9.37835,6.639277,5.881349,5.224013,4.339543,2.838975,1.978278,1.259007,1.849545,2.406303,1.947991,3.3e-05,0.088971,3.264958,0.030536,0.076987,0.004579,0.017933,0.0002,-63.783953,1.185665,0.148074,0.201866,0.089235,0.099855,0.060547,0.056997,0.035531,0.029857,0.01657,0.024122,0.015072,4.2e-05,8e-06,0.00133,4e-05,0.196965,2.3e-05,0.00524,3e-06,9e-06,3.2e-05,0.000315,1.1e-05,0.054824,0.0,-5.5e-05,-0.00078,0.000436,0.001053,4.5e-05,0.0002419726,0.0,0.386885,0.013045,0.00174,0.002347,0.000802,0.00051,-0.000105,0.000619,0.000662,0.000205,-9.7e-05,0.000459,8e-06,6.047754e-07,8.499934e-08,1.010486e-05,1.100097e-07,-3.009859e-05,1.824589e-07,5.7e-05,4.266557e-08,1.50061e-07,4.492763e-07,1.628202e-06,9.140872e-08,-9.687256e-06,0.000146,0.053164,0.097423,0.019238,0.044314,0.010767,0.027899,0.000872,18.33436,0.644265,0.104472,0.136991,0.080117,0.093298,0.064427,0.081773,0.057405,0.060885,0.061697,0.058945,0.059256,0.000122,2.2e-05,0.003948,0.000124,0.007312,6.9e-05,0.011435,6e-06,2.2e-05,6.5e-05,0.00103,3.5e-05,0.00067,0.000211,0.047956,0.120787,0.020865,0.048218,0.014285,0.032865,0.001265,20.755186,0.740586,0.126518,0.159216,0.100205,0.126061,0.089695,0.109876,0.078372,0.085441,0.090546,0.083349,0.086447,0.000166,3.1e-05,0.005522,0.000166,0.009742,9.5e-05,0.014809,8e-06,2.8e-05,8.2e-05,0.001463,4.7e-05,0.00082,1.7e-05,1.443897,0.018133,9.0,0.352622,0.352622,0.536244,0.616367,0.774433,1.7e-05,80464.14567,-0.069488,2.080011,1.442224,1.082166,-1.65149,1.428384,1.652581
1,1,1,2023-04-29,salad,1,1,G1,-151.758621,76.297348,25.773884,23.195232,17.153683,14.401384,11.833508,10.480188,9.294729,8.389806,7.522802,7.051421,6.99697,133.723511,27.914768,5.736088,4.853868,3.402934,2.528081,1.871545,1.410442,1.231589,1.254152,1.676308,1.889821,1.557738,5.6e-05,0.061664,3.262174,0.029286,0.063262,0.006898,0.052122,0.000422,-73.209831,0.857017,0.103996,0.149589,0.06356,0.07558,0.042261,0.046169,0.031771,0.03899,0.027234,0.018053,0.005316,6.8e-05,1.1e-05,0.003175,4.3e-05,0.195497,3.5e-05,0.006649,7e-06,1.6e-05,9.3e-05,0.000334,1.7e-05,0.05496,0.0,-0.005452,-0.000555,0.000314,0.000595,3.3e-05,0.0008260015,0.0,0.000785,0.000639,-1.6e-05,-8.2e-05,7.7e-05,0.000195,0.00034,0.000401,0.00021,-4.8e-05,-0.000138,-0.000108,1e-05,3.777188e-07,2.274788e-08,1.453678e-05,1.103207e-07,-1.986442e-05,1.676313e-07,3.2e-05,4.274459e-08,5.185454e-08,1.055283e-07,3.544756e-07,4.82052e-08,-6.545901e-06,0.000212,0.031881,0.124692,0.03018,0.066916,0.016484,0.07471,0.001473,24.990641,0.8394,0.125326,0.160711,0.095185,0.092746,0.073359,0.084875,0.074347,0.087786,0.083423,0.06512,0.069007,0.000204,2.7e-05,0.009846,0.000117,0.011784,0.00011,0.016113,1.8e-05,4e-05,0.000237,0.000868,4.8e-05,0.001538,0.000311,0.013422,0.167148,0.035288,0.080392,0.022627,0.102325,0.002125,30.416615,1.031922,0.16181,0.196821,0.123788,0.122129,0.107537,0.116526,0.105646,0.126138,0.119678,0.091077,0.098802,0.000292,3.7e-05,0.013848,0.000162,0.016631,0.000158,0.022809,2.6e-05,5.7e-05,0.000326,0.001212,6.8e-05,0.002225,6e-06,0.732309,0.002644,9.0,0.328122,0.328122,0.688267,0.812756,0.9828,2.1e-05,54194.021625,0.579218,0.200783,0.448089,0.144289,-2.380866,4.743071,1.640983
2,1,1,2023-04-29,salad,1,1,D1,-39.021881,89.484283,26.571812,23.835693,16.747375,13.777249,10.816953,9.473647,8.450934,7.695979,6.742785,6.240778,5.757567,74.634789,0.521752,0.517014,0.516875,0.381142,0.408188,0.682959,0.599139,0.880994,0.617856,0.334806,0.48208,0.731387,8.9e-05,0.18649,3.24774,0.04617,0.128797,0.007804,0.029033,0.000511,-36.167952,2.079668,0.264204,0.369089,0.171541,0.185854,0.090748,0.085453,0.048311,0.07351,0.044454,0.048333,0.041639,5.4e-05,1.2e-05,0.001944,6.4e-05,0.194609,5.2e-05,0.009793,7e-06,2e-05,5.7e-05,0.000537,1.4e-05,0.055029,0.0,0.000624,6.4e-05,-7e-05,-0.0001,-7e-06,4.527081e-05,0.0,-0.000118,-0.000197,-0.000254,0.000223,0.001039,0.00177,0.000972,-0.000479,-0.000549,-9.7e-05,-0.000681,0.000187,0.00072,-1.114784e-07,-4.332987e-09,-1.182349e-06,-9.670379e-08,2.380916e-06,-4.477795e-08,-4e-06,-3.393578e-09,-4.870197e-08,-3.643277e-08,3.406729e-07,-2.457836e-08,7.894906e-07,0.000319,0.127388,0.15018,0.029138,0.049793,0.01796,0.041468,0.00175,15.128978,0.534141,0.161547,0.169583,0.127013,0.10957,0.129995,0.143724,0.133723,0.135032,0.125536,0.109622,0.101762,0.000142,3.7e-05,0.005095,0.000186,0.014123,0.000153,0.026135,2e-05,4.9e-05,0.000132,0.001497,3.7e-05,0.002956,0.000433,0.065063,0.166706,0.02823,0.057625,0.021072,0.052685,0.002237,22.822836,0.810829,0.242546,0.256488,0.186482,0.159857,0.188615,0.200064,0.186581,0.192952,0.187032,0.15777,0.144459,0.000175,4.9e-05,0.006217,0.00024,0.016613,0.0002,0.030997,2.6e-05,6e-05,0.000163,0.001828,4.5e-05,0.00427,3.3e-05,1.401224,0.010444,9.0,0.0,0.0,0.039,0.279922,0.871789,2e-05,59588.992615,-1.019622,0.923799,0.961145,1.515033,-0.104159,-0.900896,1.679972
3,1,1,2023-04-29,salad,1,1,A2,-35.181309,88.256424,25.60961,22.896559,15.855588,12.917089,10.019514,8.652267,7.855108,7.729014,7.010394,5.972252,5.486945,61.873417,2.751151,2.160524,2.043172,1.934997,1.894169,1.97484,1.669991,0.551645,1.013694,1.244305,0.423966,0.286011,8.9e-05,0.123615,3.230978,0.046237,0.125896,0.008328,0.028481,0.000533,-37.175691,2.120212,0.26014,0.373992,0.163257,0.170304,0.093428,0.096943,0.058916,0.057099,0.043966,0.050175,0.034436,9.1e-05,1.4e-05,0.002065,5e-05,0.193622,3.8e-05,0.012225,7e-06,2.2e-05,7.8e-05,0.000406,1.7e-05,0.054771,0.0,0.000132,0.001057,-0.000433,-0.00105,-5.2e-05,0.0001110854,0.0,-0.378112,-0.012837,-0.001571,-0.002246,-0.000986,-0.001091,-0.000672,-0.000653,-0.000334,-0.0003,-0.00021,-0.000337,-0.000306,-1.633791e-07,-7.908155e-08,-8.845388e-06,-5.61237e-07,4.182813e-05,-1.469776e-07,-8.8e-05,-3.202806e-08,-9.100147e-08,-4.245099e-07,-1.640388e-06,-8.894644e-08,1.309342e-05,0.000217,0.053695,0.119859,0.02452,0.043831,0.013701,0.039527,0.001391,14.900734,0.519694,0.129107,0.128526,0.096237,0.102634,0.086337,0.102663,0.097634,0.107512,0.096927,0.090498,0.080579,0.000206,2.7e-05,0.004182,9.7e-05,0.012068,8.1e-05,0.023332,1.4e-05,4.2e-05,0.000145,0.000805,3.1e-05,0.002413,0.00027,0.032662,0.144298,0.025188,0.048007,0.017697,0.048755,0.002061,20.034402,0.703564,0.1739,0.168674,0.137296,0.159249,0.122341,0.145849,0.138655,0.14303,0.142541,0.127976,0.11251,0.000287,3.9e-05,0.005955,0.000135,0.016532,0.000111,0.031939,1.8e-05,5.8e-05,0.000195,0.001106,4e-05,0.003139,6.7e-05,1.631849,0.012356,9.0,0.0,0.0,0.053656,0.217267,0.8483,1.5e-05,87704.209952,-0.912563,1.83016,1.352834,1.442888,-1.178505,0.668149,1.672634
4,1,1,2023-04-29,salad,1,1,G2,-127.330101,88.080475,28.74946,24.44278,17.571262,14.89902,12.060678,10.527637,9.346112,8.616164,7.778655,7.169141,6.660098,109.931023,6.278783,4.672277,0.902494,0.954311,1.282215,1.246059,1.01189,1.154138,1.217876,1.117111,1.066348,1.162418,3.3e-05,0.139059,3.295916,0.023946,0.062572,0.003911,0.021432,0.000267,-67.812965,1.038983,0.111559,0.177122,0.079309,0.089514,0.041811,0.044066,0.032745,0.02858,0.019434,0.025346,0.023216,4.8e-05,5e-06,0.001268,2.3e-05,0.197185,1.5e-05,0.004966,4e-06,1.1e-05,3.1e-05,0.000205,5e-06,0.055365,0.0,-0.003991,1.5e-05,-0.000108,-0.000557,-2e-06,4.87891e-19,0.0,-0.385915,-0.013126,-0.002162,-0.002634,-0.001051,-0.00018,0.000776,4.7e-05,-0.001104,-0.000244,0.000517,-0.000372,-0.000684,-1.66439e-08,-1.123955e-09,-4.251626e-08,-1.976337e-08,5.833696e-07,-3.896364e-09,-1e-06,-1.97152e-10,-6.705402e-09,-9.549722e-09,-8.740924e-09,-1.435708e-09,1.951059e-07,0.000146,0.080914,0.056192,0.023802,0.058919,0.011343,0.035032,0.001019,24.906335,0.838537,0.136411,0.163105,0.082422,0.091643,0.079026,0.063215,0.056482,0.059726,0.056717,0.054456,0.051159,0.000173,1.4e-05,0.004208,7.1e-05,0.009819,5.1e-05,0.017751,1.3e-05,3.4e-05,9e-05,0.000696,1.5e-05,0.002082,0.000211,0.03203,0.058244,0.023484,0.067952,0.01395,0.037669,0.001356,32.630013,1.114373,0.1956,0.223128,0.112655,0.122641,0.11131,0.087495,0.079817,0.084577,0.076566,0.079936,0.073167,0.000237,1.6e-05,0.0054,8.7e-05,0.013283,6.8e-05,0.02447,1.6e-05,4.3e-05,0.000111,0.000916,1.8e-05,0.00295,1.7e-05,0.77145,0.003511,9.0,0.111556,0.334178,0.604156,0.740889,0.9786,2.2e-05,52998.819272,0.371457,0.457155,0.676132,0.577155,-1.348706,0.188683,1.734061


In [41]:
# Save the features dataset; the file name embeds window_size and hop_length.
# os.path.join keeps the path valid on non-Windows systems as well.
feat_norm_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_norm_letters_{}_{}_dataset.pkl".format(window_size, hop_length),
)
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [42]:
# Load the raw, letter-segmented signal dataset
signal_dataset = SignalDataset.load(raw_letters_signal_dataset_path)

In [43]:
%%time

# Extract features and store in dataset
# NOTE: long-running step -- the recorded run reports a wall time of about 9.5 hours
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 15min 25s
Wall time: 9h 31min 10s


In [44]:
# Save the features dataset; the file name embeds window_size and hop_length.
# os.path.join keeps the path valid on non-Windows systems as well.
feat_raw_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_raw_letters_{}_{}_dataset.pkl".format(window_size, hop_length),
)
feat_dataset.save(feat_raw_letters_dataset_path)