# Make Dataset

## Imports

In [1]:
import os

from PlantReactivityAnalysis.data.wav_data_reader import WavDataReader
from PlantReactivityAnalysis.data.signal_dataset import SignalDataset
from PlantReactivityAnalysis.features.wav_feature_extractor import WavFeatureExtractor
from PlantReactivityAnalysis.features.features_dataset import FeaturesDataset
import PlantReactivityAnalysis.data.preparation_eurythmy_data as ped

In [2]:
import pandas as pd

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

## Reader

In [3]:
# Folder containing the raw WAV files.
# Built with os.path.join so the separator matches the current OS instead of
# a hard-coded Windows backslash (the result is unchanged on Windows).
wav_folder = os.path.join("..", "data", "raw", "wav_files")
# Alternative folder used for testing:
# wav_folder = os.path.join("..", "data", "interim", "testing")

In [4]:
# Create the reader over the WAV folder (sample_rate=10000).
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# Pull the signals together with their keys (ordered, per the method name).
signals, ids = reader.get_ordered_signals_and_keys()

# Derive the measurement-label table from the file keys.
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [5]:
# Preview the measurement labels derived from the file keys.
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [6]:
# Bundle the signals with their measurement labels into a SignalDataset.
signal_dataset = SignalDataset(signals=signals, features=meas_df)

In [7]:
# Shape check: the recorded run shows one row per WAV file read (625) and 6 label columns.
signal_dataset.features.shape

(625, 6)

In [8]:
# Preview the label table held by the dataset.
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [9]:
# Path of the raw signal dataset file, used by the save/load cells.
# os.path.join keeps the path valid on any OS (identical result on Windows).
raw_signal_dataset_path = os.path.join("..", "data", "raw", "raw_signal_dataset.pkl")

In [10]:
# Write the raw dataset to disk, then read it straight back.
signal_dataset.save(raw_signal_dataset_path)
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [11]:
# Standardize the signals using the "zscore" method.
# NOTE(review): the return value is not captured, so this presumably modifies
# signal_dataset in place — confirm in SignalDataset.
signal_dataset.standardize_signals("zscore")

In [12]:
# Shape check after standardization: still (625, 6) in the recorded run.
signal_dataset.features.shape

(625, 6)

In [3]:
# Save the standardized dataset.
# This cell needs `signal_dataset` from the cells above to exist in the kernel;
# the recorded NameError shows it was last run without it.
# os.path.join keeps the path valid on any OS (identical result on Windows).
norm_signal_dataset_path = os.path.join("..", "data", "interim", "norm_signal_dataset.pkl")
signal_dataset.save(norm_signal_dataset_path)

NameError: name 'signal_dataset' is not defined

### Segmented by Letters (raw)

In [14]:
# Start again from the raw dataset stored on disk.
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [15]:
# Build the letter dictionary for the measurement ids present in the raw dataset.
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [16]:
# Split the signals using the letter dictionary, keyed on 'id_measurement';
# the segment label ends up in the 'eurythmy_letter' column.
signal_dataset.segment_signals_by_dict(
    'id_measurement',
    letter_dictionary,
    'eurythmy_letter',
)

In [17]:
signal_dataset.features.shape # (8878, 7) in the recorded run; an earlier note here said 8390, 7

(8878, 7)

In [18]:
# Preview the per-letter rows (note the added eurythmy_letter column).
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [13]:
# Path of the raw, letter-segmented signal dataset file.
# os.path.join keeps the path valid on any OS (identical result on Windows).
raw_letters_signal_dataset_path = os.path.join("..", "data", "raw", "raw_letters_signal_dataset.pkl")

In [20]:
# Save the raw, letter-segmented dataset.
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [21]:
# Start from the standardized dataset stored on disk.
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [22]:
# Build the letter dictionary for the measurement ids present in the normalized dataset.
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features['id_measurement'].tolist()
)

In [23]:
# Split the normalized signals using the letter dictionary, keyed on
# 'id_measurement', labelling each segment under 'eurythmy_letter'.
signal_dataset.segment_signals_by_dict(
    'id_measurement',
    letter_dictionary,
    'eurythmy_letter',
)

In [6]:
# Path of the normalized, letter-segmented signal dataset file.
# os.path.join keeps the path valid on any OS (identical result on Windows).
norm_letters_signal_dataset_path = os.path.join("..", "data", "interim", "norm_letters_signal_dataset.pkl")

In [25]:
# Save the normalized, letter-segmented dataset.
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [26]:
# Start again from the raw dataset stored on disk.
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [27]:
# Cut every signal into segments of duration 1 (one second, per the section title).
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [28]:
# Add eurythmy letter data to the features.
# NOTE(review): the returned value is bound to `df` but is not used in the cells shown here.
# The recorded output further down shows an `eurythmy_letter` column on
# signal_dataset.features, so the call appears to update the table in place — confirm.
df= ped.add_meas_letters(signal_dataset.features)

In [29]:
# Shape check: (148682, 8) in the recorded run (one row per 1 s segment).
signal_dataset.features.shape

(148682, 8)

In [30]:
# Rows 15-19: in the recorded run the first letter label (A1) starts at initial_second 17.0.
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [31]:
# Save the raw dataset segmented into 1 s pieces.
# os.path.join keeps the path valid on any OS (identical result on Windows).
raw_1s_signal_dataset_path = os.path.join("..", "data", "raw", "raw_1s_signal_dataset.pkl")
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [32]:
# Start from the standardized dataset stored on disk.
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [33]:
# Segment the normalized signals into 1 s segments.
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [34]:
# Add eurythmy letter data to the features.
# NOTE(review): the returned value is bound to `df` but is not used in the cells shown here;
# the object saved next is signal_dataset, so the call presumably updates
# signal_dataset.features in place — confirm.
df= ped.add_meas_letters(signal_dataset.features)

In [35]:
# Save the normalized dataset segmented into 1 s pieces.
# os.path.join keeps the path valid on any OS (identical result on Windows).
norm_1s_signal_dataset_path = os.path.join("..", "data", "interim", "norm_1s_signal_dataset.pkl")
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [4]:
# Window and hop settings handed to the extractor; the same two values are
# embedded in the names of the feature files saved later in this notebook.
window_size = 2
hop_length = 2

# Extractor with all four feature flags (lib_mfccs, pyau_mfccs, temporal,
# statistical) enabled, at sample_rate=10000.
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [7]:
# Load the normalized, letter-segmented signal dataset.
signal_dataset = SignalDataset.load(norm_letters_signal_dataset_path)

In [8]:
%%time

# Extract features and store in dataset
# NOTE: expensive step — the recorded run took about 3h 46min wall time
# (result: 8878 rows x 187 columns). Avoid re-running unless the inputs changed.
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 2min 30s
Wall time: 3h 46min 9s


In [9]:
# Shape check: (8878, 187) in the recorded run.
feat_dataset.features.shape

(8878, 187)

In [10]:
# Preview the feature table: label columns first, followed by the extracted feature columns.
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,eurythmy_letter,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,lib_mfcc_4_avg,lib_mfcc_5_avg,lib_mfcc_6_avg,lib_mfcc_7_avg,lib_mfcc_8_avg,lib_mfcc_9_avg,lib_mfcc_10_avg,lib_mfcc_11_avg,lib_mfcc_12_avg,lib_mfcc_13_avg,lib_mfcc_1_std,lib_mfcc_2_std,lib_mfcc_3_std,lib_mfcc_4_std,lib_mfcc_5_std,lib_mfcc_6_std,lib_mfcc_7_std,lib_mfcc_8_std,lib_mfcc_9_std,lib_mfcc_10_std,lib_mfcc_11_std,lib_mfcc_12_std,lib_mfcc_13_std,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,mfcc_13_mean,chroma_1_mean,chroma_2_mean,chroma_3_mean,chroma_4_mean,chroma_5_mean,chroma_6_mean,chroma_7_mean,chroma_8_mean,chroma_9_mean,chroma_10_mean,chroma_11_mean,chroma_12_mean,chroma_std_mean,delta zcr_mean,delta energy_mean,delta energy_entropy_mean,delta spectral_centroid_mean,delta spectral_spread_mean,delta spectral_entropy_mean,delta spectral_flux_mean,delta spectral_rolloff_mean,delta mfcc_1_mean,delta mfcc_2_mean,delta mfcc_3_mean,delta mfcc_4_mean,delta mfcc_5_mean,delta mfcc_6_mean,delta mfcc_7_mean,delta mfcc_8_mean,delta mfcc_9_mean,delta mfcc_10_mean,delta mfcc_11_mean,delta mfcc_12_mean,delta mfcc_13_mean,delta chroma_1_mean,delta chroma_2_mean,delta chroma_3_mean,delta chroma_4_mean,delta chroma_5_mean,delta chroma_6_mean,delta chroma_7_mean,delta chroma_8_mean,delta chroma_9_mean,delta chroma_10_mean,delta chroma_11_mean,delta chroma_12_mean,delta 
chroma_std_mean,zcr_std,energy_std,energy_entropy_std,spectral_centroid_std,spectral_spread_std,spectral_entropy_std,spectral_flux_std,spectral_rolloff_std,mfcc_1_std,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,mfcc_10_std,mfcc_11_std,mfcc_12_std,mfcc_13_std,chroma_1_std,chroma_2_std,chroma_3_std,chroma_4_std,chroma_5_std,chroma_6_std,chroma_7_std,chroma_8_std,chroma_9_std,chroma_10_std,chroma_11_std,chroma_12_std,chroma_std_std,delta zcr_std,delta energy_std,delta energy_entropy_std,delta spectral_centroid_std,delta spectral_spread_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_rolloff_std,delta mfcc_1_std,delta mfcc_2_std,delta mfcc_3_std,delta mfcc_4_std,delta mfcc_5_std,delta mfcc_6_std,delta mfcc_7_std,delta mfcc_8_std,delta mfcc_9_std,delta mfcc_10_std,delta mfcc_11_std,delta mfcc_12_std,delta mfcc_13_std,delta chroma_1_std,delta chroma_2_std,delta chroma_3_std,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,zero_crossing_rate,root_mean_square_energy,slope_sign_changes_ratio,duration_seconds,flatness_ratio_10000,flatness_ratio_5000,flatness_ratio_1000,flatness_ratio_500,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-29.677195,84.908073,23.882944,20.263571,13.948898,10.488656,8.994204,9.384411,10.199335,9.457244,6.846077,5.528338,5.709393,102.744949,5.298808,5.641,4.197596,5.010337,4.307392,3.860651,1.488685,1.654878,1.541794,1.190974,2.1784,1.267976,4e-05,0.082078,3.263784,0.031297,0.078345,0.004577,0.017891,0.0002,-63.643573,1.191232,0.150986,0.205074,0.089766,0.098427,0.058357,0.057284,0.034696,0.028017,0.017937,0.026331,0.014771,4.1e-05,7e-06,0.001271,4.1e-05,0.196961,2.3e-05,0.005336,3e-06,9e-06,3.3e-05,0.000302,1.1e-05,0.054781,0.0,-9.9e-05,-0.001405,0.000784,0.001895,8.2e-05,0.0004355507,0.0,0.696394,0.02348,0.003132,0.004224,0.001443,0.000917,-0.000189,0.001114,0.001192,0.000368,-0.000174,0.000826,1.5e-05,1.088596e-06,1.529988e-07,1.818875e-05,1.980175e-07,-5.417746e-05,3.284261e-07,0.000103,7.679803e-08,2.701098e-07,8.086974e-07,2.930764e-06,1.645357e-07,-1.743706e-05,0.000213,0.050824,0.127872,0.02522,0.058365,0.013922,0.033209,0.001174,24.306081,0.847447,0.124643,0.165771,0.094005,0.108946,0.071663,0.090793,0.064725,0.071365,0.069437,0.067187,0.066383,0.000158,2.9e-05,0.005067,0.000163,0.009513,8.9e-05,0.014981,8e-06,2.8e-05,8.4e-05,0.001337,4.5e-05,0.000893,0.000306,0.044577,0.151355,0.022483,0.048718,0.017942,0.038806,0.001684,21.447741,0.769409,0.128675,0.159426,0.104661,0.13694,0.094367,0.117136,0.086324,0.096386,0.097812,0.091364,0.09525,0.00021,3.9e-05,0.006984,0.000214,0.012332,0.00012,0.018749,1e-05,3.4e-05,0.000102,0.001867,5.9e-05,0.001023,1.7e-05,1.443897,0.018133,9.0,0.352622,0.352622,0.536244,0.616367,0.774433,1.7e-05,80464.14567,-0.069488,2.080011,1.442224,1.082166,-1.65149,1.428384,1.652581
1,1,1,2023-04-29,salad,1,1,G1,-100.697861,86.480331,30.171656,24.11973,17.713215,15.169153,12.242166,10.664472,9.575659,8.48641,7.267684,6.799727,7.062698,124.047333,7.396371,5.954452,0.87334,0.997615,1.579578,1.28064,1.023308,1.342239,1.522926,1.815407,1.931284,1.195273,7e-05,0.056408,3.240514,0.033382,0.070873,0.008805,0.062599,0.00054,-71.013852,0.934344,0.113942,0.159921,0.065911,0.084083,0.049058,0.046657,0.033227,0.045732,0.026167,0.023701,0.007208,7.7e-05,1.5e-05,0.004001,5.9e-05,0.193966,3.9e-05,0.009268,9e-06,2.2e-05,0.000128,0.000421,2.1e-05,0.054876,0.0,-0.004936,-0.001,0.00061,0.001415,6e-05,0.001486803,0.0,0.338419,0.012071,0.00142,0.002018,0.001082,0.001277,0.000951,0.001007,0.000592,0.000304,6.7e-05,0.000182,0.000228,6.82064e-07,4.103929e-08,2.623151e-05,2.006327e-07,-3.584353e-05,3.023455e-07,5.7e-05,7.706319e-08,9.362724e-08,1.9058e-07,6.562898e-07,8.735929e-08,-1.181173e-05,0.00027,0.061636,0.167907,0.034744,0.076751,0.020003,0.097263,0.001729,29.705222,1.000498,0.142186,0.192368,0.114031,0.104708,0.08799,0.099894,0.078608,0.102883,0.100889,0.080662,0.079898,0.000214,3.6e-05,0.012218,0.000164,0.015067,0.000128,0.021524,2.3e-05,5.4e-05,0.00032,0.001055,5.7e-05,0.001799,0.000396,0.016894,0.225994,0.041033,0.090611,0.027381,0.132122,0.002485,33.715343,1.1465,0.176029,0.220901,0.145257,0.134314,0.125225,0.131326,0.110507,0.147746,0.141739,0.110147,0.111929,0.000306,5e-05,0.01698,0.000224,0.021255,0.000183,0.030644,3.2e-05,7.7e-05,0.000441,0.001462,8.1e-05,0.002617,6e-06,0.732309,0.002644,9.0,0.328122,0.328122,0.688267,0.812756,0.9828,2.1e-05,54194.021625,0.579218,0.200783,0.448089,0.144289,-2.380866,4.743071,1.640983
2,1,1,2023-04-29,salad,1,1,D1,23.327625,89.250244,26.221252,23.516384,16.444624,13.346057,10.357534,8.935259,7.906825,7.286381,6.588382,6.108628,5.640996,60.338959,0.365579,0.368098,0.392962,0.2656,0.190609,0.31861,0.528917,0.752072,0.466588,0.234723,0.38131,0.696444,0.00011,0.198888,3.243688,0.047064,0.130659,0.00852,0.0293,0.0006,-35.484054,2.098472,0.26608,0.372382,0.174787,0.186858,0.09107,0.08631,0.051997,0.072076,0.043029,0.049743,0.044389,6e-05,1.4e-05,0.002395,7e-05,0.193673,5.9e-05,0.01152,8e-06,2.2e-05,5.9e-05,0.000584,1.6e-05,0.055201,0.0,0.001632,0.000752,-0.000325,-0.000393,-7.8e-05,-0.0001271403,0.0,0.003235,-0.000322,0.000565,0.00218,0.001826,0.001351,0.001931,0.000162,-0.002211,-2.7e-05,-0.001508,0.000718,0.000597,-4.753641e-07,-8.824976e-08,-7.575922e-06,-3.715069e-07,3.04747e-05,-2.796045e-07,-6.2e-05,-8.587731e-08,-1.794641e-07,-1.956541e-07,3.896605e-08,-1.587936e-07,9.659535e-06,0.000422,0.181552,0.176638,0.031505,0.05066,0.022444,0.04782,0.002307,15.311098,0.543559,0.159506,0.164084,0.128049,0.110844,0.128876,0.14003,0.133731,0.134252,0.124542,0.107471,0.104524,0.000185,4.9e-05,0.007117,0.00024,0.018745,0.000203,0.034615,2.5e-05,5.8e-05,0.000154,0.001932,5e-05,0.003913,0.000554,0.078167,0.190472,0.029003,0.056592,0.025669,0.058351,0.002866,22.700861,0.806247,0.234575,0.247159,0.186039,0.161244,0.18531,0.193026,0.187899,0.191078,0.185796,0.155922,0.148978,0.000227,6.4e-05,0.008686,0.000303,0.021443,0.00026,0.039803,3.3e-05,6.9e-05,0.000187,0.002277,6.1e-05,0.005689,3.3e-05,1.401224,0.010444,9.0,0.0,0.0,0.039,0.279922,0.871789,2e-05,59588.992615,-1.019622,0.923799,0.961145,1.515033,-0.104159,-0.900896,1.679972
3,1,1,2023-04-29,salad,1,1,A2,14.845909,89.415977,26.323404,23.582859,16.572325,13.621408,10.702744,9.225565,8.070519,7.477998,6.700114,6.021388,5.641294,48.2757,0.317327,0.272721,0.285732,0.262369,0.292171,0.322221,0.416858,0.315227,0.340292,0.284067,0.280039,0.272692,8e-05,0.125519,3.23951,0.043867,0.122151,0.007537,0.026346,0.00048,-37.648317,2.105766,0.260733,0.371918,0.162661,0.170489,0.094643,0.095728,0.054302,0.05415,0.041564,0.0492,0.036355,8.3e-05,1.3e-05,0.001865,4.5e-05,0.194239,3.4e-05,0.011042,7e-06,2e-05,7.1e-05,0.000367,1.6e-05,0.054814,0.0,-0.0022,0.000959,-0.000449,-0.00134,-4.7e-05,0.0001904281,0.0,-0.6765,-0.023191,-0.004101,-0.006912,-0.003378,-0.001381,-0.00058,-0.000422,0.000586,-0.001661,0.000419,-0.000782,-0.000515,-1.544461e-07,-7.156323e-08,-8.042627e-06,-5.083975e-07,3.792033e-05,-1.362389e-07,-8e-05,-2.893988e-08,-8.27191e-08,-3.878571e-07,-1.489236e-06,-8.059749e-08,1.187632e-05,0.000238,0.114639,0.140752,0.029849,0.053218,0.015667,0.045692,0.001528,17.837538,0.614918,0.134647,0.137512,0.098725,0.108725,0.092424,0.1049,0.098382,0.117667,0.10482,0.095653,0.084537,0.000251,3.2e-05,0.005,0.000117,0.014145,9.3e-05,0.027255,1.6e-05,5e-05,0.000183,0.000952,3.7e-05,0.002801,0.000282,0.040608,0.162164,0.025767,0.049208,0.019629,0.051782,0.002207,23.160617,0.807444,0.182331,0.182377,0.143348,0.168167,0.127159,0.146827,0.142618,0.155136,0.1545,0.129711,0.115196,0.00035,4.5e-05,0.00696,0.000159,0.018861,0.000126,0.036385,2e-05,6.7e-05,0.000239,0.001263,4.6e-05,0.003709,6.7e-05,1.631849,0.012356,9.0,0.0,0.0,0.053656,0.217267,0.8483,1.5e-05,87704.209952,-0.912563,1.83016,1.352834,1.442888,-1.178505,0.668149,1.672634
4,1,1,2023-04-29,salad,1,1,G2,-51.320831,90.299484,27.18132,24.430511,17.382122,14.503346,11.689738,10.318517,9.089991,8.298073,7.493353,6.917458,6.392589,90.932266,0.71464,0.705178,0.706185,0.706996,0.706821,0.704474,0.703918,0.704904,0.704429,0.701087,0.698257,0.699985,3e-05,0.130685,3.298378,0.022262,0.057568,0.003529,0.019628,0.00024,-70.256599,0.957951,0.103165,0.16348,0.073303,0.082656,0.038803,0.040801,0.030035,0.026299,0.017793,0.023184,0.021146,4.3e-05,4e-06,0.001144,2.1e-05,0.197461,1.4e-05,0.00448,4e-06,1e-05,2.8e-05,0.000185,5e-06,0.055355,0.0,-0.003592,1.3e-05,-9.7e-05,-0.000501,-2e-06,5.529431e-19,0.0,-0.347323,-0.011814,-0.001946,-0.002371,-0.000946,-0.000162,0.000699,4.2e-05,-0.000993,-0.00022,0.000465,-0.000335,-0.000615,-1.497951e-08,-1.011559e-09,-3.826463e-08,-1.778704e-08,5.250326e-07,-3.506728e-09,-1e-06,-1.774368e-10,-6.034862e-09,-8.59475e-09,-7.866832e-09,-1.292137e-09,1.755953e-07,0.00015,0.109744,0.05676,0.02416,0.059247,0.011864,0.037923,0.001071,26.591407,0.892106,0.14248,0.174391,0.085396,0.094232,0.08279,0.06418,0.056771,0.058036,0.055482,0.052266,0.048362,0.000198,1.4e-05,0.004417,7.2e-05,0.011264,5.2e-05,0.021073,1.3e-05,3.8e-05,9.2e-05,0.000826,1.6e-05,0.002363,0.000216,0.031344,0.058493,0.022818,0.067099,0.01464,0.038799,0.001431,33.015017,1.120694,0.193414,0.22158,0.108853,0.117628,0.111035,0.085365,0.079396,0.080952,0.073726,0.074666,0.068073,0.000271,1.6e-05,0.005832,8.8e-05,0.015451,6.8e-05,0.029219,1.6e-05,4.8e-05,0.000115,0.001094,2e-05,0.003393,1.7e-05,0.77145,0.003511,9.0,0.111556,0.334178,0.604156,0.740889,0.9786,2.2e-05,52998.819272,0.371457,0.457155,0.676132,0.577155,-1.348706,0.188683,1.734061


In [11]:
# Save the features extracted from the normalized letter segments.
# The file name embeds window_size and hop_length so runs with different
# settings do not overwrite each other.
# os.path.join keeps the path valid on any OS (identical result on Windows).
feat_norm_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_norm_letters_" + str(window_size) + "_" + str(hop_length) + "_dataset.pkl",
)
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [14]:
# Load the raw, letter-segmented signal dataset.
signal_dataset = SignalDataset.load(raw_letters_signal_dataset_path)

In [15]:
%%time

# Extract features and store in dataset
# NOTE: expensive step — the recorded run took about 4h wall time.
# Avoid re-running unless the inputs changed.
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 5h 8min 1s
Wall time: 4h 55s


In [16]:
# Save the features extracted from the raw letter segments.
# The file name embeds window_size and hop_length so runs with different
# settings do not overwrite each other.
# os.path.join keeps the path valid on any OS (identical result on Windows).
feat_raw_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_raw_letters_" + str(window_size) + "_" + str(hop_length) + "_dataset.pkl",
)
feat_dataset.save(feat_raw_letters_dataset_path)