# Make Dataset

## Imports

In [1]:
import os
import sys
sys.path.append("../src")

In [2]:
from data.wav_data_reader import WavDataReader
from data.signal_dataset import SignalDataset
from features.wav_feature_extractor import WavFeatureExtractor
from features.features_dataset import FeaturesDataset
import data.preparation_eurythmy_data as ped

## Reader

In [3]:
# Folder holding the raw WAV recordings.
# Built with os.path.join so the path resolves on any OS; on Windows the result
# is the same string as the previous hard-coded r"..\data\raw\wav_files".
wav_folder = os.path.join("..", "data", "raw", "wav_files")
# Alternative input folder (interim/testing):
# wav_folder = os.path.join("..", "data", "interim", "testing")

In [4]:
# Create the reader for the WAV folder, using a sample rate of 10000
reader = WavDataReader(folder=wav_folder, sample_rate=10000)

# The reader hands back the signals together with the key of each file
signals, ids = reader.get_values_and_keys()

# Look up the measurement labels that correspond to those file keys
meas_df = ped.return_meas_labels_by_keys(ids)

Total WAV files read: 625


In [5]:
# Preview the first rows of the measurement-label table
meas_df.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


## Signal Dataset

### Raw

In [6]:
# Bundle the signals with their measurement labels into a SignalDataset
signal_dataset = SignalDataset(signals=signals, features=meas_df)

In [7]:
# Size of the features table (rows, columns) for the raw dataset
signal_dataset.features.shape

(625, 6)

In [8]:
# Preview the features table stored in the dataset
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy
0,1,1,2023-04-29,salad,1,1
1,2,1,2023-04-29,salad,1,1
2,3,1,2023-04-29,salad,1,1
3,4,1,2023-04-29,salad,2,0
4,5,1,2023-04-29,salad,2,0


In [9]:
# Persist the raw dataset, then reload it from the saved .pkl file.
# The path is built with os.path.join so it resolves on any OS; on Windows the
# result is the same string as the previous r"..\data\raw\raw_signal_dataset.pkl".
raw_signal_dataset_path = os.path.join("..", "data", "raw", "raw_signal_dataset.pkl")
signal_dataset.save(raw_signal_dataset_path)
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

### Normalized

In [10]:
# Apply the "zscore" standardization method to the signals of the dataset
signal_dataset.standardize_signals("zscore")

In [11]:
# Size of the features table (rows, columns) after standardizing the signals
signal_dataset.features.shape

(625, 6)

In [12]:
# Save the standardized dataset.
# The path is built with os.path.join so it resolves on any OS; on Windows the
# result is the same string as the previous r"..\data\interim\norm_signal_dataset.pkl".
norm_signal_dataset_path = os.path.join("..", "data", "interim", "norm_signal_dataset.pkl")
signal_dataset.save(norm_signal_dataset_path)

### Segmented by Letters (raw)

In [13]:
# Start again from the raw dataset saved on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [14]:
# Build the letter-segment dictionary for the measurement ids present in the dataset
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features["id_measurement"].tolist()
)

In [15]:
# Split the signals into letter segments, matching on the 'id_measurement' column.
# NOTE(review): 'eurythy_letter' looks like a typo of 'eurythmy_letter' — confirm
# how segment_signals_by_dict uses this argument before renaming it.
signal_dataset.segment_signals_by_dict(
    "id_measurement",
    letter_dictionary,
    "eurythy_letter",
)

In [16]:
# Size of the features table (rows, columns) after segmenting by letters
signal_dataset.features.shape

(8390, 7)

In [17]:
# Preview the features table after segmenting by letters
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,segment
0,1,1,2023-04-29,salad,1,1,A1
1,1,1,2023-04-29,salad,1,1,G1
2,1,1,2023-04-29,salad,1,1,D1
3,1,1,2023-04-29,salad,1,1,A2
4,1,1,2023-04-29,salad,1,1,G2


In [18]:
# Output file for the raw dataset segmented by letters.
# Built with os.path.join so the path resolves on any OS; on Windows the result
# is the same string as the previous hard-coded backslash path.
raw_letters_signal_dataset_path = os.path.join("..", "data", "raw", "raw_letters_signal_dataset.pkl")

In [19]:
# Write the letter-segmented raw dataset to disk
signal_dataset.save(raw_letters_signal_dataset_path)

### Segmented by Letters (normalized)

In [20]:
# Start again from the standardized dataset saved on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [21]:
# Build the letter-segment dictionary for the measurement ids present in the
# (normalized) dataset loaded above
letter_dictionary = ped.return_letter_dictionary(
    indexes=signal_dataset.features["id_measurement"].tolist()
)

In [22]:
# Split the normalized signals by the letter dictionary, matching on 'id_measurement'.
# NOTE(review): 'eurythy_letter' looks like a typo of 'eurythmy_letter' — confirm
# how segment_signals_by_dict uses this argument before renaming it.
signal_dataset.segment_signals_by_dict(
    "id_measurement",
    letter_dictionary,
    "eurythy_letter",
)

In [23]:
# Output file for the normalized dataset segmented by letters.
# Built with os.path.join so the path resolves on any OS; on Windows the result
# is the same string as the previous hard-coded backslash path.
norm_letters_signal_dataset_path = os.path.join("..", "data", "interim", "norm_letters_signal_dataset.pkl")

In [24]:
# Write the letter-segmented normalized dataset to disk
signal_dataset.save(norm_letters_signal_dataset_path)

### Segmented in 1s (raw)

In [25]:
# Start again from the raw dataset saved on disk
signal_dataset = SignalDataset.load(raw_signal_dataset_path)

In [26]:
# Cut the signals into segments of 1 second each
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [27]:
# Attach the eurythmy letter information to the features table.
# NOTE(review): the return value is bound to `df` but not used in the visible cells;
# later cells read signal_dataset.features — confirm add_meas_letters updates it in place.
df = ped.add_meas_letters(signal_dataset.features)

In [28]:
# Size of the features table (rows, columns) after the 1 s segmentation
signal_dataset.features.shape

(148682, 8)

In [29]:
# Inspect rows 15-19 of the features table, including the eurythmy_letter column
signal_dataset.features.iloc[15:20]

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
15,1,1,2023-04-29,salad,1,1,15.0,
16,1,1,2023-04-29,salad,1,1,16.0,
17,1,1,2023-04-29,salad,1,1,17.0,A1
18,1,1,2023-04-29,salad,1,1,18.0,A1
19,1,1,2023-04-29,salad,1,1,19.0,A1


In [30]:
# Save the raw dataset segmented in 1 s pieces.
# The path is built with os.path.join so it resolves on any OS; on Windows the
# result is the same string as the previous r"..\data\raw\raw_1s_signal_dataset.pkl".
raw_1s_signal_dataset_path = os.path.join("..", "data", "raw", "raw_1s_signal_dataset.pkl")
signal_dataset.save(raw_1s_signal_dataset_path)

### Segmented in 1s (normalized)

In [31]:
# Start again from the standardized dataset saved on disk
signal_dataset = SignalDataset.load(norm_signal_dataset_path)

In [32]:
# Cut the normalized signals into segments of 1 second each
signal_dataset.segment_signals_by_duration(segment_duration=1)

In [33]:
# Attach the eurythmy letter information to the features table.
# NOTE(review): the return value is bound to `df` but not used in the visible cells;
# the dataset saved next is signal_dataset — confirm add_meas_letters updates
# signal_dataset.features in place.
df = ped.add_meas_letters(signal_dataset.features)

In [34]:
# Save the normalized dataset segmented in 1 s pieces.
# The path is built with os.path.join so it resolves on any OS; on Windows the
# result is the same string as the previous r"..\data\interim\norm_1s_signal_dataset.pkl".
norm_1s_signal_dataset_path = os.path.join("..", "data", "interim", "norm_1s_signal_dataset.pkl")
signal_dataset.save(norm_1s_signal_dataset_path)

## Features Dataset

In [35]:
# Windowing parameters; they are also embedded in the file names of the saved
# feature datasets further down.
window_size = 1
hop_length = 1

# Extractor with the lib_mfccs, pyau_mfccs, temporal and statistical options enabled.
# sample_rate is the same value that was passed to WavDataReader.
feature_extractor = WavFeatureExtractor(
    sample_rate=10000,
    lib_mfccs=True,
    pyau_mfccs=True,
    temporal=True,
    statistical=True,
    window_size=window_size,
    hop_length=hop_length,
)

### Normalized letters

In [6]:
# Read back the normalized, letter-segmented signal dataset
signal_dataset = SignalDataset.load(norm_letters_signal_dataset_path)

In [7]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 4h 24min 7s
Wall time: 3h 9min 51s


In [8]:
# Size of the extracted features table (rows, columns)
feat_dataset.features.shape

(8390, 187)

In [9]:
# Preview the label columns followed by the extracted feature columns
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,segment,lib_mfcc_1_avg,lib_mfcc_2_avg,lib_mfcc_3_avg,...,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,A1,-85.835655,90.29126,27.150677,...,0.998622,2e-05,59933.361624,0.766171,0.456839,0.675898,1.182676,-0.391712,-0.974329,1.553715
1,1,1,2023-04-29,salad,1,1,G1,-96.628212,83.604851,26.647131,...,0.992111,3.5e-05,33676.824714,0.320248,0.232568,0.482253,0.272925,0.298048,1.306129,1.63642
2,1,1,2023-04-29,salad,1,1,D1,-109.998672,89.973495,27.083378,...,0.997756,1.3e-05,102873.344464,0.787085,0.25405,0.504034,1.091701,0.317148,-1.614749,1.616557
3,1,1,2023-04-29,salad,1,1,A2,-83.30658,89.631935,26.564987,...,0.988978,2.1e-05,67718.649555,0.612003,0.133273,0.365066,0.54585,0.745154,-0.625135,1.623486
4,1,1,2023-04-29,salad,1,1,G2,-65.591743,89.487083,26.376032,...,0.9949,2.4e-05,51573.974516,-0.305329,0.579268,0.761097,1.273651,-0.153654,-0.858505,1.643827


In [10]:
# List every column name of the features table (label columns and extracted features)
feat_dataset.features.columns.to_list()

['id_measurement',
 'id_performance',
 'datetime',
 'plant',
 'generation',
 'num_eurythmy',
 'segment',
 'lib_mfcc_1_avg',
 'lib_mfcc_2_avg',
 'lib_mfcc_3_avg',
 'lib_mfcc_4_avg',
 'lib_mfcc_5_avg',
 'lib_mfcc_6_avg',
 'lib_mfcc_7_avg',
 'lib_mfcc_8_avg',
 'lib_mfcc_9_avg',
 'lib_mfcc_10_avg',
 'lib_mfcc_11_avg',
 'lib_mfcc_12_avg',
 'lib_mfcc_13_avg',
 'lib_mfcc_1_std',
 'lib_mfcc_2_std',
 'lib_mfcc_3_std',
 'lib_mfcc_4_std',
 'lib_mfcc_5_std',
 'lib_mfcc_6_std',
 'lib_mfcc_7_std',
 'lib_mfcc_8_std',
 'lib_mfcc_9_std',
 'lib_mfcc_10_std',
 'lib_mfcc_11_std',
 'lib_mfcc_12_std',
 'lib_mfcc_13_std',
 'zcr_mean',
 'energy_mean',
 'energy_entropy_mean',
 'spectral_centroid_mean',
 'spectral_spread_mean',
 'spectral_entropy_mean',
 'spectral_flux_mean',
 'spectral_rolloff_mean',
 'mfcc_1_mean',
 'mfcc_2_mean',
 'mfcc_3_mean',
 'mfcc_4_mean',
 'mfcc_5_mean',
 'mfcc_6_mean',
 'mfcc_7_mean',
 'mfcc_8_mean',
 'mfcc_9_mean',
 'mfcc_10_mean',
 'mfcc_11_mean',
 'mfcc_12_mean',
 'mfcc_13_mean',
 

In [11]:
# Save the features extracted from the normalized, letter-segmented signals.
# The save call was previously commented out, so on a full re-run the result of the
# hours-long extraction was lost as soon as `feat_dataset` was reassigned further down.
# The file name encodes window_size and hop_length; the path is built with
# os.path.join so it resolves on any OS (same string as before on Windows).
feat_norm_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_norm_letters_" + str(window_size) + "_" + str(hop_length) + "_dataset.pkl",
)
feat_dataset.save(feat_norm_letters_dataset_path)

### Raw letters

In [36]:
# Read back the raw, letter-segmented signal dataset
signal_dataset = SignalDataset.load(raw_letters_signal_dataset_path)

In [37]:
%%time

# Extract features and store in dataset
feat_dataset= FeaturesDataset.from_signal_dataset(signal_dataset, feature_extractor)

  feature_values.append(func(waveform_data))


CPU times: total: 4h 4min 43s
Wall time: 2h 37min 45s


In [38]:
# Save the features extracted from the raw, letter-segmented signals.
# The file name encodes window_size and hop_length; the path is built with
# os.path.join so it resolves on any OS (same string as before on Windows).
feat_raw_letters_dataset_path = os.path.join(
    "..",
    "data",
    "processed",
    "feat_raw_letters_" + str(window_size) + "_" + str(hop_length) + "_dataset.pkl",
)
feat_dataset.save(feat_raw_letters_dataset_path)