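# Make Dataset

Reads the wav files from the raw data folder, builds a standardized and segmented `SignalDataset`, extracts features into a `FeaturesDataset`, and saves both datasets under `data/processed`.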

## Imports

In [1]:
import sys

# Make modules under ../src importable (presumably where the `data` and
# `features` packages imported in the next cell live).
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
from data.wav_data_reader import WavDataReader
from data.signal_dataset import SignalDataset
from features.wav_feature_extractor import WavFeatureExtractor
from features.features_dataset import FeaturesDataset
import data.preparation_eurythmy_data as ped

## Reader

In [3]:
# Folder of wav files handed to the reader, relative to the notebook directory.
# Forward slashes are used so the path resolves on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
wav_folder = "../data/raw/wav_files"
# Alternative folder (presumably a smaller subset for testing) — uncomment to use it:
# wav_folder = "../data/interim/testing"

In [4]:
# Build a reader over the folder of wav files
reader = WavDataReader(folder=wav_folder)

# Pull the signals out of the reader together with their keys
signals, ids = reader.get_values_and_keys()

# Derive the measurement labels from those file keys
meas_df = ped.return_meas_labels_by_keys(ids)

## Signal Dataset

In [5]:
%%time
# Initialize SignalDataset with the signals and features/labels
signal_dataset= SignalDataset(signals= signals, features= meas_df)

# Standardize Signals using Zscore
signal_dataset.standardize_signals("zscore")

# Segment Signals in 1sec segments
signal_dataset.segment_signals(segment_duration=1)

# Remove signals whose values are all equal
#signal_dataset.remove_constant_signals()

#Add eurythmy letter data to the features
df= ped.add_meas_letters(signal_dataset.features)

CPU times: total: 4min 23s
Wall time: 5min 29s


In [6]:
# Shape of the signal dataset's feature table after the processing cell above
signal_dataset.features.shape

(148682, 8)

In [9]:
# Preview the first rows of the signal dataset's feature table
signal_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter
0,1,1,2023-04-29,salad,1,1,0.0,
1,1,1,2023-04-29,salad,1,1,1.0,
2,1,1,2023-04-29,salad,1,1,2.0,
3,1,1,2023-04-29,salad,1,1,3.0,
4,1,1,2023-04-29,salad,1,1,4.0,


In [10]:
# Persist the signal dataset.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
signal_dataset_path = "../data/processed/signal_dataset"
signal_dataset.save(signal_dataset_path)

## Feature Extractor

In [9]:
# Feature extractor with the mfccs, temporal and statistical options switched on,
# configured with a sample_rate of 10000
extractor = WavFeatureExtractor(
    sample_rate=10000,
    mfccs=True,
    temporal=True,
    statistical=True,
)

## Features Dataset

In [10]:
%%time
feat_dataset= FeaturesDataset(signal_dataset,extractor)
feat_dataset.features.shape

  feature_values.append(func(waveform_data))


len signals:  148682 len var feat:  148682 signal feat:  148682
id_measurement         0
id_performance         0
datetime               0
plant                  0
generation             0
num_eurythmy           0
initial_second         0
eurythmy_letter    48117
dtype: int64
mfcc_1_avg                     0
mfcc_2_avg                     0
mfcc_3_avg                     0
mfcc_4_avg                     0
mfcc_5_avg                     0
mfcc_6_avg                     0
mfcc_7_avg                     0
mfcc_8_avg                     0
mfcc_9_avg                     0
mfcc_10_avg                    0
mfcc_11_avg                    0
mfcc_12_avg                    0
mfcc_13_avg                    0
mfcc_1_std                     0
mfcc_2_std                     0
mfcc_3_std                     0
mfcc_4_std                     0
mfcc_5_std                     0
mfcc_6_std                     0
mfcc_7_std                     0
mfcc_8_std                     0
mfcc_9_std                    

(148682, 52)

In [11]:
# Preview the first rows of the features dataset's feature table
feat_dataset.features.head()

Unnamed: 0,id_measurement,id_performance,datetime,plant,generation,num_eurythmy,initial_second,eurythmy_letter,mfcc_1_avg,mfcc_2_avg,...,flatness_ratio_100,hjorth_mobility,hjorth_complexity,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,1,1,2023-04-29,salad,1,1,0.0,,-232.006348,87.030777,...,1.0,0.000128,8171.888932,1.013423,0.013814,0.117533,0.18195,0.224347,-0.509566,1.5868
1,1,1,2023-04-29,salad,1,1,1.0,,-250.255188,85.806961,...,0.9894,0.000129,8190.134755,0.865816,0.015855,0.125916,0.18195,0.418608,-0.598494,1.466508
2,1,1,2023-04-29,salad,1,1,2.0,,-278.646332,68.209419,...,0.9941,7.7e-05,13453.068166,1.28905,0.05486,0.234221,0.454875,-0.314866,-1.620937,1.336079
3,1,1,2023-04-29,salad,1,1,3.0,,-276.146942,74.985809,...,1.0,0.000112,9082.708501,1.374193,0.010778,0.103816,0.090975,-0.524587,0.561958,1.404778
4,1,1,2023-04-29,salad,1,1,4.0,,-299.724091,62.226551,...,1.0,8.5e-05,12209.774692,1.289022,0.029832,0.172718,0.272925,-0.852706,-0.735823,1.509514


In [15]:
# Save Dataset
feat_dataset_path= r"..\data\processed\features_dataset"
feat_dataset.save(feat_dataset_path)
feat_dataset_path_csv= r"..\data\processed\features_dataset.csv"
feat_dataset.save_to_csv(feat_dataset_path_csv)