Imports

In [1]:
import sys

# Make the project's `src` directory importable for the cells below.
# The path is relative, so it resolves against the kernel's working directory.
# Guarded so that re-running this cell does not keep appending duplicates.
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
from pathlib import Path

import pandas as pd

from data.wav_data_reader import WavDataReader
from data.signal_dataset import SignalDataset
from features.independent_wav_feature_extractor import IndependentWavFeatureExtractor
from features.features_dataset import FeaturesDataset
import data.preparation_eurythmy_data as ped

Reader

In [3]:
# Folder holding the test recordings, relative to the notebook's working directory.
# Built from path components so the separator matches the host OS: on Windows
# this yields the same string as before, and on POSIX it yields a valid path
# instead of a single name containing literal backslashes.
test_folder = str(Path("..") / "data" / "interim" / "testing")

In [4]:
# Reader over the data in `test_folder`, configured with the same 10000 sample
# rate that the dataset and feature extractor below are given.
# Author's note: in your case use extract_key=False.
reader = WavDataReader(
    folder=test_folder,
    sample_rate=10000,
    extract_key=True,
)

In [5]:
# Keys exposed by the reader; reused below as the signal labels and as the
# lookup input for the `ped` helper functions.
keys = reader.get_keys()

In [6]:
# Measurement labels looked up from the keys by the eurythmy preparation module.
meas_labels = ped.return_meas_labels_by_keys(keys)

Signal Dataset

In [7]:
# Wrap the reader's values in a SignalDataset, labelled with the reader's keys
# and using the same sample rate the reader was created with.
signal_dataset = SignalDataset(
    signals=reader.get_values(),
    labels=keys,
    sample_rate=10000,
)

# Standardize the signals with the "zscore" option.
signal_dataset.standardize_signals("zscore")

In [8]:
# Attach the measurement labels computed from the keys to the dataset.
signal_dataset.add_labels(meas_labels)

In [9]:
# Time intervals per measurement, looked up from the keys.
time_intervals = ped.return_meas_time_intervals(keys)

# Restrict the signals to those intervals, then cut them into segments.
signal_dataset.reduce_signals_given_intervals(time_intervals)
signal_dataset.segment_signals(segment_duration=1)  # presumably seconds — TODO confirm

# Labels are read only after reducing and segmenting, so they reflect the
# dataset as it stands at this point.
labels = signal_dataset.get_labels()

In [10]:
# Letter labels derived from the keys, the current dataset labels and the
# measurement time intervals.
letter_labels = ped.return_meas_letters(keys, labels, time_intervals)

In [11]:
# Attach the letter labels to the (already segmented) dataset.
signal_dataset.add_labels(letter_labels)

In [12]:
# Drop signals whose labels are NaN before extracting the data for the
# feature step (presumably segments with no letter assigned — TODO confirm).
signal_dataset.remove_signals_with_nan_labels()

In [13]:
# Pull the signals and their targets out of the dataset; both are passed to
# the feature extraction and FeaturesDataset cells below.
signals, targets = signal_dataset.get_data()

Feature Extractor

In [14]:
# Feature extractor configured with the same 10000 sample rate as the signals.
# NOTE(review): if n_fft and hop_length are expressed in samples, they
# correspond to 0.2 s windows with a 0.05 s hop at this rate — confirm.
extractor = IndependentWavFeatureExtractor(
    sample_rate=10000,
    n_mfcc=13,
    n_fft=2000,
    hop_length=500,
)

# Extract the MFCC, temporal and statistical feature groups for every signal.
all_features, feature_labels = extractor.extract_features_multiple_waveforms(
    waveforms=signals,
    mfccs=True,
    temporal=True,
    statistical=True,
)

  feature_values.append(func(waveform_data))


Features Dataset

In [15]:
# Bundle the extracted features with their targets and column labels.
feat_dataset = FeaturesDataset(
    features=all_features,
    targets=targets,
    feature_labels=feature_labels,
)

# Preview the first rows; as the last expression of the cell it is displayed.
feat_dataset.head()

Unnamed: 0,mfcc_1_avg,mfcc_2_avg,mfcc_3_avg,mfcc_4_avg,mfcc_5_avg,mfcc_6_avg,mfcc_7_avg,mfcc_8_avg,mfcc_9_avg,mfcc_10_avg,...,hjorth_complexity,hurst,hurst_r2,mean,variance,standard_deviation,interquartile_range,skewness,kurtosis,dfa
0,-394.696472,41.188519,17.060287,15.980519,13.269508,12.131279,11.005676,10.424345,9.895156,9.539418,...,10047.712778,,,0.798532,0.000256,0.015989,0.0,4.044715,14.359695,1.166856
1,-339.346985,69.101479,25.654896,22.290117,17.220842,14.407759,12.259462,11.038075,10.120054,9.394704,...,8979.443811,,,0.708258,0.01187,0.108948,0.072144,-2.697753,7.410974,1.404157
2,-134.742676,74.356049,15.740224,13.646601,7.173235,5.148582,4.481388,6.775134,8.556374,8.578783,...,27533.780331,,,-2.428805,1.463942,1.209935,1.731466,0.195608,-0.472574,1.899436
3,-129.837006,74.563416,15.547497,13.592448,6.508352,3.720716,3.01283,5.938602,9.694375,10.272288,...,8740.400726,,,-2.602803,1.104209,1.050813,1.515033,-0.081993,-0.633422,1.951924
4,-213.327225,87.220932,26.024193,23.286495,16.471249,13.276547,10.217538,9.119078,7.978412,7.074394,...,8157.016352,,,0.197959,0.176345,0.419935,0.577155,-1.12373,0.406367,1.749738


In [16]:
# `shape` is called as a method on FeaturesDataset (note the parentheses),
# unlike the `.shape` attribute used on `processed_features` further down.
# The recorded run returned (1238, 40) — presumably (samples, features).
feat_dataset.shape()

(1238, 40)

In [17]:
# Run the whole preprocessing pipeline in a single call.
# NOTE(review): in the recorded run this cell raised `KeyError: False` after the
# NaN-column removal, z-score normalization and outlier-treatment messages had
# been printed — presumably inside the feature-reduction step; TODO confirm and
# fix. The cells after this one were never executed in that run.
feat_dataset.preprocess_features(targets= targets)
# The string literal below is a no-op expression used as a block comment; it
# lists the individual steps that can be called instead of the one-shot method.
"""
You can also process it separately like this:
feat_dataset.remove_nan_columns()
feat_dataset.normalize_features(method='zscore')
feat_dataset.treat_outliers(iqr_multiplier=1.5)
feat_dataset.reduce_features(targets, corr_threshold=0.8) #You reduce the features that are correlated by more than the corr_treshold selecting by p_value
"""
# Fetch the processed features (presumably a pandas DataFrame, given the
# method name and the `.head()` / `.shape` usage) and preview the first rows.
processed_features= feat_dataset.get_features_dataframe()
processed_features.head()

Removed columns with NaNs: ['hurst', 'hurst_r2', 'skewness', 'kurtosis']
The Features were properly normalized using 'zscore' method.
Outliers have been treated based on the 1.5 * IQR criterion.


KeyError: False

In [None]:
# Size of the processed feature table, to compare with the (1238, 40) reported
# before preprocessing. Depends on `processed_features` from the cell above,
# which failed in the recorded run.
processed_features.shape

In [None]:
# Round-trip the dataset through a CSV file in the current working directory:
# write it out, load it back into a second FeaturesDataset, and preview it.
feat_dataset.save_to_csv(filepath="features.csv")
old_feat_dataset = FeaturesDataset.load_from_csv(filepath="features.csv")
old_feat_dataset.head()