- Extract HuBert features from raw audio files. 
- The median length of audio from all cultures is 15 sec. 
- Each raw audio file is cropped to its first 15 s and passed through the HuBERT model; the extracted frame-level features are reduced to 128 dimensions with PCA and stored in the 'features' column of a dataframe.
- Arousal labels (read from each recording's `*_Arousal_A_Aligned.csv` file) are truncated to the number of extracted feature frames.
- This is done separately for each culture. 

In [1]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries used below; consider narrowing it to the
# specific category/module that is actually noisy.
warnings.filterwarnings('ignore')

In [6]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from transformers import Wav2Vec2Processor, HubertModel
import torch
from sklearn.decomposition import PCA
import librosa

In [3]:
# Load the HuBERT processor and encoder from one pretrained checkpoint.
# Both objects are module-level globals read by extract_features_and_labels.
# NOTE(review): loading prints a "newly initialized" warning for the
# pos_conv_embed weight-norm parameters; confirm with the installed
# transformers/torch versions that these weights are in fact restored from the
# checkpoint before trusting the extracted features.
HUBERT_CHECKPOINT = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(HUBERT_CHECKPOINT)
model = HubertModel.from_pretrained(HUBERT_CHECKPOINT)

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def extract_features_and_labels(input_folder, culture_key, n_components=128, random_state=0):
    """Extract PCA-reduced HuBERT features and arousal labels for one culture.

    Every sub-folder of ``input_folder`` whose name carries ``culture_key`` as
    its second ``_``-separated token is scanned for ``.wav`` files (files whose
    name ends in ``16khz.wav`` are skipped). The first 15 s of each clip is
    loaded at 16 kHz and passed through the module-level ``processor`` and
    ``model``; the last hidden state is kept as the frame-level features.
    Arousal labels are read from ``<folder_name>_Arousal_A_Aligned.csv`` in the
    same folder and truncated to at most the number of feature frames.

    A single PCA is fitted on the frames of *all* successfully processed clips
    of the call and then applied to every clip, so the reduced dimensions mean
    the same thing in every row of the result. (Fitting one PCA per clip, as
    this function previously did, yields a different basis for each clip and
    fails outright for clips with fewer frames than ``n_components``.)

    Parameters
    ----------
    input_folder : str
        Directory containing one sub-folder per recording.
    culture_key : str
        Culture tag to select, e.g. ``"C1"``.
    n_components : int, default 128
        Number of principal components to keep.
    random_state : int or None, default 0
        Seed forwarded to ``PCA`` so repeated runs give the same projection.

    Returns
    -------
    pandas.DataFrame
        One row per recording folder with columns ``features`` (stacked
        reduced frames, ``n_components`` columns), ``label`` (concatenated
        arousal values) and ``culture_id``. Empty when nothing was processed.

    Raises
    ------
    ValueError
        Propagated from ``PCA`` when the total number of collected frames is
        smaller than ``n_components``.

    Notes
    -----
    Per-clip failures (unreadable audio, missing label file, ...) are reported
    with ``print`` and the clip is skipped; features and labels are only kept
    together, so a failed label read can no longer leave orphaned features.

    NOTE(review): labels are paired with feature frames purely by row index.
    This assumes the label CSV has (at least) one row per HuBERT frame at the
    same frame rate - confirm against the annotation sampling rate.
    """
    folder_records = []  # one dict per recording folder; features still unreduced

    # Sorted iteration makes the row order of the result reproducible.
    for folder_name in sorted(os.listdir(input_folder)):
        folder_path = os.path.join(input_folder, folder_name)
        name_parts = folder_name.split('_')

        # Skip stray entries (no '_' in the name, other cultures, plain files)
        # instead of crashing on the [1] lookup.
        if len(name_parts) < 2 or name_parts[1] != culture_key:
            continue
        if not os.path.isdir(folder_path):
            continue

        culture_id = name_parts[1]
        label_file_path = os.path.join(folder_path, f"{folder_name}_Arousal_A_Aligned.csv")
        clip_features = []
        clip_labels = []

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.wav') or file_name.endswith('16khz.wav'):
                continue

            audio_path = os.path.join(folder_path, file_name)
            try:
                audio, sr = librosa.load(audio_path, sr=16000, duration=15)
                inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

                with torch.no_grad():
                    # squeeze(0) drops the batch axis of the single-clip batch.
                    features = model(inputs.input_values).last_hidden_state.squeeze(0).numpy()

                labels = pd.read_csv(label_file_path)['arousal'].values[:len(features)]
            except Exception as e:
                # Best-effort: report and move on to the next clip.
                print(f"Error processing {audio_path} or {label_file_path}: {e}")
                continue

            # Append both only after both succeeded so the lists stay in sync.
            clip_features.append(features)
            clip_labels.append(labels)
            print(f"{audio_path} processed.")

        if clip_features:
            folder_records.append({
                'features': np.vstack(clip_features),
                'label': np.concatenate(clip_labels),
                'culture_id': culture_id,
            })

    if folder_records:
        # Fit one projection on every collected frame, then hand each folder
        # its own slice of the reduced matrix back.
        frame_counts = [len(record['features']) for record in folder_records]
        pca = PCA(n_components=n_components, random_state=random_state)
        reduced = pca.fit_transform(np.vstack([record['features'] for record in folder_records]))
        split_points = np.cumsum(frame_counts)[:-1]
        for record, reduced_block in zip(folder_records, np.split(reduced, split_points)):
            record['features'] = reduced_block

    return pd.DataFrame(folder_records)

# Root directory (relative to the notebook's working directory) whose
# sub-folders are scanned by extract_features_and_labels.
input_folder = 'SEWAv02'

In [15]:
# Culture C1: build the feature/label dataframe from the folders tagged "C1".
df_C1_PCA = extract_features_and_labels(input_folder, "C1")

SEWAv02/SSD_C1_S001_P001_VC1_004201_005201/SSD_C1_S001_VC1_004201_005201.wav processed.
SEWAv02/SVL_C1_S027_P053_VC1_002061_003937/SVL_C1_S027_VC1_002061_003937.wav processed.
SEWAv02/SSD_C1_S018_P036_VC1_002801_003601/SSD_C1_S018_VC1_002801_003601.wav processed.
SEWAv02/SVL_C1_S018_P036_VC1_002718_004073/SVL_C1_S018_VC1_002718_004073.wav processed.
SEWAv02/SSL_C1_S018_P035_VC1_002201_002701/SSL_C1_S018_VC1_002201_002701.wav processed.
SEWAv02/SAL_C1_S031_P062_VC1_001438_002301/SAL_C1_S031_VC1_001438_002301.wav processed.
SEWAv02/SSL_C1_S029_P057_VC1_003151_003901/SSL_C1_S029_VC1_003151_003901.wav processed.
SEWAv02/SSD_C1_S023_P045_VC1_003701_004001/SSD_C1_S023_VC1_003701_004001.wav processed.
SEWAv02/SSD_C1_S023_P045_VC1_001801_002201/SSD_C1_S023_VC1_001801_002201.wav processed.
SEWAv02/SAL_C1_S017_P034_VC1_001001_003801/SAL_C1_S017_VC1_001001_003801.wav processed.
SEWAv02/SSL_C1_S194_P388_VC1_005501_005901/SSL_C1_S194_VC1_005501_005901.wav processed.
SEWAv02/SAL_C1_S002_P004_VC1_002

In [18]:
# Culture C2: build the feature/label dataframe from the folders tagged "C2".
df_C2_PCA = extract_features_and_labels(input_folder, "C2")

SEWAv02/SVL_C2_S062_P123_VC1_000489_001586/SVL_C2_S062_VC1_000489_001586.wav processed.
SEWAv02/SVL_C2_S048_P096_VC1_000837_001711/SVL_C2_S048_VC1_000837_001711.wav processed.
SEWAv02/SVL_C2_S050_P099_VC1_003700_004484/SVL_C2_S050_VC1_003700_004484.wav processed.
SEWAv02/SAH_C2_S044_P088_VC1_000834_001365/SAH_C2_S044_VC1_000834_001365.wav processed.
SEWAv02/SAH_C2_S037_P074_VC1_002225_003088/SAH_C2_S037_VC1_002225_003088.wav processed.
SEWAv02/SSD_C2_S042_P083_VC1_002694_003374/SSD_C2_S042_VC1_002694_003374.wav processed.
SEWAv02/SVH_C2_S059_P117_VC1_000601_001353/SVH_C2_S059_VC1_000601_001353.wav processed.
SEWAv02/SAL_C2_S041_P081_VC1_003415_004258/SAL_C2_S041_VC1_003415_004258.wav processed.
SEWAv02/SVL_C2_S199_P397_VC1_004085_004313/SVL_C2_S199_VC1_004085_004313.wav processed.
SEWAv02/SSD_C2_S048_P096_VC1_004264_004849/SSD_C2_S048_VC1_004264_004849.wav processed.
SEWAv02/SVL_C2_S197_P393_VC1_000787_002868/SVL_C2_S197_VC1_000787_002868.wav processed.
SEWAv02/SSL_C2_S059_P118_VC1_001

In [19]:
# Culture C3: build the feature/label dataframe from the folders tagged "C3".
df_C3_PCA = extract_features_and_labels(input_folder, "C3")

SEWAv02/SSD_C3_S092_P183_VC1_003651_004631/SSD_C3_S092_VC1_003651_004631.wav processed.
SEWAv02/SVH_C3_S085_P170_VC1_000901_001561/SVH_C3_S085_VC1_000901_001561.wav processed.
SEWAv02/SSD_C3_S074_P148_VC1_001201_001631/SSD_C3_S074_VC1_001201_001631.wav processed.
SEWAv02/SVL_C3_S075_P150_VC1_005041_005637/SVL_C3_S075_VC1_005041_005637.wav processed.
SEWAv02/SAL_C3_S083_P165_VC1_003801_004791/SAL_C3_S083_VC1_003801_004791.wav processed.
SEWAv02/SVL_C3_S064_P127_VC1_001291_001731/SVL_C3_S064_VC1_001291_001731.wav processed.
SEWAv02/SSD_C3_S087_P173_VC1_002141_002821/SSD_C3_S087_VC1_002141_002821.wav processed.
SEWAv02/SSD_C3_S079_P157_VC1_006691_008021/SSD_C3_S079_VC1_006691_008021.wav processed.
SEWAv02/SAH_C3_S082_P164_VC1_000521_001701/SAH_C3_S082_VC1_000521_001701.wav processed.
SEWAv02/SSL_C3_S065_P129_VC1_002026_002801/SSL_C3_S065_VC1_002026_002801.wav processed.
SEWAv02/SAH_C3_S075_P150_VC1_001951_002465/SAH_C3_S075_VC1_001951_002465.wav processed.
SEWAv02/SAH_C3_S092_P184_VC1_001

In [20]:
# Culture C4: build the feature/label dataframe from the folders tagged "C4".
df_C4_PCA = extract_features_and_labels(input_folder, "C4")

SEWAv02/SSD_C4_S111_P222_VC1_000901_001550/SSD_C4_S111_VC1_000901_001550.wav processed.
SEWAv02/SAL_C4_S095_P189_VC1_004951_005150/SAL_C4_S095_VC1_004951_005150.wav processed.
SEWAv02/SSD_C4_S097_P194_VC1_002451_003100/SSD_C4_S097_VC1_002451_003100.wav processed.
SEWAv02/SAH_C4_S097_P193_VC1_003701_004200/SAH_C4_S097_VC1_003701_004200.wav processed.
SEWAv02/SSL_C4_S097_P193_VC1_006351_007100/SSL_C4_S097_VC1_006351_007100.wav processed.
SEWAv02/SVH_C4_S116_P232_VC1_005851_006550/SVH_C4_S116_VC1_005851_006550.wav processed.
SEWAv02/SVH_C4_S115_P229_VC1_006051_006550/SVH_C4_S115_VC1_006051_006550.wav processed.
SEWAv02/SAH_C4_S122_P244_VC1_003301_004400/SAH_C4_S122_VC1_003301_004400.wav processed.
SEWAv02/SAL_C4_S111_P221_VC1_000901_001550/SAL_C4_S111_VC1_000901_001550.wav processed.
SEWAv02/SAH_C4_S110_P219_VC1_001651_002950/SAH_C4_S110_VC1_001651_002950.wav processed.
SEWAv02/SAL_C4_S109_P217_VC1_000001_001150/SAL_C4_S109_VC1_000001_001150.wav processed.
SEWAv02/SAL_C4_S096_P191_VC1_006

In [21]:
# Culture C5: build the feature/label dataframe from the folders tagged "C5".
df_C5_PCA = extract_features_and_labels(input_folder, "C5")

SEWAv02/SSD_C5_S143_P285_VC1_002401_002651/SSD_C5_S143_VC1_002401_002651.wav processed.
SEWAv02/SVH_C5_S144_P287_VC1_003201_003701/SVH_C5_S144_VC1_003201_003701.wav processed.
SEWAv02/SSD_C5_S138_P275_VC1_001001_001401/SSD_C5_S138_VC1_001001_001401.wav processed.
SEWAv02/SAH_C5_S140_P279_VC1_002601_002901/SAH_C5_S140_VC1_002601_002901.wav processed.
SEWAv02/SAH_C5_S132_P263_VC1_001151_001601/SAH_C5_S132_VC1_001151_001601.wav processed.
SEWAv02/SSD_C5_S140_P280_VC1_001901_002051/SSD_C5_S140_VC1_001901_002051.wav processed.
SEWAv02/SAH_C5_S146_P292_VC1_002901_003151/SAH_C5_S146_VC1_002901_003151.wav processed.
SEWAv02/SAH_C5_S148_P295_VC1_002201_002651/SAH_C5_S148_VC1_002201_002651.wav processed.
SEWAv02/SSD_C5_S154_P307_VC1_000751_001201/SSD_C5_S154_VC1_000751_001201.wav processed.
SEWAv02/SVL_C5_S163_P325_VC1_003051_003451/SVL_C5_S163_VC1_003051_003451.wav processed.
SEWAv02/SVH_C5_S151_P301_VC1_006501_007001/SVH_C5_S151_VC1_006501_007001.wav processed.
SEWAv02/SSD_C5_S128_P255_VC1_005

In [22]:
# Culture C6: build the feature/label dataframe from the folders tagged "C6".
df_C6_PCA = extract_features_and_labels(input_folder, "C6")

SEWAv02/SSD_C6_S190_P380_VC1_001527_006847/SSD_C6_S190_VC1_001527_006847.wav processed.
SEWAv02/SSL_C6_S179_P358_VC1_000223_002217/SSL_C6_S179_VC1_000223_002217.wav processed.
SEWAv02/SVL_C6_S189_P377_VC1_000002_000540/SVL_C6_S189_VC1_000002_000540.wav processed.
SEWAv02/SSD_C6_S165_P330_VC1_002018_003436/SSD_C6_S165_VC1_002018_003436.wav processed.
SEWAv02/SSL_C6_S180_P359_VC1_001744_005672/SSL_C6_S180_VC1_001744_005672.wav processed.
SEWAv02/SVL_C6_S176_P352_VC1_000002_000748/SVL_C6_S176_VC1_000002_000748.wav processed.
SEWAv02/SAH_C6_S183_P366_VC1_002795_003506/SAH_C6_S183_VC1_002795_003506.wav processed.
SEWAv02/SSL_C6_S185_P369_VC1_001136_002527/SSL_C6_S185_VC1_001136_002527.wav processed.
SEWAv02/SVH_C6_S180_P359_VC1_000977_001457/SVH_C6_S180_VC1_000977_001457.wav processed.
SEWAv02/SAH_C6_S188_P376_VC1_002183_002875/SAH_C6_S188_VC1_002183_002875.wav processed.
SEWAv02/SSD_C6_S164_P328_VC1_000613_003352/SSD_C6_S164_VC1_000613_003352.wav processed.
SEWAv02/SVL_C6_S190_P379_VC1_000