In [3]:
import pandas as pd
import torch
import soundfile as sf
from transformers import HubertModel, Wav2Vec2Processor

In [4]:
def load_mosi_dataset(pkl_path):
    """Read the pickled MOSI dataset from disk and return the stored object.

    Args:
        pkl_path: Location of the pickle file, forwarded to ``pd.read_pickle``.

    Returns:
        Whatever object was serialized in the file.

    Note:
        Unpickling can execute arbitrary code; only load files from a
        trusted source.
    """
    dataset = pd.read_pickle(pkl_path)
    return dataset

def extract_features_hubert(audio_file, processor, model):
    """Compute HuBERT hidden states for a single audio file.

    Args:
        audio_file: Path (or file-like object) passed to ``soundfile.read``.
        processor: Callable invoked as
            ``processor(speech, sampling_rate=sr, return_tensors="pt")``;
            its result must expose ``.input_values``.
        model: Callable invoked on the input values; its result must expose
            ``.last_hidden_state``.

    Returns:
        The ``last_hidden_state`` returned by ``model``, computed with
        gradient tracking disabled.
    """
    # Load audio
    speech, sr = sf.read(audio_file)

    # soundfile yields a (frames, channels) array for multi-channel files.
    # The feature extractor treats a 2-D array as a batch of utterances, so
    # average the channels down to a single mono waveform first.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # NOTE(review): the file's native sampling rate is forwarded unchanged;
    # no resampling happens here, so files must already match the rate the
    # processor expects.
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state
    return hidden_states

# Load the preprocessing pipeline and the encoder from the same checkpoint so
# the two always stay in sync.
HUBERT_CHECKPOINT = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(HUBERT_CHECKPOINT)
model = HubertModel.from_pretrained(HUBERT_CHECKPOINT)

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Read the pickled dataset from the working directory.
mosi_dataset = load_mosi_dataset(pkl_path="mosi_raw.pkl")

# Inspect what is stored under the training split's 'audio' entry.
print(mosi_dataset['train']['audio'])

[[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 2.22000000e+02  6.66666687e-01  9.89001915e-02 ...  8.80259797e-02
    4.09209915e-02  1.33800760e-01]
  [ 2.13843750e+02  9.79166687e-01  1.60358369e-01 ... -1.16521502e-02
    6.68889210e-02  1.73782691e-01]
  [ 2.06714279e+02  7.61904776e-01  1.31449461e-01 ...  5.32066263e-02
    1.14056304e-01  2.07019895e-01]]

 [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 2.71062500e+02  2.5

In [9]:
def process_subset(subset, processor, model):
    """Extract HuBERT features for every sample in a dataset subset.

    Args:
        subset: Iterable of per-sample records. Each record is indexed with
            ``'audio_filepath'`` (path of the audio file) and ``'id'``
            (sample identifier).
        processor: Forwarded to ``extract_features_hubert``.
        model: Forwarded to ``extract_features_hubert``.

    Returns:
        dict mapping each sample's ``'id'`` to its extracted features, in
        iteration order. If two samples share an id, the later one wins.
        An empty subset yields an empty dict.
    """
    # NOTE(review): the printed contents of mosi_dataset['train']['audio']
    # suggest each subset may be a dict of arrays rather than a sequence of
    # per-sample records with file paths -- confirm the structure and adjust
    # the keys below before relying on this function.
    features_by_id = {}
    for sample in subset:
        audio_path = sample['audio_filepath']  # Adjust based on actual key for audio path
        sample_id = sample['id']  # Assuming there's an 'id' or similar identifier
        # Keep the result instead of discarding it so callers can use it.
        features_by_id[sample_id] = extract_features_hubert(audio_path, processor, model)
        print(f"Processed sample {sample_id}")
    return features_by_id

# Run feature extraction over the three dataset splits, in a fixed order.
for subset_name in ('train', 'valid', 'test'):
    print(f"Processing {subset_name} subset")
    subset_data = mosi_dataset[subset_name]
    process_subset(subset=subset_data, processor=processor, model=model)

Feature extraction complete.
