In [53]:
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import torchaudio
import torch
import numpy as np
import os

# Step 1: Load the pre-trained Wav2Vec 2.0 model and feature extractor
model_name = "facebook/wav2vec2-large-xlsr-53"
model = Wav2Vec2Model.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Path to the folder containing audio files
audio_folder = "/home/aaa470/Dataset/wav_files/wav"

# List all the audio files in the folder
audio_files = [f for f in os.listdir(audio_folder) if f.endswith(".wav")]
total_files = len(audio_files)

# Frame size of 1 second and step size of 200 ms in samples
frame_size = int(1 * feature_extractor.sampling_rate)
step_size = int(0.2 * feature_extractor.sampling_rate)

# Initialize an empty list to store all frame snippets
all_frame_snippets = []

# Loop through audio files
for idx, audio_file in enumerate(audio_files):
    # Load the audio file
    audio_input, sample_rate = torchaudio.load(os.path.join(audio_folder, audio_file))

    # Loop through frame snippets in the audio file
    for i in range(step_size, len(audio_input[0]), frame_size + step_size):
        start_time = i / sample_rate
        end_time = (i + frame_size) / sample_rate

        frame = audio_input[:, i - step_size:i + frame_size - step_size]
        inputs = feature_extractor(frame.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt")

        with torch.no_grad():
            embeddings = model(inputs.input_values).last_hidden_state

        # Append the current frame snippet to the all_frame_snippets list
        all_frame_snippets.append({
            "name": audio_file,
            "start_time": start_time,
            "end_time": end_time,
            "embeddings": embeddings[0].numpy()
        })

        remaining_files = total_files - idx
        # Print information about the processed snippet frame
        print(f"Processed frame for audio '{audio_file}':")
        print(f"  Start time: {start_time:.2f} seconds")
        print(f"  End time: {end_time:.2f} seconds")
        print(f"  Embeddings shape: {embeddings[0].shape}")
        print(f"Remaining files: {remaining_files}")
        print("=" * 40)

# Save the current progress to the output .npz file
np.savez("all_frame_snippets.npz", frame_snippets=all_frame_snippets)

print("All frame snippets saved step by step to all_frame_snippets.npz")
print(f"Total number of frame snippets: {len(all_frame_snippets)}")


Processed frame for audio 'M2BS24P3.wav':
  Start time: 0.20 seconds
  End time: 1.20 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  Start time: 1.40 seconds
  End time: 2.40 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  Start time: 2.60 seconds
  End time: 3.60 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  Start time: 3.80 seconds
  End time: 4.80 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  Start time: 5.00 seconds
  End time: 6.00 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  Start time: 6.20 seconds
  End time: 7.20 seconds
  Embeddings shape: torch.Size([49, 1024])
Remaining files: 1107
Processed frame for audio 'M2BS24P3.wav':
  St

In [54]:
import numpy as np

# Load the saved .npz file
saved_data = np.load("all_frame_snippets.npz", allow_pickle=True)

# Retrieve the frame snippets and embeddings
frame_snippets = saved_data["frame_snippets"]


In [57]:
len(frame_snippets)


19107