In [2]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import logging
import torch.nn as nn

import os, sys
sys.path.insert(0, os.path.join(os.pardir, os.pardir))

from utils.data import save_pickle,load_pickle

# write a class for wav2vec
class wav2vec():
    def __init__(self, model_name="facebook/wav2vec2-base-960h", device="cpu"):
        self.model_name = model_name
        self.device = device
        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
        self.model = Wav2Vec2Model.from_pretrained(self.model_name).to(self.device)
        self.logger = logging.getLogger(__name__)
        self.logger.info("wav2vec model loaded.")
        self.mlp = torch.nn.Sequential(
            nn.linear(768, 256),
            nn.GELU(),
            nn.linear(256, 128),
        )

    def forward(self, audio, return_tensors="pt", padding="longest"):
        input_values = self.processor(audio, return_tensors=return_tensors, padding=padding).input_values.to(self.device)
        representation = self.model(input_values)
        last_hidden_state = representation.last_hidden_state
        return self.mlp(last_hidden_state.permute(0, 2, 1))

    def save(self, path):
        torch.save(self.model.state_dict(), path)
        self.logger.info("Wav2Vec2 model saved.")

    def load(self, path):
        self.model.load_state_dict(torch.load(path, map_location=self.device))
        self.logger.info("Wav2Vec2 model loaded.")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

Found cached dataset librispeech_asr_dummy (C:/Users/QWT/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)


In [11]:
import torch
import torch.nn.functional as F

# Define the original sample rate and target sample rate
original_sample_rate = 22050
target_sample_rate = 16000
time_length = 3
# Generate a random audio array as an example
audio_array = torch.randn(original_sample_rate * time_length)  
# Calculate the original time length
original_time_length = len(audio_array) / original_sample_rate
# Calculate the target time length
target_time_length = original_time_length * (target_sample_rate / original_sample_rate)
# Reshape the audio array to match the time length
audio_array = audio_array.view(1, 1, -1)
# Perform downsampling using interpolate function
# downsampled_array = F.interpolate(audio_array, size=(int(target_time_length * target_sample_rate)))
downsampled_array = F.interpolate(audio_array, size=(int(time_length * target_sample_rate)))
# Reshape the downsampled array to its final shape
downsampled_array = downsampled_array.view(-1)

# Print the shapes and sample rates
print("Original array shape:", audio_array.shape)
print("Downsampled array shape:", downsampled_array.shape)
print("Original sample rate:", original_sample_rate)
print("Target sample rate:", target_sample_rate)


Original array shape: torch.Size([1, 1, 66150])
Downsampled array shape: torch.Size([48000])
Original sample rate: 22050
Target sample rate: 16000


In [12]:
print('start read file')
file = load_pickle("F:\\PKU2\\curriculum\\computer_science\\NLP\\Hw4\\mycode\\data\\meg.gwilliams2022neural\\sub-01\\dataset.origin")
print('read file done')

start read file
read file done


In [14]:
print(file['train'][0].keys())
train,test = file['train'],file['test']

dict_keys(['name', 'audio', 'data', 'chan_pos'])


In [21]:
audio_length = len(file['train'][0]["audio"][1])
print(audio_length)
import numpy as np
batch_audio = np.zeros((2,audio_length))
batch_audio[0] = file['train'][0]["audio"][1]
batch_audio[1] = file['train'][1]["audio"][1]


input_values = processor(batch_audio,return_tensors="pt", padding="longest").input_values  # Batch size 1

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


66150
