In [1]:
import torch
from core.model import GPT, GPTConfig
from core.tokenizer import Tokenizer, AudioTokenizer
from huggingface_hub import hf_hub_download
import numpy as np
import soundfile as sf
from IPython.display import Audio


In [2]:
# Hub id whose published config is used to size the GPT built below.
BASE_MODEL_NAME = 'EleutherAI/pythia-410m'

# Construct the model from that config; checkpoint weights are loaded into it
# separately via load_state_dict.
model_config = GPTConfig.from_pretrained(BASE_MODEL_NAME)
model = GPT(model_config)

# Text tokenizer first, then the audio tokenizer (same construction order as before).
tokenizer, audio_tokenizer = Tokenizer(), AudioTokenizer()

number of parameters: 409.55M
Loading Audio Encoder


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Loaded Audio Encoder, size: 1029


In [3]:
# Step 1, load the model

model_name = 'alexedw/audio-clean-all-run-1'
model_checkpoint = '10000'

# Download the checkpoint file from the Hugging Face Hub; the checkpoint id is
# passed as the repository revision.
checkpoint_path = hf_hub_download(model_name, "model_state.pt", revision=model_checkpoint)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# downloaded checkpoint cannot execute arbitrary code while it is being loaded.
state_dict = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

# Strip the "_orig_mod." prefix from parameter names (presumably left behind by
# torch.compile when the checkpoint was saved) so the keys match `model`.
new_state_dict = {key.replace("_orig_mod.", ""): value for key, value in state_dict.items()}

# Left as the last expression so the key-matching report is displayed.
model.load_state_dict(new_state_dict)

<All keys matched successfully>

In [4]:
# prepare tokens
input_text = 'AND YOU REPLIED THAT I WOULD REPORT HIS DECISION TO YOU AND I PROMISED HIM THAT I WOULD BRING YOU INTO A MORE REASONABLE FRAME OF MIND ONLY THINK MY DEAR CHILD OF THE POSITION THAT YOU ARE LOSING'.lower()
tokenized_text = tokenizer.encode(input_text)
num_text_tokens = len(tokenized_text)

# Text stream: start marker, the encoded prompt, end marker.
full_text_tokens = np.concatenate([
    [tokenizer.start_text_id],
    tokenized_text,
    [tokenizer.end_text_id],
])

# Audio stream for the prompt: matching start/end markers with one filler id
# per text token, so it has the same length as the text stream.
full_audio_tokens = np.concatenate([
    [audio_tokenizer.start_text_id],
    [audio_tokenizer.text_id] * num_text_tokens,
    [audio_tokenizer.end_text_id],
])

# Add a leading batch dimension of size 1. `full_audio_tokens` itself stays a
# 1-D numpy array; both audio inputs get their own tensor copy of it.
full_text_tokens = torch.tensor(full_text_tokens[np.newaxis, :])
full_audio_tokens_1 = torch.tensor(full_audio_tokens[np.newaxis, :])
full_audio_tokens_2 = full_audio_tokens_1.clone()

# Sample 300 further steps (the progress bar total) from the prompt.
output_text, output_audio_1, output_audio_2 = model.generate(
    full_text_tokens,
    full_audio_tokens_1,
    full_audio_tokens_2,
    300,
    temperature=0.7,
    top_p=0.9,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/300 [00:00<?, ?it/s]

 86%|████████▌ | 257/300 [00:59<00:16,  2.64it/s]

In [None]:
# Slice out the generated part of both audio streams: skip the prompt plus one
# further position (presumably the start-of-audio marker -- TODO confirm) and
# run to the end of the generated sequence.
start_audio = len(full_audio_tokens) + 1
end_audio = len(output_text[0])

audio_tokens_1 = output_audio_1[0, start_audio:end_audio]
audio_tokens_2 = output_audio_2[0, start_audio:end_audio]

# Stack the two streams along a new leading axis and force every id into
# [0, 1023] before decoding (presumably the valid codebook range, so special
# ids cannot reach the decoder -- TODO confirm).
audio_tokens_stacked = torch.stack([audio_tokens_1, audio_tokens_2], dim=0).clamp(min=0, max=1023)

sound = audio_tokenizer.decode(audio_tokens_stacked)

wav_filename = "temp_audio2.wav"
# .cpu() is a no-op for CPU tensors but is required before .numpy() if the
# decoder returns its output on an accelerator.
sf.write(wav_filename, sound.detach().cpu().numpy(), 24000)
Audio(filename=wav_filename)

In [None]:
from datasets import load_dataset, Audio as AudioHF

# Load a small LibriSpeech sample and resample it to 24 kHz, the same rate the
# synthesis cell uses when writing audio.
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
librispeech_dummy = librispeech_dummy.cast_column("audio", AudioHF(sampling_rate=24000))
audio_sample = librispeech_dummy[0]["audio"]["array"]

# Indexed below as two token streams, [0] and [1].
audio_tokens_raw = audio_tokenizer.encode(audio_sample)

# Prompt layout: start marker, one entry per audio frame, end marker.
# NOTE(review): the text-stream filler uses audio_tokenizer.text_id; confirm
# this is the intended placeholder id for the text stream.
full_text_tokens = np.concatenate([np.array([tokenizer.start_audio_id]), np.array([audio_tokenizer.text_id] * len(audio_tokens_raw[0])), np.array([tokenizer.end_audio_id])])
full_audio_tokens_1 = np.concatenate([np.array([audio_tokenizer.start_audio_id]), audio_tokens_raw[0], np.array([audio_tokenizer.end_audio_id])])
full_audio_tokens_2 = np.concatenate([np.array([audio_tokenizer.start_audio_id]), audio_tokens_raw[1], np.array([audio_tokenizer.end_audio_id])])

# The text-to-speech cell hands model.generate torch tensors with a leading
# batch dimension; convert the numpy prompts the same way here.
full_text_tokens = torch.tensor(full_text_tokens).unsqueeze(0)
full_audio_tokens_1 = torch.tensor(full_audio_tokens_1).unsqueeze(0)
full_audio_tokens_2 = torch.tensor(full_audio_tokens_2).unsqueeze(0)

output_text, output_audio_1, output_audio_2 = model.generate(full_text_tokens, full_audio_tokens_1, full_audio_tokens_2, 25, temperature=0.7, top_p=0.9)

# output_text is batched (the audio-decoding cell indexes output_text[0] for the
# sequence), so decode the single sequence rather than the whole batch.
tokenizer.decode(output_text[0])