In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from huggingface_hub import login
import os

# Log in to the Hugging Face Hub using a token stored in a local file, so the
# secret itself never appears in the notebook source.
token_file_path = 'secrets.txt'

if os.path.exists(token_file_path):
    try:
        with open(token_file_path, 'r') as f:
            hf_token = f.read().strip()  # .strip() removes any leading/trailing whitespace

        if hf_token:
            login(hf_token)
            print("Successfully logged in to Hugging Face!")
        else:
            # An empty (or whitespace-only) file would otherwise hand an empty
            # string to login() instead of a usable token.
            print(f"Token file at {token_file_path} is empty. Please add your token.")

    except Exception as e:
        # Deliberately best-effort: report the failure and let the notebook continue.
        print(f"An error occurred while trying to read the token file or log in: {e}")
else:
    print(f"Token file not found at {token_file_path}. Please create the file and add your token.")

  from .autonotebook import tqdm as notebook_tqdm


Successfully logged in to Hugging Face!


In [4]:
# Interactive help lookup for `login` (IPython `?` syntax). This is a
# development aid only; nothing later in the notebook depends on it.
?login

Signature:
login(
    token: Optional[str] = None,
    *,
    add_to_git_credential: bool = False,
    new_session: bool = True,
    write_permission: bool = False,
) -> None
Docstring:
Login the machine to access the Hub.

The `token` is persisted in cache and set as a git credential. Once done, the machine
is logged in and the access token will be available across all `huggingface_hub`
components. If `token` is not provided, it will be prompted to the user either with
a widget (in a notebook) or via the terminal.

To log in from outside of a script, one can also use `hf auth login` which is
a cli command that wraps [`login`].

<Tip>

[`login`] is a drop-in replacement method for [`notebook_login`] as it wraps and
extends its capabilities.

</Tip>

<Tip>

When the token is not passed, [`login`] will automatically detect if the script runs
in a notebook or not. H

In [3]:
from src.data_utils import SIFT50MDataset
import os
from datasets import load_dataset, Dataset

# Load the closed-ended, content-level configuration of SIFT-50M, shuffle it
# with a fixed seed, and work on a small sample converted to pandas.
sift_dataset = load_dataset(
    'amazon-agi/SIFT-50M',
    name='closed_ended_content_level',
    split='train',
    trust_remote_code=True
)
sift_dataset = sift_dataset.shuffle(seed=90)

# Clamp the sample size so select() cannot index past the end of the split.
sample_size = min(1000, len(sift_dataset))
df = sift_dataset.select(range(sample_size)).to_pandas()
print(len(sift_dataset))

# The source datasets were taking too long to download, so the sample is
# restricted to these subsets ("multilingual_librispeech_de" is currently excluded).
allowed_values = ["common_voice_de", 'vctk_en', 'common_voice_en']

filtered_df = df[df["data_source"].isin(allowed_values)]

# Count number of entries per value
counts = filtered_df["data_source"].value_counts()
print(counts)

# After boolean filtering the frame no longer has a RangeIndex, so the default
# from_pandas() behaviour would store the old index as an extra
# `__index_level_0__` column; preserve_index=False keeps only the real columns.
sift_data = Dataset.from_pandas(filtered_df, preserve_index=False)
# Define the base datasets paths (replace with your actual paths)


14473775
data_source
common_voice_en    186
common_voice_de     88
vctk_en             13
Name: count, dtype: int64


In [None]:
# Hold out the first 20% of the filtered sample for evaluation; the remaining
# rows form the training split.
total_len = len(sift_data)
eval_len = int(0.2*total_len)

eval_ds, train_ds = (
    sift_data.select(range(start, stop))
    for start, stop in ((0, eval_len), (eval_len, total_len))
)

base_datasets_root = "/home/jovyan/.cache/huggingface/datasets"

# Sources mapped to None no longer need a path (handled by load_dataset).
# "multilingual_librispeech_de" is intentionally left out for now.
base_datasets_paths = dict.fromkeys(("common_voice_de", "common_voice_en"))
# VCTK still needs a root path for torchaudio.
base_datasets_paths["vctk_en"] = "./vctk_corpus"


In [None]:
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

# NOTE(review): the processor comes from the "-Instruct" checkpoint while the
# weights come from the base checkpoint -- confirm this pairing is intended.
PROCESSOR_CHECKPOINT = "Qwen/Qwen2-Audio-7B-Instruct"
MODEL_CHECKPOINT = "Qwen/Qwen2-Audio-7B"

processor = AutoProcessor.from_pretrained(PROCESSOR_CHECKPOINT)
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_CHECKPOINT,
    device_map="cuda",
)

In [None]:
import librosa
from dataset import DataLoader

def find_audio_paths(content_list):
    """Recursively collect every non-None ``audio_path`` from message content.

    Args:
        content_list: A single content item or a list/tuple of items. Dict
            items may carry an ``audio_path`` key or a nested ``content`` key;
            nested lists/tuples are searched as well.

    Returns:
        list: The audio paths found, in traversal order.
    """
    if not isinstance(content_list, (list, tuple)):
        content_list = [content_list]

    paths = []
    for item in content_list:
        if isinstance(item, dict):
            if item.get('audio_path') is not None:
                paths.append(item['audio_path'])
            elif 'content' in item:
                paths.extend(find_audio_paths(item['content']))
        elif isinstance(item, (list, tuple)):
            paths.extend(find_audio_paths(item))
    return paths


# Budget for newly generated tokens only. The original `max_length=512` also
# counted the prompt tokens, so a long prompt could leave no room for a reply.
MAX_NEW_TOKENS = 512

# NOTE(review): the `DataLoader` import at the top of this cell is no longer
# used here; the previous `DataLoader(sift_iterable, )` call referenced an
# undefined name and its result was never consumed.
sift_iterable_dataset_eval = SIFT50MDataset(sift_dataset=eval_ds, base_datasets_paths=base_datasets_paths)

# Loop-invariant: every clip is resampled to the rate the feature extractor expects.
target_sampling_rate = processor.feature_extractor.sampling_rate

for conversation in sift_iterable_dataset_eval:
    messages = conversation['messages']
    # NOTE(review): if `messages` already contains the reference assistant
    # turn, it becomes part of the prompt -- confirm against SIFT50MDataset.
    text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

    # Only non-assistant turns are searched for input audio.
    found_audio_paths = []
    for role_entry in messages:
        if 'content' in role_entry and 'role' in role_entry and role_entry['role'] != 'assistant':
            found_audio_paths.extend(find_audio_paths(role_entry['content']))

    audio_signals = []
    for path in found_audio_paths:
        if os.path.exists(path):
            audio, _ = librosa.load(path, sr=target_sampling_rate)
            audio_signals.append(audio)
        else:
            # Log which file was not found
            print(f"File not found: {path}. This may cause an audio-token mismatch.")

    # Pass None rather than an empty list when no audio could be loaded.
    inputs = processor(text=text, audio=audio_signals or None, return_tensors="pt", padding=True)
    # Move the whole batch to the model's device; the original moved only
    # `input_ids`, leaving the remaining tensors where the processor created them.
    inputs = inputs.to(model.device)

    generate_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)

