In [1]:
%load_ext autoreload
%autoreload 2
import os
# Set the PyTorch CUDA allocator configuration through its environment variable.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

In [2]:


from huggingface_hub import login
import os

token_file_path = 'secrets.txt'

# Log in to Hugging Face with a token read from a local file; failures are
# reported with a message instead of stopping the notebook.
if not os.path.exists(token_file_path):
    print(f"Token file not found at {token_file_path}. Please create the file and add your token.")
else:
    try:
        with open(token_file_path, mode='r') as token_file:
            # Surrounding whitespace/newlines are not part of the token.
            hf_token = token_file.read().strip()

        login(token=hf_token)
        print("Successfully logged in to Hugging Face!")

    except Exception as login_error:
        print(f"An error occurred while trying to read the token file or log in: {login_error}")



  from .autonotebook import tqdm as notebook_tqdm


Successfully logged in to Hugging Face!


In [3]:
from src.data_utils import SIFT50MDataset
import os
from datasets import load_dataset, Dataset

# Load the 'train' split of the 'closed_ended_content_level' configuration of SIFT-50M.
sift_dataset = load_dataset(
    'amazon-agi/SIFT-50M',
    # Alternative configuration, currently disabled:
    #name='closed_ended_comparison',
    name = 'closed_ended_content_level',
    split='train',
    # NOTE(review): trust_remote_code=True presumably lets the dataset repository's
    # loading script run locally — confirm the source is trusted (or pin a revision).
    trust_remote_code=True
)
 

# Data sources kept for this experiment.
allowed_values = ["common_voice_de", "common_voice_en", "vctk_en"]

# Row-level predicate used to filter the dataset.
def filter_data_source(example):
    """Return True when ``example["data_source"]`` is listed in ``allowed_values``."""
    source_name = example["data_source"]
    if source_name in allowed_values:
        return True
    return False

# Keep only the rows for which filter_data_source returns True
# (i.e. whose "data_source" is in allowed_values).
filtered_sift_data = sift_dataset.filter(filter_data_source)


In [4]:
len(filtered_sift_data)

3966476

In [4]:

# Keep a plain-string copy of the source label under "data_source_str", because
# "data_source" is class-encoded below for the stratified split.
# The guard makes this cell re-runnable: filtered_sift_data is rebound to the
# dataset that already carries the extra column, and adding a column whose name
# already exists raises.
if "data_source_str" not in filtered_sift_data.column_names:
    filtered_sift_data = filtered_sift_data.add_column(
        "data_source_str", filtered_sift_data["data_source"]
    )

# Encode "data_source" as class labels so it can be used for stratification.
stratifiable_dataset = filtered_sift_data.class_encode_column("data_source")

# 80/20 split with a fixed seed, stratified on the encoded data source.
train_validation_split = stratifiable_dataset.train_test_split(
    test_size=0.2,
    seed=40,
    stratify_by_column="data_source"
)

train_ds, eval_ds = train_validation_split["train"], train_validation_split["test"]



In [5]:
# Drop the class-encoded "data_source" column and put the original string
# labels back under that name.
# The splits are derived from train_validation_split rather than from
# train_ds/eval_ds themselves, so re-running this cell does not fail on a
# "data_source_str" column that a previous run already renamed away.
train_ds = (
    train_validation_split["train"]
    .remove_columns("data_source")
    .rename_column("data_source_str", "data_source")
)
eval_ds = (
    train_validation_split["test"]
    .remove_columns("data_source")
    .rename_column("data_source_str", "data_source")
)


In [21]:
len(eval_ds)

793296

In [6]:


base_datasets_root = "/home/jovyan/.cache/huggingface/datasets"

# Per-source local paths. Sources handled by load_dataset need no path (None);
# VCTK still needs a root path for torchaudio.
# "multilingual_librispeech_de" (also handled by load_dataset) is currently disabled.
base_datasets_paths = dict(
    common_voice_de=None,
    common_voice_en=None,
    vctk_en="./vctk_corpus",
)



In [7]:
import torch
import gc
from datasets import Dataset
from tqdm import tqdm

def create_filtered_hf_dataset(sift_iterable_dataset_raw):
    """
    Materialize an iterable of entries into a Hugging Face ``Dataset``.

    Every entry yielded by ``sift_iterable_dataset_raw`` is appended to a list,
    which is then handed to ``Dataset.from_list``. If iteration raises, the
    error is printed and the entries gathered up to that point are still
    converted.

    Returns:
        The resulting ``Dataset``, or ``None`` when no entry was collected.
    """
    print("Starting collection and conversion of evaluation data...")

    collected = []
    progress_label = "Collecting valid eval samples"

    try:
        for sample in tqdm(sift_iterable_dataset_raw, desc=progress_label):
            collected.append(sample)
    except Exception as err:
        # Best effort: report the failure and continue with the partial list.
        print(f"Error occurred during data collection: {err}")

    num_collected = len(collected)
    print(f"Finished collecting {num_collected} valid entries.")

    if num_collected == 0:
        print("Warning: Evaluation dataset is empty after filtering. Cannot create Dataset object.")
        return None

    hf_dataset = Dataset.from_list(collected)

    # Drop the intermediate list, then run the collector and clear the CUDA cache.
    del collected
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return hf_dataset

In [8]:
from src.data_utils import SIFT50MDataset

# Upper bound on the number of evaluation rows to materialize. It is clamped to
# the split size so select() is never given indices beyond the end of eval_ds.
EVAL_SUBSET_SIZE = 30000
eval_subset_size = min(EVAL_SUBSET_SIZE, len(eval_ds))

sift_iterable_dataset_raw_eval = SIFT50MDataset(
    sift_dataset=eval_ds.select(range(eval_subset_size)),
    base_datasets_paths=base_datasets_paths
)

# Iterate the wrapper once and keep whatever it yields as a regular Dataset
# (create_filtered_hf_dataset returns None when nothing was collected).
sift_iterable_dataset_eval_filtered = create_filtered_hf_dataset(sift_iterable_dataset_raw_eval)

#print(sift_iterable_dataset_eval_filtered[0])


Starting collection and conversion of evaluation data...


Collecting valid eval samples: 28249it [14:40, 32.10it/s]


Finished collecting 28249 valid entries.


In [9]:
%reload_ext autoreload
 
from src.train_qlora import train_model
from transformers import AutoProcessor
from src.data_utils import SIFT50MDataset
from src.data_collator import CustomDataCollator
import torch # Import torch for CUDA memory management
import gc    # Import garbage collector
import sys

def main(resume=True, train_subset_size=1000000):
    """
    Build the processor, collator and dataset wrappers, then call ``train_model``.

    Reads the notebook globals ``sift_iterable_dataset_eval_filtered``,
    ``train_ds`` and ``base_datasets_paths``.

    Args:
        resume: Forwarded to ``train_model`` as its ``resume`` argument.
        train_subset_size: Maximum number of leading rows of ``train_ds`` to
            use; clamped to ``len(train_ds)`` so ``select`` never receives
            out-of-range indices.

    Raises:
        SystemExit: With code 1 when setup or training raises an ``Exception``.
            The error message and the full traceback are printed first.
    """
    import traceback  # local import: only used on the failure path

    processor = None
    data_collator = None
    sift_iterable_dataset_eval = None
    sift_iterable_dataset_train = None

    try:
        print("Starting model setup...")

        # create_filtered_hf_dataset returns None when nothing was collected;
        # fail here with a clear message instead of inside SIFT50MDataset.
        if sift_iterable_dataset_eval_filtered is None:
            raise ValueError(
                "sift_iterable_dataset_eval_filtered is None; "
                "rebuild the evaluation dataset before training."
            )

        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", sampling_rate=16000)
        data_collator = CustomDataCollator(processor)

        num_train_rows = min(train_subset_size, len(train_ds))

        sift_iterable_dataset_eval = SIFT50MDataset(
            sift_dataset=sift_iterable_dataset_eval_filtered,
            base_datasets_paths=base_datasets_paths
        )
        sift_iterable_dataset_train = SIFT50MDataset(
            sift_dataset=train_ds.select(range(num_train_rows)),
            base_datasets_paths=base_datasets_paths
        )

        print("Starting model training...")
        train_model(
            eval_ds=sift_iterable_dataset_eval,
            train_ds=sift_iterable_dataset_train,
            processor=processor,
            custom_data_collator=data_collator,
            resume=resume
        )
        print("Training completed successfully.")

    except Exception as e:
        print(f"An error or exception occurred during execution: {e}")
        # str(e) alone does not show where the failure happened; print the
        # traceback before exiting.
        traceback.print_exc()
        sys.exit(1)

    finally:
        print("Starting cleanup and memory release...")

        # Clear object references to aid garbage collection
        del processor
        del data_collator
        del sift_iterable_dataset_eval
        del sift_iterable_dataset_train

        # Force Python's garbage collector to run
        gc.collect()

        # Release unused GPU memory cached by PyTorch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print("PyTorch CUDA memory cache released.")

        print("Cleanup complete.")


In [10]:
main()

Starting model setup...
Starting model training...


Loading checkpoint shards: 100%|██████████| 5/5 [00:14<00:00,  2.86s/it]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'pad_token_id': 151643}.
	per_device_train_batch_size: 4 (from args) != 2 (from trainer_state.json)
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1200,0.5419,0.619307,3.570768,154916.0,0.843831
1400,0.5068,0.481996,3.549057,312928.0,0.873887
1600,0.4196,0.416185,3.70552,471779.0,0.885401


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training completed successfully.
Starting cleanup and memory release...
PyTorch CUDA memory cache released.
Cleanup complete.


In [11]:
import gc
import torch
# Remove the training entry point from the notebook namespace. pop() with a
# default keeps this cell re-runnable: a bare `del main` raises NameError once
# the name is already gone.
globals().pop("main", None)
gc.collect()
# Clear the CUDA cache only when CUDA is available, matching the other cleanup code.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [12]:

gc.collect()

0