### ✅ Step 1: Transcribe Audio Using Whisper ASR

In [6]:
import whisper
import torchaudio
import torch
import torchaudio.transforms as T
import numpy as np
import random
from datasets import load_dataset
import os
import gc
import json

# Set environment variable to avoid memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# 1. Load TORGO dataset
dataset = load_dataset("abnerh/TORGO-database")

# 2. Filter samples by speaker type
def filter_by_speaker_type(data, speaker_prefix):
    return [item for item in data if item['speech_status'].startswith(speaker_prefix)]

dysarthric_data = filter_by_speaker_type(dataset['train'], 'dysarthria')
healthy_data = filter_by_speaker_type(dataset['train'], 'healthy')

# 3. Randomly select 10 from each category
selected_dysarthric = random.sample(dysarthric_data, 10)
selected_healthy = random.sample(healthy_data, 10)
selected_samples = selected_dysarthric + selected_healthy

# 4. Check GPU availability
print(f"Number of GPUs available: {torch.cuda.device_count()}")
assert torch.cuda.device_count() >= 2, "Requires at least 2 GPUs."

# Clear GPU memory before loading the model
torch.cuda.empty_cache()

# 5. Load Whisper large-v2 model and set up DataParallel
try:
    base_model = whisper.load_model("large-v2")  # Try large-v2 first
except torch.cuda.OutOfMemoryError:
    print("Large-v2 model too big for GPU memory. Falling back to 'medium'.")
    base_model = whisper.load_model("medium")  # Fallback to medium if OOM

# Define device_ids explicitly
device_ids = [0, 1]  # Use GPU 0 and GPU 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
asr_model = torch.nn.DataParallel(base_model, device_ids=device_ids).to(device)
print(f"Model loaded on devices: {device_ids}")

# 6. Whisper sample rate
WHISPER_SR = 16000

# 7. Transcription function
def transcribe_whisper_array(audio_array, original_sr):
    audio_tensor = torch.tensor(audio_array, dtype=torch.float32)

    # Resample if sampling rate differs
    if original_sr != WHISPER_SR:
        resampler = T.Resample(orig_freq=original_sr, new_freq=WHISPER_SR)
        audio_tensor = resampler(audio_tensor)

    # Ensure mono
    if audio_tensor.ndim > 1:
        audio_tensor = torch.mean(audio_tensor, dim=0)

    # Convert to float32 numpy array
    audio_np = audio_tensor.numpy().astype(np.float32)

    # Transcribe with Whisper
    result = asr_model.module.transcribe(audio_np, language='en')
    
    # Clean up tensors within the function
    del audio_tensor, audio_np
    torch.cuda.empty_cache()
    return result["text"]

# 8. Transcribe and collect results
retrieved_samples = {}

for idx, sample in enumerate(selected_samples):
    try:
        audio_array = sample["audio"]["array"]
        sampling_rate = sample["audio"]["sampling_rate"]
        asr_transcript = transcribe_whisper_array(audio_array, sampling_rate)
        
        # Calculate duration in seconds
        duration = len(audio_array) / sampling_rate
        
        # Get original transcription if available
        transcription = sample.get("transcription", "No original transcription available")
        
        # Get speech status and gender
        speech_status = sample["speech_status"]
        gender = sample.get("gender", "Unknown")  # Fallback if gender field is missing
        
        # Store results in a structured dictionary
        retrieved_samples[f"sample_{idx}"] = {
            "speech_status": speech_status,
            "gender": gender,
            "duration_seconds": round(duration, 2),  # Rounded to 2 decimal places
            "original_transcript": transcription,
            "asr_transcript": asr_transcript
        }
        
        # Print for console feedback (optional)
        print(f"Sample {idx} (Speech Status: {speech_status}, Gender: {gender}):")
        print(f"  Duration: {round(duration, 2)} seconds")
        print(f"  Original Transcript: {transcription}")
        print(f"  ASR Transcript: {asr_transcript}")
        print("-" * 50)
        
    except torch.cuda.OutOfMemoryError:
        print(f"Out of memory at sample {idx}. Clearing cache and retrying.")
        torch.cuda.empty_cache()
        asr_transcript = transcribe_whisper_array(audio_array, sampling_rate)
        
        # Calculate duration in seconds
        duration = len(audio_array) / sampling_rate
        
        # Retry with same output logic
        transcription = sample.get("transcription", "No original transcription available")
        speech_status = sample["speech_status"]
        gender = sample.get("gender", "Unknown")
        retrieved_samples[f"sample_{idx}"] = {
            "speech_status": speech_status,
            "gender": gender,
            "duration_seconds": round(duration, 2),
            "original_transcript": transcription,
            "asr_transcript": asr_transcript
        }
        
        print(f"Sample {idx} (Speech Status: {speech_status}, Gender: {gender}) after retry:")
        print(f"  Duration: {round(duration, 2)} seconds")
        print(f"  Original Transcript: {transcription}")
        print(f"  ASR Transcript: {asr_transcript}")
        print("-" * 50)
        
    finally:
        # Clear GPU memory after each sample
        torch.cuda.empty_cache()
        gc.collect()

# 9. Convert results to JSON and save/output
json_output = json.dumps(retrieved_samples, indent=4)
print("Results in JSON format:")
print(json_output)

# Optionally save to a file
with open("transcription_results.json", "w") as f:
    f.write(json_output)
print("Results saved to 'transcription_results.json'")

# 10. Clean up after transcription is complete
print("Transcription completed.")
del asr_model, base_model  # Explicitly delete model objects
torch.cuda.empty_cache()  # Clear cache one last time
gc.collect()  # Final garbage collection
print("CUDA memory released.")

  from .autonotebook import tqdm as notebook_tqdm


Number of GPUs available: 2
Model loaded on devices: [0, 1]
Sample 0 (Speech Status: dysarthria, Gender: male):
  Duration: 3.16 seconds
  Original Transcript: storm
  ASR Transcript:  It's done.
--------------------------------------------------
Sample 1 (Speech Status: dysarthria, Gender: female):
  Duration: 4.45 seconds
  Original Transcript: She wore warm fleecy woolen overalls
  ASR Transcript:  Okay. She wore a white fleece brown overalls.
--------------------------------------------------
Sample 2 (Speech Status: dysarthria, Gender: female):
  Duration: 1.35 seconds
  Original Transcript: single
  ASR Transcript:  single
--------------------------------------------------
Sample 3 (Speech Status: dysarthria, Gender: male):
  Duration: 2.11 seconds
  Original Transcript: alpha
  ASR Transcript:  Alpha
--------------------------------------------------
Sample 4 (Speech Status: dysarthria, Gender: male):
  Duration: 3.0 seconds
  Original Transcript: sip
  ASR Transcript:  Sip.
---

### ✅ Step 2: Text to IPA

In [2]:
import json
from process_text import load_model, generate_ipa
import gc

# Load the LLaMA model for IPA generation
llama_model = load_model("/media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/mixtral-8x7b-v0.1.Q8_0.gguf")
print("LLaMA model loaded for IPA generation.")

# Load the existing JSON file
input_json_file = "transcription_results.json"  # Your existing file from Whisper
with open(input_json_file, "r") as f:
    retrieved_samples = json.load(f)

# Process each sample to add IPA
for sample_key, sample_data in retrieved_samples.items():
    # Extract transcripts
    original_transcript = sample_data["original_transcript"]
    asr_transcript = sample_data["asr_transcript"]
    
    # Generate IPA
    original_ipa = generate_ipa(llama_model, original_transcript) if original_transcript != "No original transcription available" else "N/A"
    asr_ipa = generate_ipa(llama_model, asr_transcript)
    
    # Add IPA to the sample data
    sample_data["original_ipa"] = original_ipa
    sample_data["asr_ipa"] = asr_ipa
    
    # Optional: Print for feedback
    print(f"Processed {sample_key}:")
    print(f"  Original Transcript: {original_transcript}")
    print(f"  ASR Transcript: {asr_transcript}")
    print(f"  Original IPA: {original_ipa}")
    print(f"  ASR IPA: {asr_ipa}")
    print("-" * 50)

# Save the updated JSON to a new file
output_json_file = "transcription_results_with_ipa.json"
json_output = json.dumps(retrieved_samples, indent=4)
with open(output_json_file, "w") as f:
    f.write(json_output)
print(f"Updated results saved to '{output_json_file}'")

# Clean up
del llama_model
gc.collect()
print("Memory cleaned up.")

llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3060) - 11169 MiB free
llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3060) - 11832 MiB free
llama_model_loader: loaded meta data with 25 key-value pairs and 995 tensors from /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/mixtral-8x7b-v0.1.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mixtral-8x7b-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_m

Attempting to load model from /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/mixtral-8x7b-v0.1.Q8_0.gguf
Model loading failed: Failed to load model from file: /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/mixtral-8x7b-v0.1.Q8_0.gguf
Model cleaned up.
Retrying after cleanup...
Permanent model loading failure: Failed to load model from file: /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/mixtral-8x7b-v0.1.Q8_0.gguf
LLaMA model loaded for IPA generation.


AttributeError: 'NoneType' object has no attribute 'generate'

In [2]:
import json
from process_text import load_model, generate_ipa,evaluate_transcriptions
import gc
import os


In [None]:

# Define the three model paths
model_paths = [
    "/media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/llama-chat-3.1-q8.gguf",  # Model 1
    "/media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/llama-2-7b-chat.Q8_0.gguf",  # Model 2
    "/media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/Ministral-8B-Instruct-2410-Q8_0.gguf"  # Model 3
]

# Load the existing JSON file
input_json_file = "transcription_results.json"
print(f"Loading input JSON from {input_json_file}")
with open(input_json_file, "r") as f:
    retrieved_samples = json.load(f)
print("Input JSON loaded successfully.")

# Process each model sequentially
for i, model_path in enumerate(model_paths, 1):
    model_name = f"model_{i}"
    is_mixtral = "mixtral" in model_path.lower()
    print(f"\nStarting processing for {model_name} from {model_path}")
    
    # Check if the model file exists and its size
    if not os.path.exists(model_path):
        print(f"Error: Model file {model_path} does not exist.")
        continue
    file_size = os.path.getsize(model_path) / (1024 * 1024)  # Size in MB
    print(f"Model file {model_path} found, size: {file_size:.2f} MB")
    
    # Load the model with error handling
    try:
        model = load_model(model_path, is_mixtral=is_mixtral)
        if model is None:
            raise ValueError(f"load_model returned None for {model_path}")
        print(f"{model_name} loaded successfully.")
    except Exception as e:
        print(f"Failed to load {model_name} from {model_path}: {str(e)}")
        continue
    
    # Process each sample with the current model
    for sample_key, sample_data in retrieved_samples.items():
        original_transcript = sample_data["original_transcript"]
        asr_transcript = sample_data["asr_transcript"]
        
        print(f"Generating IPA for {sample_key} with {model_name}")
        try:
            original_ipa = (
                generate_ipa(model, original_transcript)
                if original_transcript != "No original transcription available"
                else "N/A"
            )
            asr_ipa = generate_ipa(model, asr_transcript)
        except Exception as e:
            print(f"Error generating IPA for {sample_key} with {model_name}: {str(e)}")
            original_ipa = "Error"
            asr_ipa = "Error"
        
        # Add IPA to the sample data
        sample_data[f"original_ipa_{model_name}"] = original_ipa
        sample_data[f"asr_ipa_{model_name}"] = asr_ipa
        
        print(f"Processed {sample_key} with {model_name}:")
        print(f"  Original Transcript: {original_transcript}")
        print(f"  ASR Transcript: {asr_transcript}")
        print(f"  Original IPA ({model_name}): {original_ipa}")
        print(f"  ASR IPA ({model_name}): {asr_ipa}")
        print("-" * 50)
    
    # Clean up the current model
    print(f"Unloading {model_name}")
    del model
    gc.collect()
    print(f"{model_name} unloaded successfully.")

# Save the updated JSON
output_json_file = "transcription_results_with_ipa.json"
print(f"\nSaving updated JSON to {output_json_file}")
json_output = json.dumps(retrieved_samples, indent=4,ensure_ascii=False)
with open(output_json_file, "w") as f:
    f.write(json_output)
print(f"Updated results saved to '{output_json_file}'")

# Final memory cleanup
gc.collect()
print("Final memory cleanup completed.")

In [9]:
ipas = json.load(open('./transcription_results_with_ipa.json','r'))
model = load_model(model_path=model_paths[0])



Attempting to load model from /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/llama-chat-3.1-q8.gguf
Loading LLaMA model...


llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model loaded successfully from /media/cairuser1/b916a6fe-2106-41d4-98be-bbdd1d3bcb16/model/llama-chat-3.1-q8.gguf


In [18]:
import Levenshtein
# Example IPA strings
ipa_original = "dɪs ɪz ə tɛst"
ipa_transcript = "ðɪs ɪz ə tɛst"

def calculate_distance(ipa_original,ipa_transcript):
    # Step 1: Clean up and remove spaces if needed
    ipa_original_clean = ipa_original.replace(" ", "")
    ipa_transcript_clean = ipa_transcript.replace(" ", "")

    # Step 2: Calculate raw Levenshtein distance (number of edits)
    distance = Levenshtein.distance(ipa_original_clean, ipa_transcript_clean)

    # Step 3: Normalize (optional, to get a percentage)
    normalized_error = distance / max(len(ipa_original_clean), 1)

    # Step 4: Similarity Score (optional)
    similarity_score = 1 - normalized_error

    # Output
    return {"IPA Mismatches": distance,"Normalized Error Rate": normalized_error,"Similarity Score": similarity_score}
calculate_distance(ipa_original,ipa_transcript)


{'IPA Mismatches': 1, 'Normalized Error Rate': 0.1, 'Similarity Score': 0.9}

In [None]:
ipas = json.load(open('./transcription_results_with_ipa.json','r'))
model = load_model(model_path=model_paths[0])
ipas_Evaluated = {}
for key in ipas.keys():
    sample = ipas[key]
    evaluation = evaluate_transcriptions(
        model,
        sample['original_transcript'],
        sample['asr_transcript'],
        [sample['original_ipa_model_1'],sample['original_ipa_model_2'],sample['original_ipa_model_3']],
        [sample['asr_ipa_model_1'],sample['asr_ipa_model_2'],sample['asr_ipa_model_3']],
        
    )
    sample['evaluated_ipa'] = evaluation 
    try:
        sample['evaluated_ipa']['error']=calculate_distance(evaluation['best_ipa_original'],evaluation['best_ipa_transcript'])
    except:
        print("parsing failed")
        sample['evaluated_ipa']['error']= {"IPA Mismatches": None,"Normalized Error Rate": None,"Similarity Score": None}
    ipas_Evaluated[key] = sample
    
json.dump(ipas_Evaluated,open('transcription_results_with_evaluated_ipa.json','w'),ensure_ascii=False,indent= 2)
    

NameError: name 'ipas' is not defined

In [6]:
import pandas as pd
import json

In [3]:
data = pd.read_json('transcription_results_with_evaluated_ipa.json')

In [7]:
data = json.load(open('transcription_results_with_evaluated_ipa.json','r'))

In [15]:
status = {}
for idx,key in enumerate(data.keys()):
    status[f"{idx}"] = []
    status[f"{idx}"].append(data[key]['speech_status'])
    status[f"{idx}"].append(data[key]['evaluated_ipa']['error']['IPA Mismatches'])
    status[f"{idx}"].append(data[key]['evaluated_ipa']['error']['Similarity Score'])

In [29]:
df = pd.DataFrame(status).T
df.columns = ["speech_status","IPA Mismatches","Similarity Score"]

In [30]:
df

Unnamed: 0,speech_status,IPA Mismatches,Similarity Score
0,dysarthria,5,0.285714
1,dysarthria,27,0.181818
2,dysarthria,1,0.875
3,dysarthria,0,1.0
4,dysarthria,0,1.0
5,dysarthria,1,0.8
6,dysarthria,0,1.0
7,dysarthria,26,0.16129
8,dysarthria,14,0.621622
9,dysarthria,0,1.0


In [31]:
df.groupby('speech_status').mean()

Unnamed: 0_level_0,IPA Mismatches,Similarity Score
speech_status,Unnamed: 1_level_1,Unnamed: 2_level_1
dysarthria,7.4,0.692544
healthy,1.9,0.796538
