# Qwen2-Audio Model Testing

This notebook tests the Qwen2-Audio-7B-Instruct model on TREA dataset samples.

## Contents
1. Load model and dataset
2. Test single prediction
3. Batch predictions
4. Accuracy analysis
5. Prediction distribution
6. Test FES generator
7. Test FCS generator
8. Stochastic sampling test

In [None]:
import sys
sys.path.append('..')  # make the project's `src` package importable from this notebook

from collections import Counter

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from IPython.display import Audio, display
from tqdm.notebook import tqdm

from src.baselines import BaselineUncertainty
from src.data_loader import load_trea_dataset
from src.fcs_generator import FCSGenerator
from src.fes_generator import FESGenerator
from src.model_wrapper import Qwen2AudioWrapper

# Set style
# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to every figure below.
plt.style.use('seaborn-v0_8-darkgrid')
# Then make seaborn's 'husl' palette the default colour cycle (set after the
# style sheet so the palette is not overridden by it).
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Model and Dataset

In [None]:
# Pull a small, seeded slice of TREA so the rest of the notebook runs quickly.
loader_options = {
    'data_dir': '../TREA_dataset',
    'tasks': ['count', 'order', 'duration'],
    'samples_per_task': 10,  # keep it small while smoke-testing
    'random_seed': 42,
}
dataset = load_trea_dataset(**loader_options)

n_loaded = len(dataset)
print(f"Loaded {n_loaded} samples")

In [None]:
# Initialize model
# NOTE: This requires ~14GB GPU memory for the 7B model in float16.
# The device is chosen automatically so "Restart & Run All" also works on a
# machine without a GPU: CUDA + float16 when available, otherwise CPU + float32
# (CPU inference will be much slower).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = "float16" if DEVICE == "cuda" else "float32"
print(f"Using device={DEVICE}, dtype={DTYPE}")

model = Qwen2AudioWrapper(
    model_name="Qwen/Qwen2-Audio-7B-Instruct",
    device=DEVICE,
    dtype=DTYPE,
    max_length=512
)

print("Model loaded successfully!")
print("Model info:")
# Dump whatever metadata the wrapper reports, one "key: value" line each.
info = model.get_model_info()
for key, value in info.items():
    print(f"  {key}: {value}")

## 2. Test Single Prediction

In [None]:
# Use the first dataset entry as the running example for the next cells.
sample = dataset.data[0]

task_name = sample['task'].upper()
print(f"Task: {task_name}")
print(f"Question: {sample['question']}")
print(f"\nOptions:")
for letter, text in sample['options'].items():
    print(f"  ({letter}) {text}")
print(f"\nGround Truth: {sample['correct_answer']}")

# Decode the clip at 16 kHz, report its length, and embed a player for it.
audio, sr = librosa.load(sample['audio_path'], sr=16000)
duration_s = len(audio) / sr
print(f"\nAudio (duration: {duration_s:.2f}s):")
display(Audio(audio, rate=sr))

In [None]:
# Query the model on the running example, asking for per-option probabilities too.
prediction, probs = model.predict(
    sample['audio_path'],
    sample['question'],
    sample['options'],
    return_probs=True,
)

is_correct = prediction == sample['correct_answer']
verdict = '✓ YES' if is_correct else '✗ NO'
print(f"Model Prediction: {prediction}")
print(f"Correct: {verdict}")

# One text bar per option (50 blocks == probability 1.0); the arrow flags
# the option the model picked.
print(f"\nProbabilities:")
for option, prob in sorted(probs.items()):
    if option == prediction:
        marker = '←'
    else:
        marker = ''
    bar = '█' * int(prob * 50)
    print(f"  ({option}) {prob:.4f} {bar} {marker}")

## 3. Batch Predictions

In [None]:
# Get predictions for all samples
# Loop-local names (`item`, `item_pred`) are used on purpose: the notebook-level
# `sample` and `prediction` from the single-prediction section must not be
# silently overwritten by the last dataset entry.
RESULT_COLUMNS = ['task', 'prediction', 'ground_truth', 'correct']
results = []

for item in tqdm(dataset.data, desc="Getting predictions"):
    item_pred, _ = model.predict(
        item['audio_path'],
        item['question'],
        item['options']
    )

    results.append({
        'task': item['task'],
        'prediction': item_pred,
        'ground_truth': item['correct_answer'],
        'correct': item_pred == item['correct_answer']
    })

# Passing the column list keeps the expected columns present even when the
# dataset is empty, so later cells do not fail with a KeyError.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
print(f"\nCompleted {len(results)} predictions")

## 4. Accuracy Analysis

In [None]:
# Overall accuracy: mean of the boolean `correct` column.
overall_acc = results_df['correct'].mean()
print(f"Overall Accuracy: {overall_acc:.2%}")

# Task-wise accuracy
print("\nTask-wise Accuracy:")
task_acc = results_df.groupby('task')['correct'].mean()
for task, acc in task_acc.items():
    print(f"  {task}: {acc:.2%}")

# Visualize: accuracy per task (left) and correct/incorrect counts (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Task-wise accuracy
task_acc.plot(kind='bar', ax=axes[0], color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
axes[0].set_title('Accuracy by Task', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Accuracy')
axes[0].set_xlabel('Task')
# NOTE(review): the 25% chance line assumes four answer options per question — confirm.
axes[0].axhline(y=0.25, color='red', linestyle='--', label='Random (25%)')
axes[0].legend()
axes[0].set_ylim([0, 1])
axes[0].grid(axis='y', alpha=0.3)
plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)

# Correct vs incorrect counts per task.
pred_counts = (
    results_df.groupby(['task', 'correct'])
    .size()
    .unstack(fill_value=0)
    # Always expose both outcome columns in a fixed order. Without this, a run
    # where every answer is correct (or every answer is wrong) produces a single
    # column, and the fixed colours/legend below would mislabel it.
    .reindex(columns=[False, True], fill_value=0)
)
pred_counts.plot(kind='bar', stacked=True, ax=axes[1], color=['#FF6B6B', '#95E1D3'])
axes[1].set_title('Correct vs Incorrect by Task', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Count')
axes[1].set_xlabel('Task')
axes[1].legend(['Incorrect', 'Correct'])
axes[1].grid(axis='y', alpha=0.3)
plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=0)

plt.tight_layout()
plt.show()

## 5. Prediction Distribution

In [None]:
# Analyze prediction distribution: how often each option is predicted vs. how
# often it is actually the right answer.
pred_dist = results_df['prediction'].value_counts()
gt_dist = results_df['ground_truth'].value_counts()

# Build the x-axis from the union of both label sets so that options which are
# correct answers but were never predicted still appear (with a zero-height
# prediction bar). Sorting by str gives a stable order and tolerates mixed or
# missing label types.
option_labels = sorted(set(pred_dist.index) | set(gt_dist.index), key=str)
pred_heights = pred_dist.reindex(option_labels, fill_value=0).values
gt_heights = gt_dist.reindex(option_labels, fill_value=0).values

# Plot side-by-side bars per option.
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(option_labels))
width = 0.35

ax.bar(x - width/2, pred_heights, width, label='Predictions', alpha=0.8)
ax.bar(x + width/2, gt_heights, width, label='Ground Truth', alpha=0.8)

ax.set_title('Prediction vs Ground Truth Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Answer Option', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(option_labels)
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Test FES Generator

In [None]:
# Initialize FES generator
fes_gen = FESGenerator(
    n_audio_samples=5,  # Small number for testing
    n_text_samples=3,
    sr=16000
)

# Generate FES samples for the first dataset entry (re-selected here so this
# cell does not depend on whatever `sample` was last bound to).
sample = dataset.data[0]
fes_samples = fes_gen.generate(
    sample['audio_path'],
    sample['question'],
    sample['task'],
    sample['options']
)

print(f"Generated {len(fes_samples)} FES samples")
print(f"\nOriginal question: {sample['question']}")
print("\nFES text variations:")
# De-duplicate while keeping first-seen order. Iterating a set of strings gives
# a different order after each kernel restart (hash randomisation), which made
# the numbered listing non-reproducible.
unique_questions = list(dict.fromkeys(s['question'] for s in fes_samples))
for i, q in enumerate(unique_questions, 1):
    print(f"  {i}. {q}")

In [None]:
# Test FES predictions
MAX_FES_EVAL = 10            # cap on how many FES samples are scored in this smoke test
CONSISTENCY_THRESHOLD = 0.7  # agreement rate above which the model is reported as consistent

print("Getting predictions on FES samples...")
fes_predictions = []

for fes_sample in tqdm(fes_samples[:MAX_FES_EVAL], desc="FES predictions"):  # Test on subset
    pred, _ = model.predict(
        fes_sample['audio_path'],
        fes_sample['question'],
        fes_sample['options']
    )
    fes_predictions.append(pred)

# Reference answer on the unmodified input; FES answers are compared against it.
original_pred, _ = model.predict(
    sample['audio_path'],
    sample['question'],
    sample['options']
)

print(f"\nOriginal prediction: {original_pred}")
print(f"FES predictions: {Counter(fes_predictions)}")

# Consistency = share of FES predictions that equal the original prediction.
# Guard the division: the generator may return no samples.
if fes_predictions:
    consistency = sum(1 for p in fes_predictions if p == original_pred) / len(fes_predictions)
    print(f"\nConsistency: {consistency:.1%}")
    print(f"Model is {'CONSISTENT' if consistency > CONSISTENCY_THRESHOLD else 'INCONSISTENT'} on FES samples")
else:
    consistency = float('nan')
    print("\nNo FES samples to evaluate; consistency is undefined")

## 7. Test FCS Generator

In [None]:
# Initialize FCS generator
fcs_gen = FCSGenerator(
    n_audio_samples=5,
    n_text_samples=3,
    sr=16000,
    synthetic_silence_dir='../TREA_dataset/synthetic_silences'
)

# Generate FCS samples for the running example. This cell needs `sample` and
# `original_pred` from the FES cells, so those must have been run first.
fcs_samples = fcs_gen.generate(
    sample['audio_path'],
    sample['question'],
    sample['task'],
    sample['options'],
    original_pred
)

print(f"Generated {len(fcs_samples)} FCS samples")
print(f"\nOriginal question: {sample['question']}")
print("\nFCS text variations (complementary):")
# De-duplicate while keeping first-seen order. Iterating a set of strings gives
# a different order after each kernel restart (hash randomisation), which made
# the numbered listing non-reproducible.
unique_fcs_questions = list(dict.fromkeys(s['question'] for s in fcs_samples))
for i, q in enumerate(unique_fcs_questions, 1):
    print(f"  {i}. {q}")

In [None]:
# Test FCS predictions
MAX_FCS_EVAL = 10            # cap on how many FCS samples are scored in this smoke test
SENSITIVITY_THRESHOLD = 0.5  # change rate above which the model is reported as sensitive

print("Getting predictions on FCS samples...")
fcs_predictions = []

for fcs_sample in tqdm(fcs_samples[:MAX_FCS_EVAL], desc="FCS predictions"):
    pred, _ = model.predict(
        fcs_sample['audio_path'],
        fcs_sample['question'],
        fcs_sample['options']
    )
    fcs_predictions.append(pred)

print(f"\nOriginal prediction: {original_pred}")
print(f"FCS predictions: {Counter(fcs_predictions)}")

# Sensitivity = share of FCS predictions that differ from the original
# prediction (complementary inputs should change the answer).
# Guard the division: the generator may return no samples.
if fcs_predictions:
    sensitivity = sum(1 for p in fcs_predictions if p != original_pred) / len(fcs_predictions)
    print(f"\nSensitivity (% different): {sensitivity:.1%}")
    print(f"Model is {'SENSITIVE' if sensitivity > SENSITIVITY_THRESHOLD else 'INSENSITIVE'} to complementary samples")
else:
    sensitivity = float('nan')
    print("\nNo FCS samples to evaluate; sensitivity is undefined")

## 8. Stochastic Sampling Test

In [None]:
# Test stochastic sampling for output entropy baseline
# The sample count and temperature are named once so the call, the printed
# header and the count reconstruction below cannot drift apart.
NUM_SAMPLES = 20
TEMPERATURE = 0.7

sample_probs = model.predict_with_sampling(
    sample['audio_path'],
    sample['question'],
    sample['options'],
    num_samples=NUM_SAMPLES,
    temperature=TEMPERATURE
)

print(f"Stochastic sampling results ({NUM_SAMPLES} samples, T={TEMPERATURE}):")
for option, prob in sorted(sample_probs.items()):
    bar = '█' * int(prob * 50)
    print(f"  ({option}) {prob:.4f} {bar}")

# Compute entropy
baseline = BaselineUncertainty()

# Rebuild a list of sampled answers from the per-option frequencies.
# round() is used instead of int(): truncation turns a product such as
# 6.999999999999999 into 6 and silently drops a sample.
samples = []
for option, prob in sample_probs.items():
    samples.extend([option] * int(round(float(prob) * NUM_SAMPLES)))

entropy = baseline.output_entropy(samples)
print(f"\nOutput Entropy: {entropy:.4f}")

## Summary

This notebook tested the Qwen2-Audio model:
- ✅ Loaded model successfully
- ✅ Tested basic inference
- ✅ Computed baseline accuracy per task
- ✅ Tested FES generator and consistency
- ✅ Tested FCS generator and sensitivity
- ✅ Tested stochastic sampling

**Next Steps:**
1. Run full FESTA evaluation (notebook 03)
2. Compare with baseline methods
3. Analyze uncertainty calibration

In [None]:
# Ask each generator (FES first, then FCS) to clean up its temporary files.
for generator in (fes_gen, fcs_gen):
    generator.cleanup_temp_files()
print("Cleaned up temporary files")