# Speaker Embedding Visualization

This notebook uses pyannote-audio to annotate audio files, extract speaker embeddings, and visualize them using dimensionality reduction techniques.

The process involves:
1. Loading audio files from the audio/ directory
2. Using pyannote-audio for speaker diarization
3. Extracting speaker embeddings for each segment
4. Mapping speaker IDs to real names using speakers.json
5. Visualizing the embeddings in 2D using T-SNE

In the visualization, points from the same speaker (real name) will have the same color, and points from the same audio file will have the same marker.


## 1. Setup and Imports


In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Standard imports
import os
import json
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# PyTorch   
import torch

# Pyannote.audio
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment

# Dimensionality reduction
from sklearn.manifold import TSNE

# Rich for pretty printing
from rich.console import Console
from rich.progress import track

# Shared Rich console; the status messages in the cells below are printed through it
console = Console()


## 2. Load Speaker Mapping

Load the mapping between speaker IDs and real names from speakers.json.


In [None]:
# Load the mapping {audio file name: {diarization speaker ID: real name}}.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform default encoding (which would garble accented names on systems
# whose locale is not UTF-8).
with open('speakers.json', 'r', encoding='utf-8') as f:
    speaker_mapping = json.load(f)

# Display the mapping, one audio file per block
console.print("[bold]Speaker Mapping:[/bold]")
for audio_file, speakers in speaker_mapping.items():
    console.print(f"[cyan]{audio_file}[/cyan]")
    for speaker_id, speaker_name in speakers.items():
        console.print(f"  {speaker_id}: {speaker_name}")


## 3. Initialize Models

Load the pre-trained speaker diarization and embedding models.


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
console.print(f"Using device: {device}")

# Read the Hugging Face access token from the environment instead of committing
# it to the notebook. When HF_TOKEN is unset, None is passed through and the
# library falls back to whatever credentials it finds on its own.
hf_token = os.environ.get("HF_TOKEN")

# Load pre-trained speaker diarization pipeline
console.print("Loading pre-trained speaker diarization model...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token
)
# pyannote.audio 3.x reports a failed download of a gated pipeline by returning
# None rather than raising; fail here with an actionable message instead of an
# AttributeError on the next line.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Accept the model's "
        "user conditions on Hugging Face and set the HF_TOKEN environment "
        "variable to a valid access token."
    )
pipeline.to(device)

# Load pre-trained speaker embedding model
console.print("Loading pre-trained speaker embedding model...")
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device
)

# Audio reader used to measure durations and crop segments; multi-channel
# files are down-mixed to mono
audio = Audio(mono="downmix")


## 4. Process Audio Files

Process all WAV files in the audio/ directory, perform diarization, and extract speaker embeddings.


In [None]:
# Collect the WAV files directly under audio/, in sorted order so that runs
# process them in the same sequence
wav_pattern = 'audio/*.wav'
audio_files = glob.glob(wav_pattern)
audio_files.sort()
console.print(f"Found {len(audio_files)} audio files: {audio_files}")


In [None]:
# Initialize lists to store embeddings and metadata (kept in lock-step: one
# metadata dict per stored embedding)
all_embeddings = []
all_metadata = []
skipped_segments = 0  # segments dropped because no usable embedding was produced

# Process each audio file
for audio_file in track(audio_files, description="Processing audio files"):
    # File name without directory; this is the key looked up in speaker_mapping
    base_filename = os.path.basename(audio_file)
    console.print(f"\n[bold]Processing {base_filename}[/bold]")

    # Get file duration
    file_duration = audio.get_duration(audio_file)
    console.print(f"File duration: {file_duration:.2f} seconds")

    # Speaker ID -> real name for this file; empty when the file is not listed
    file_speaker_mapping = speaker_mapping.get(base_filename)
    if file_speaker_mapping is None:
        console.print(f"[red]Warning: No speaker mapping found for {base_filename}[/red]")
        file_speaker_mapping = {}

    # Perform diarization
    console.print(f"Performing diarization on {base_filename}...")
    diarization = pipeline(audio_file)

    # Process each segment
    for turn, _, speaker_id in diarization.itertracks(yield_label=True):
        seg_start = turn.start
        seg_end = min(turn.end, file_duration)  # Ensure we don't go beyond file duration

        # Clamping can leave an empty segment; there is nothing to embed then
        if seg_end <= seg_start:
            skipped_segments += 1
            continue

        # Fall back to the raw diarization label when no real name is mapped
        speaker_name = file_speaker_mapping.get(speaker_id, speaker_id)

        # Extract audio segment
        waveform, _sample_rate = audio.crop(audio_file, Segment(seg_start, seg_end))

        # Extract embedding; waveform[None] adds a leading batch axis
        embedding = embedding_model(waveform[None])

        # pyannote's pretrained embedding wrapper returns NaN vectors for
        # segments too short for the model. Keeping them would turn every
        # downstream mean into NaN and make T-SNE fail, so drop them here.
        if not np.isfinite(embedding).all():
            skipped_segments += 1
            console.print(
                f"  [yellow]Skipped segment {seg_start:.2f}s - {seg_end:.2f}s: "
                f"no valid embedding (segment too short?)[/yellow]"
            )
            continue

        # Store embedding and metadata
        all_embeddings.append(embedding)
        all_metadata.append({
            'audio_file': base_filename,
            'speaker_id': speaker_id,
            'speaker_name': speaker_name,
            'start_time': seg_start,
            'end_time': seg_end,
            'duration': seg_end - seg_start
        })

        console.print(f"  Segment: {seg_start:.2f}s - {seg_end:.2f}s, Speaker: {speaker_name} ({speaker_id})")

if skipped_segments:
    console.print(f"\n[yellow]Skipped {skipped_segments} segment(s) without a usable embedding[/yellow]")

# np.concatenate raises an opaque error on an empty list; explain the cause instead
if not all_embeddings:
    raise RuntimeError(
        "No speaker embeddings were extracted. Check that audio/ contains WAV "
        "files and that diarization found speech in them."
    )

# Concatenate all embeddings into one array with one row per kept segment
all_embeddings = np.concatenate(all_embeddings, axis=0)
console.print(f"\nTotal embeddings extracted: {all_embeddings.shape[0]}")
console.print(f"Embedding dimension: {all_embeddings.shape[1]}")

# Create a DataFrame with metadata
metadata_df = pd.DataFrame(all_metadata)
assert len(metadata_df) == all_embeddings.shape[0], "metadata and embeddings are out of sync"

# Add embeddings as a column in the DataFrame (one vector per row)
metadata_df['embedding'] = list(all_embeddings)

# Verify the embedding column exists
print(f"DataFrame columns: {metadata_df.columns.tolist()}")
print(f"First embedding shape: {metadata_df['embedding'].iloc[0].shape}")

metadata_df.head()


## 4.5 Compute Mean Embeddings per Speaker-Audio Combination

Compute the mean embedding for each combination of speaker name and audio file. This step:
1. Groups the embeddings by speaker name and audio file
2. Calculates the mean embedding for each group
3. Creates a new DataFrame with the mean embeddings and relevant metadata

This allows us to compare speakers across different audio files by having a single representative embedding per speaker-audio combination.


In [None]:
# One representative (mean) embedding per (speaker name, audio file) pair
console.print("\n[bold]Computing mean embeddings per speaker-audio combination...[/bold]")

# Parallel lists: mean_embeddings[i] belongs to mean_metadata[i]
mean_embeddings = []
mean_metadata = []

segments_by_speaker_and_file = metadata_df.groupby(['speaker_name', 'audio_file'])
for group_key, group in segments_by_speaker_and_file:
    speaker_name, audio_file = group_key
    segment_count = len(group)
    total_duration = group['duration'].sum()

    # Stack the group's per-segment vectors into a 2-D array, then average
    # over the segment axis
    stacked = np.array(group['embedding'].tolist())
    mean_embeddings.append(stacked.mean(axis=0))

    mean_metadata.append(dict(
        speaker_name=speaker_name,
        audio_file=audio_file,
        segment_count=segment_count,
        total_duration=total_duration,
    ))

    console.print(f"  {speaker_name} in {audio_file}: {segment_count} segments, {total_duration:.2f}s total")

# Assemble the summary table and attach the vectors as an object column
mean_df = pd.DataFrame(mean_metadata)
mean_df['mean_embedding'] = list(mean_embeddings)

# Display the mean embeddings DataFrame
mean_df.head()


## 5. Dimensionality Reduction

Apply T-SNE to reduce the embeddings to 2 dimensions for visualization.


In [None]:
# Apply T-SNE for dimensionality reduction
console.print("Applying T-SNE dimensionality reduction...")

TSNE_PERPLEXITY = 30  # preferred value; lowered automatically for small datasets

# T-SNE rejects NaN/inf input, so fit only on rows whose embedding is fully finite
finite_rows = np.isfinite(all_embeddings).all(axis=1)
n_valid = int(finite_rows.sum())
if n_valid < 2:
    raise ValueError(
        f"T-SNE needs at least 2 valid embeddings, got {n_valid}."
    )
if n_valid < len(all_embeddings):
    console.print(
        f"[yellow]Ignoring {len(all_embeddings) - n_valid} embedding(s) with "
        f"non-finite values; their T-SNE coordinates are set to NaN[/yellow]"
    )

# scikit-learn requires perplexity < n_samples; keep 30 whenever there is enough data
perplexity = min(TSNE_PERPLEXITY, n_valid - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
projected = tsne.fit_transform(all_embeddings[finite_rows])

# Scatter the projected points back to their original row positions so the
# result stays aligned with metadata_df
embeddings_2d_tsne = np.full((len(all_embeddings), 2), np.nan, dtype=projected.dtype)
embeddings_2d_tsne[finite_rows] = projected

# Add T-SNE coordinates to the DataFrame
metadata_df['tsne_x'] = embeddings_2d_tsne[:, 0]
metadata_df['tsne_y'] = embeddings_2d_tsne[:, 1]

# Display the first few rows
metadata_df.head()



## 6. Visualization

Visualize the embeddings using T-SNE, with colors for speakers and markers for audio files.


In [None]:
# Get unique speakers and audio files (in order of first appearance)
unique_speakers = metadata_df['speaker_name'].unique()
unique_audio_files = metadata_df['audio_file'].unique()

# Marker shapes handed out to audio files, reused cyclically when there are
# more files than shapes
MARKER_CYCLE = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h']
N_COLORS = 10  # number of distinct colours cycled through from the tab10 colormap

# Create color and marker mappings. Both wrap around with a modulo so that
# every speaker and every file gets an entry; zipping against the marker list
# would silently drop the 11th file and break plotting with a KeyError.
speaker_colors = {
    speaker: plt.cm.tab10(i % N_COLORS)
    for i, speaker in enumerate(unique_speakers)
}
file_markers = {
    file: MARKER_CYCLE[i % len(MARKER_CYCLE)]
    for i, file in enumerate(unique_audio_files)
}

if len(unique_speakers) > N_COLORS:
    console.print(f"[yellow]More than {N_COLORS} speakers: some colors are shared[/yellow]")
if len(unique_audio_files) > len(MARKER_CYCLE):
    console.print(f"[yellow]More than {len(MARKER_CYCLE)} audio files: some markers are shared[/yellow]")

# Display the mappings
console.print("\n[bold]Speaker Color Mapping:[/bold]")
for speaker, color in speaker_colors.items():
    console.print(f"  {speaker}: {color}")

console.print("\n[bold]Audio File Marker Mapping:[/bold]")
for file, marker in file_markers.items():
    console.print(f"  {file}: {marker}")


In [None]:
# Function to create scatter plot
def plot_embeddings(x, y, title, method):
    """Scatter-plot 2-D coordinates, coloured by speaker and shaped by audio file.

    Relies on the notebook-level ``metadata_df``, ``unique_speakers``,
    ``unique_audio_files``, ``speaker_colors`` and ``file_markers``.

    Args:
        x: Horizontal coordinates. Indexed with boolean masks built from
            ``metadata_df``, so it has to line up with that frame row for row.
        y: Vertical coordinates, same requirements as ``x``.
        title: Title shown above the plot.
        method: Name of the projection method, used in the axis labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fallback_marker = 'o'  # used for an audio file that has no entry in file_markers
    fig, ax = plt.subplots(figsize=(12, 10))

    # One scatter call per (speaker, audio file) pair that actually has points
    for speaker in unique_speakers:
        for audio_file in unique_audio_files:
            mask = (metadata_df['speaker_name'] == speaker) & (metadata_df['audio_file'] == audio_file)
            if mask.sum() > 0:
                ax.scatter(
                    x[mask], y[mask],
                    color=speaker_colors[speaker],
                    marker=file_markers.get(audio_file, fallback_marker),
                    label=f"{speaker} - {audio_file}",
                    alpha=0.7,
                    s=100
                )

    # The legend is built by hand: gray markers explain the shapes (audio
    # files), coloured dots explain the colours (speakers).
    combined_legend_elements = []

    for file, marker in file_markers.items():
        combined_legend_elements.append(
            Line2D([0], [0], marker=marker, color='gray',
                   label=file, markersize=10, linestyle='None')
        )

    for speaker, color in speaker_colors.items():
        combined_legend_elements.append(
            Line2D([0], [0], marker='o', color='w', markerfacecolor=color,
                   label=speaker, markersize=10)
        )

    # Anchor the legend's upper-left corner at the figure's top-right corner
    fig.legend(handles=combined_legend_elements, title="Audio Files (markers) and Speakers (colors)",
               loc="upper left", bbox_to_anchor=(1, 1))

    ax.set_title(title, fontsize=16)
    ax.set_xlabel(f"{method} Dimension 1", fontsize=12)
    ax.set_ylabel(f"{method} Dimension 2", fontsize=12)
    plt.tight_layout()
    ax.grid(alpha=0.3)
    plt.show()


In [None]:
# Draw the T-SNE projection of every segment embedding
plot_embeddings(
    x=metadata_df['tsne_x'],
    y=metadata_df['tsne_y'],
    title="Speaker Embeddings - T-SNE Projection",
    method="T-SNE",
)



## 7. Analysis

Let's analyze the results to see if speakers are well-separated in the embedding space.


In [None]:
# Per-speaker summary: segment count and durations, plus mean and standard
# deviation of the T-SNE coordinates. Named aggregation yields the flat
# "<column>_<statistic>" column names directly.
speaker_stats = (
    metadata_df
    .groupby('speaker_name')
    .agg(
        duration_count=('duration', 'count'),
        duration_sum=('duration', 'sum'),
        duration_mean=('duration', 'mean'),
        tsne_x_mean=('tsne_x', 'mean'),
        tsne_x_std=('tsne_x', 'std'),
        tsne_y_mean=('tsne_y', 'mean'),
        tsne_y_std=('tsne_y', 'std'),
    )
    .reset_index()
)

# Display statistics
speaker_stats


In [None]:
# Per-file summary: segment count and durations, plus the number of distinct
# speaker names. Named aggregation yields the flat "<column>_<statistic>"
# column names directly.
file_stats = (
    metadata_df
    .groupby('audio_file')
    .agg(
        duration_count=('duration', 'count'),
        duration_sum=('duration', 'sum'),
        duration_mean=('duration', 'mean'),
        speaker_name_nunique=('speaker_name', 'nunique'),
    )
    .reset_index()
)

# Display statistics
file_stats


## 11. Speaker Identification with ChromaDB

In this section, we'll:
1. Create a ChromaDB collection (deleting it if it already exists)
2. Add the averaged speaker embeddings to the collection with metadata (audio file, speaker name)
3. Diarize a test audio file and compute embeddings for each segment
4. Query the ChromaDB collection to find the most similar speakers for each segment
5. Display the results in a pandas dataframe

This demonstrates how speaker embeddings can be used for speaker identification in a practical application.

In [None]:

# Import ChromaDB
import chromadb
from chromadb.config import Settings

# Define the path to the ChromaDB database
DB_PATH = os.path.abspath("./chroma_db")

# Create a persistent ChromaDB client
console.print("\n[bold]Creating ChromaDB client...[/bold]")
client = chromadb.PersistentClient(path=DB_PATH)

# Create or recreate the collection
collection_name = "speaker-embeddings"
console.print(f"Creating collection: {collection_name}")

# Delete the collection if it already exists so the cell can be re-run from a
# clean state. The exception type raised for a missing collection differs
# between ChromaDB releases, hence the broad `Exception`; a bare `except:`
# would additionally swallow KeyboardInterrupt/SystemExit. The reason is
# printed so that a genuine failure is not mistaken for "nothing to delete".
try:
    client.delete_collection(collection_name)
    console.print(f"Deleted existing collection: {collection_name}")
except Exception as exc:
    console.print(f"No existing collection to delete: {collection_name}")
    # markup=False: the exception text may contain square brackets
    console.print(f"  (reason: {type(exc).__name__}: {exc})", markup=False)

# Create a new collection that compares embeddings by cosine distance
collection = client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"}
)


In [None]:
# Add the averaged speaker embeddings to the collection
console.print("\n[bold]Adding averaged speaker embeddings to ChromaDB...[/bold]")

# Collect every record first so the collection is written with a single add()
# call instead of one call per row.
record_ids = []
record_embeddings = []
record_metadatas = []
record_documents = []
added_pairs = []  # (speaker_name, audio_file) of each record, for the log below

for idx, row in mean_df.iterrows():
    speaker_name = row['speaker_name']
    audio_file = row['audio_file']
    embedding = row['mean_embedding']

    # A mean over segments is NaN as soon as one segment embedding was NaN;
    # such a vector cannot be compared by cosine distance, so leave it out.
    if not np.isfinite(embedding).all():
        console.print(
            f"[yellow]Skipping {speaker_name} from {audio_file}: "
            f"mean embedding contains non-finite values[/yellow]"
        )
        continue

    record_ids.append(f"{speaker_name}_{audio_file}_{idx}")
    record_embeddings.append(embedding.tolist())
    record_metadatas.append({
        "speaker": speaker_name,
        "audio_file": audio_file,
        # Cast numpy scalars to plain Python numbers for the metadata store
        "segment_count": int(row['segment_count']),
        "total_duration": float(row['total_duration'])
    })
    record_documents.append(f"Speaker: {speaker_name}, Audio: {audio_file}")
    added_pairs.append((speaker_name, audio_file))

# Only call add() when there is something to add
if record_ids:
    collection.add(
        ids=record_ids,
        embeddings=record_embeddings,
        metadatas=record_metadatas,
        documents=record_documents
    )
    for speaker_name, audio_file in added_pairs:
        console.print(f"Added embedding for {speaker_name} from {audio_file}")
else:
    console.print("[red]Warning: no valid mean embeddings to add[/red]")

# Get collection count
count = collection.count()
console.print(f"Total embeddings in collection: {count}")


In [None]:
# Diarize the test audio file
# NOTE(review): the negative-class cell in section 13 repeats this logic almost
# line for line; a shared function taking the audio path would remove the copy.
console.print("\n[bold]Diarizing test audio file...[/bold]")

# Path to the test audio file
test_audio_file = "audio/test/locuteur_a_07.wav"
console.print(f"Processing: {test_audio_file}")

# Get file duration
file_duration = audio.get_duration(test_audio_file)
console.print(f"File duration: {file_duration:.2f} seconds")

# Perform diarization
diarization = pipeline(test_audio_file)

# One dict per identified segment
segment_data = []

# Process each segment
for turn, _, speaker_id in diarization.itertracks(yield_label=True):
    seg_start = turn.start
    seg_end = min(turn.end, file_duration)  # Ensure we don't go beyond file duration

    # Clamping can leave an empty segment; there is nothing to embed then
    if seg_end <= seg_start:
        continue

    # Extract audio segment
    waveform, _sample_rate = audio.crop(test_audio_file, Segment(seg_start, seg_end))

    # Extract embedding; waveform[None] adds a leading batch axis
    embedding = embedding_model(waveform[None])

    # pyannote's pretrained embedding wrapper returns NaN vectors for segments
    # too short for the model; a NaN query has no meaningful neighbours.
    if not np.isfinite(embedding).all():
        console.print(
            f"[yellow]Skipped segment {seg_start:.2f}s - {seg_end:.2f}s: "
            f"no valid embedding (segment too short?)[/yellow]"
        )
        continue

    # Query ChromaDB for similar speakers
    results = collection.query(
        query_embeddings=[embedding[0].tolist()],
        n_results=3,
        include=["distances", "metadatas", "documents"]
    )

    # Store segment data
    segment_info = {
        "segment": f"Segment {len(segment_data) + 1}",
        "start": seg_start,
        "end": seg_end,
        "speaker": speaker_id,
    }

    # Add top 3 similar speakers. Index [0] selects the results of the single
    # query embedding; fewer than 3 neighbours may come back.
    neighbours = list(zip(results["metadatas"][0], results["distances"][0]))[:3]
    for rank, (metadata, distance) in enumerate(neighbours, start=1):
        segment_info[f"similar_{rank}_speaker"] = metadata["speaker"]
        segment_info[f"similar_{rank}_audio"] = metadata["audio_file"]
        # The collection was created with cosine space, so 1 - distance is the cosine similarity
        segment_info[f"similar_{rank}_similarity"] = 1.0 - distance

    segment_data.append(segment_info)

    console.print(f"Processed segment: {seg_start:.2f}s - {seg_end:.2f}s, Speaker: {speaker_id}")


In [None]:
# Tabulate the per-segment identification results (one row per segment dict)
segments_df = pd.DataFrame(data=segment_data)
console.print("\n[bold]Creating DataFrame with segment data...[/bold]")

# Show the table as the cell's output
console.print("Segment data:")
segments_df


In [None]:
# Build a compact, human-readable view of the identification results
console.print("\n[bold]Speaker identification results:[/bold]")

# Work on a copy so segments_df keeps its raw numeric columns
formatted_df = segments_df.copy()

# "12.34s - 15.67s" style time span
formatted_df["time_range"] = [
    f"{start:.2f}s - {end:.2f}s"
    for start, end in zip(formatted_df["start"], formatted_df["end"])
]

# Ranks (1..3) for which neighbour columns are present
available_ranks = [
    rank for rank in range(1, 4)
    if f"similar_{rank}_similarity" in formatted_df.columns
]

# One "speaker (audio file) - similarity" summary column per rank
for rank in available_ranks:
    neighbour_rows = zip(
        formatted_df[f"similar_{rank}_speaker"],
        formatted_df[f"similar_{rank}_audio"],
        formatted_df[f"similar_{rank}_similarity"],
    )
    formatted_df[f"match_{rank}"] = [
        f"{matched_speaker} ({matched_file}) - {similarity_value:.4f}"
        for matched_speaker, matched_file, similarity_value in neighbour_rows
    ]

# Fixed leading columns followed by the match columns that exist
display_columns = ["segment", "time_range", "speaker"] + [
    f"match_{rank}" for rank in available_ranks
]

# Display the formatted DataFrame
formatted_df[display_columns]


## 12. Success Rate Evaluation for Locuteur_A Identification

Calculate the success rate for correctly identifying Locuteur_A in the test file using 1-NN, 2-NN, and 3-NN approaches.
For each segment in the test file, we check if "Locuteur_A" is among the top k nearest neighbors.


In [None]:
# Announce the Locuteur_A success-rate evaluation
section_banner = "\n[bold]Calculating success rates for Locuteur_A identification...[/bold]"
console.print(section_banner)

# Function to check if Locuteur_A is in the top k matches
def is_locuteur_a_in_top_k(row, k, target="Locuteur_A"):
    """Return True if ``target`` appears among the first ``k`` matched speakers.

    Args:
        row: Mapping-like record (dict or pandas Series) that may hold the keys
            ``similar_1_speaker`` ... ``similar_k_speaker``.
        k: Number of top-ranked matches to inspect, starting at rank 1.
        target: Speaker name to look for; matched as a substring of the stored
            speaker name. Defaults to ``"Locuteur_A"``.

    Returns:
        bool: True as soon as one inspected match contains ``target``,
        otherwise False.
    """
    for rank in range(1, k + 1):
        key = f"similar_{rank}_speaker"
        if key not in row:
            continue
        candidate = row[key]
        # A missing neighbour appears as NaN/None once the records are in a
        # DataFrame; `in` on such a value would raise TypeError.
        if isinstance(candidate, str) and target in candidate:
            return True
    return False

# Calculate success rates for k=1, k=2, and k=3
total_segments = len(segments_df)
k_values = [1, 2, 3]

# Number of segments with Locuteur_A among the top-k matches, per k
success_counts = {
    k: sum(is_locuteur_a_in_top_k(row, k) for _, row in segments_df.iterrows())
    for k in k_values
}

if total_segments == 0:
    # Without segments a rate is undefined; avoid the division by zero
    console.print("[red]Warning: no segments to evaluate; success rates are undefined[/red]")
else:
    success_rates = [(success_counts[k] / total_segments) * 100 for k in k_values]
    for k, success_rate in zip(k_values, success_rates):
        console.print(f"{k}-NN Success Rate: {success_counts[k]}/{total_segments} segments ({success_rate:.2f}%)")

    # Create a bar chart to visualize the success rates
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(k_values, success_rates, color='skyblue')
    ax.set_xlabel('Number of Nearest Neighbors (k)', fontsize=12)
    ax.set_ylabel('Success Rate (%)', fontsize=12)
    ax.set_title('Success Rate for Identifying Locuteur_A with k-NN', fontsize=14)
    ax.set_xticks(k_values)
    # Headroom above 100 so the value label of a 100% bar stays inside the axes
    ax.set_ylim(0, 110)

    # Add value labels on top of each bar
    for k, rate in zip(k_values, success_rates):
        ax.text(k, rate + 2, f'{rate:.1f}%', ha='center', fontsize=12)

    # Add a grid for better readability
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()


## 13. Processing Negative Class Audio File

In this section, we'll:
1. Diarize the negative class audio file (`audio/test/negative_class.wav`)
2. Extract speaker embeddings for each segment
3. Query the ChromaDB collection to find the 3 most similar speakers for each segment
4. Display the results in a pandas dataframe
5. Compute the Locuteur_A Rate for 1-NN, 2-NN, and 3-NN (the share of segments with Locuteur_A among the top-k matches)
6. Show the average similarity of the 1-NN

This demonstrates how speaker embeddings can be used for speaker identification with a negative class example.


In [None]:
# Diarize the negative class audio file
# NOTE(review): this cell repeats the test-file cell in section 11 almost line
# for line; a shared function taking the audio path would remove the copy.
console.print("\n[bold]Diarizing negative class audio file...[/bold]")

# Path to the negative class audio file
negative_audio_file = "audio/test/negative_class.wav"
console.print(f"Processing: {negative_audio_file}")

# Get file duration
file_duration = audio.get_duration(negative_audio_file)
console.print(f"File duration: {file_duration:.2f} seconds")

# Perform diarization
diarization = pipeline(negative_audio_file)

# One dict per identified segment
negative_segment_data = []

# Process each segment
for turn, _, speaker_id in diarization.itertracks(yield_label=True):
    seg_start = turn.start
    seg_end = min(turn.end, file_duration)  # Ensure we don't go beyond file duration

    # Clamping can leave an empty segment; there is nothing to embed then
    if seg_end <= seg_start:
        continue

    # Extract audio segment
    waveform, _sample_rate = audio.crop(negative_audio_file, Segment(seg_start, seg_end))

    # Extract embedding; waveform[None] adds a leading batch axis
    embedding = embedding_model(waveform[None])

    # pyannote's pretrained embedding wrapper returns NaN vectors for segments
    # too short for the model; a NaN query has no meaningful neighbours.
    if not np.isfinite(embedding).all():
        console.print(
            f"[yellow]Skipped segment {seg_start:.2f}s - {seg_end:.2f}s: "
            f"no valid embedding (segment too short?)[/yellow]"
        )
        continue

    # Query ChromaDB for similar speakers
    results = collection.query(
        query_embeddings=[embedding[0].tolist()],
        n_results=3,
        include=["distances", "metadatas", "documents"]
    )

    # Store segment data
    segment_info = {
        "segment": f"Segment {len(negative_segment_data) + 1}",
        "start": seg_start,
        "end": seg_end,
        "speaker": speaker_id,
    }

    # Add top 3 similar speakers. Index [0] selects the results of the single
    # query embedding; fewer than 3 neighbours may come back.
    neighbours = list(zip(results["metadatas"][0], results["distances"][0]))[:3]
    for rank, (metadata, distance) in enumerate(neighbours, start=1):
        segment_info[f"similar_{rank}_speaker"] = metadata["speaker"]
        segment_info[f"similar_{rank}_audio"] = metadata["audio_file"]
        # The collection was created with cosine space, so 1 - distance is the cosine similarity
        segment_info[f"similar_{rank}_similarity"] = 1.0 - distance

    negative_segment_data.append(segment_info)

    console.print(f"Processed segment: {seg_start:.2f}s - {seg_end:.2f}s, Speaker: {speaker_id}")


In [None]:
# Tabulate the negative-class identification results (one row per segment dict)
negative_segments_df = pd.DataFrame(data=negative_segment_data)
console.print("\n[bold]Creating DataFrame with negative class segment data...[/bold]")

# Show the table as the cell's output
console.print("Negative class segment data:")
negative_segments_df


In [None]:
# Build a compact, human-readable view of the negative-class results
console.print("\n[bold]Speaker identification results for negative class:[/bold]")

# Work on a copy so negative_segments_df keeps its raw numeric columns
formatted_negative_df = negative_segments_df.copy()

# "12.34s - 15.67s" style time span
formatted_negative_df["time_range"] = [
    f"{start:.2f}s - {end:.2f}s"
    for start, end in zip(formatted_negative_df["start"], formatted_negative_df["end"])
]

# Ranks (1..3) for which neighbour columns are present
available_ranks = [
    rank for rank in range(1, 4)
    if f"similar_{rank}_similarity" in formatted_negative_df.columns
]

# One "speaker (audio file) - similarity" summary column per rank
for rank in available_ranks:
    neighbour_rows = zip(
        formatted_negative_df[f"similar_{rank}_speaker"],
        formatted_negative_df[f"similar_{rank}_audio"],
        formatted_negative_df[f"similar_{rank}_similarity"],
    )
    formatted_negative_df[f"match_{rank}"] = [
        f"{matched_speaker} ({matched_file}) - {similarity_value:.4f}"
        for matched_speaker, matched_file, similarity_value in neighbour_rows
    ]

# Fixed leading columns followed by the match columns that exist
display_columns = ["segment", "time_range", "speaker"] + [
    f"match_{rank}" for rank in available_ranks
]

# Display the formatted DataFrame
formatted_negative_df[display_columns]


## 14. Locuteur_A Rate Calculation for Negative Class

Calculate the Locuteur_A Rate for 1-NN, 2-NN, and 3-NN for the negative class audio file.
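The Locuteur_A Rate is defined as the percentage of segments where Locuteur_A is identified among the top k nearest neighbors. Assuming the negative class file contains no speech from Locuteur_A, every such match is a false positive, so a lower rate is better.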


In [None]:
# Calculate Locuteur_A Rates for the negative class
console.print("\n[bold]Calculating Locuteur_A Rates for negative class...[/bold]")

# Calculate Locuteur_A Rates for k=1, k=2, and k=3
total_negative_segments = len(negative_segments_df)
k_values = [1, 2, 3]

# Number of negative-class segments with Locuteur_A among the top-k matches, per k
locuteur_a_counts = {
    k: sum(is_locuteur_a_in_top_k(row, k) for _, row in negative_segments_df.iterrows())
    for k in k_values
}

if total_negative_segments == 0:
    # Without segments a rate is undefined; avoid the division by zero
    console.print("[red]Warning: no negative class segments to evaluate; rates are undefined[/red]")
else:
    locuteur_a_rates = [(locuteur_a_counts[k] / total_negative_segments) * 100 for k in k_values]
    for k, locuteur_a_rate in zip(k_values, locuteur_a_rates):
        console.print(f"{k}-NN Locuteur_A Rate: {locuteur_a_counts[k]}/{total_negative_segments} segments ({locuteur_a_rate:.2f}%)")

    # Create a bar chart to visualize the Locuteur_A Rates
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(k_values, locuteur_a_rates, color='salmon')
    ax.set_xlabel('Number of Nearest Neighbors (k)', fontsize=12)
    ax.set_ylabel('Locuteur_A Rate (%)', fontsize=12)
    ax.set_title('Locuteur_A Rate for Negative Class with k-NN', fontsize=14)
    ax.set_xticks(k_values)
    # Headroom above 100 so the value label of a 100% bar stays inside the axes
    ax.set_ylim(0, 110)

    # Add value labels on top of each bar
    for k, rate in zip(k_values, locuteur_a_rates):
        ax.text(k, rate + 2, f'{rate:.1f}%', ha='center', fontsize=12)

    # Add a grid for better readability
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()


## 15. Average Similarity of 1-NN for Negative Class

Calculate and display the average similarity of the 1-NN (first nearest neighbor) for the negative class audio file.


In [None]:
# Average similarity of the closest match (1-NN) over the negative-class segments
console.print("\n[bold]Calculating average similarity of 1-NN for negative class...[/bold]")

# The 1-NN similarity of every segment, or nothing when the column is absent
if "similar_1_similarity" in negative_segments_df.columns:
    nn1_similarities = negative_segments_df["similar_1_similarity"].tolist()
else:
    nn1_similarities = []

if not nn1_similarities:
    console.print("No 1-NN similarities found for negative class.")
else:
    # Arithmetic mean of the collected similarities
    avg_similarity = sum(nn1_similarities) / len(nn1_similarities)
    console.print(f"Average 1-NN similarity for negative class: {avg_similarity:.4f}")

    # Histogram of the similarities, with the mean marked by a dashed line
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(nn1_similarities, bins=10, color='lightgreen', edgecolor='black')
    ax.axvline(avg_similarity, color='red', linestyle='dashed', linewidth=2, label=f'Average: {avg_similarity:.4f}')
    ax.set_xlabel('1-NN Similarity', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title('Distribution of 1-NN Similarities for Negative Class', fontsize=14)
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()
