## Plot utterance embeddings with UMAP

In [None]:
# Import torch in this cell so it runs on its own: the notebook's main
# import cell comes later, and without this line the cell fails with
# NameError when the notebook is executed top to bottom.
import torch

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

In [None]:
import pandas as pd
import yaml
import pathlib
from sentence_transformers import SentenceTransformer
import numpy as np
import matplotlib.pyplot as plt
import umap  # Requires: pip install umap-learn
import torch
from sklearn.metrics.pairwise import cosine_distances
import seaborn as sns  # For better plots, optional: pip install seaborn

# Define paths and model name
config_path = 'path/to/your/config.yaml'  # Replace with your actual config path
model_name = 'all-mpnet-base-v2'
patients_df = pd.read_csv('patients.csv')  # Replace with your patients file path
controls_df = pd.read_csv('controls.csv')  # Replace with your controls file path

# Extract the 'utterance' column of each table as a list of Python strings.
# pandas reads empty CSV cells as NaN (a float), which is not valid text for
# the sentence encoder, so missing rows are dropped and every remaining value
# is coerced to str before it is handed on.
patients = patients_df['utterance'].dropna().astype(str).tolist()
controls = controls_df['utterance'].dropna().astype(str).tolist()

# Load the concept list (later averaged into the centroid) from the YAML config.
# yaml.safe_load is used, so the file is parsed without constructing
# arbitrary Python objects.
cfg = yaml.safe_load(pathlib.Path(config_path).read_text(encoding="utf-8"))
concepts = cfg["stories"][0]["concepts"]  # concepts of the first story entry

# Validate with an explicit exception rather than `assert`, because assert
# statements are removed when Python runs with -O and the check would vanish.
if len(concepts) != 34:
    raise ValueError(f"Expected 34 concepts, got {len(concepts)}")

# Choose the compute device: CUDA when torch reports it available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Instantiate the sentence-embedding model named by `model_name`.
embedder = SentenceTransformer(model_name)

# Encode the concept strings as normalised tensors on the chosen device.
concept_embeds = embedder.encode(
    concepts,
    convert_to_tensor=True,
    normalize_embeddings=True,
    device=device,
)

# Average over the concept axis; keepdim=True retains that axis so the result
# stays two-dimensional. Move it to host memory as a NumPy array so it can be
# used with the NumPy-based code that follows.
centroid = concept_embeds.mean(dim=0, keepdim=True).cpu().numpy()

# Embed the patient and control utterances with identical encoder settings
# (chosen device, batches of 32, progress bar shown, normalised vectors).
encode_options = dict(
    device=device,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True,
)
patient_embeddings = embedder.encode(patients, **encode_options)
control_embeddings = embedder.encode(controls, **encode_options)

# Cosine distance (1 - cosine similarity) from every utterance to the centroid.
# The centroid is passed as a single row, and the resulting distance matrix
# is flattened to a 1-D array per group.
centroid_row = centroid.reshape(1, -1)
patient_distances = cosine_distances(patient_embeddings, centroid_row).flatten()
control_distances = cosine_distances(control_embeddings, centroid_row).flatten()

# Report the mean and standard deviation of the distances for each group.
patient_mean, patient_std = np.mean(patient_distances), np.std(patient_distances)
control_mean, control_std = np.mean(control_distances), np.std(control_distances)
print(f"Average cosine distance for patients: {patient_mean:.4f} (std: {patient_std:.4f})")
print(f"Average cosine distance for controls: {control_mean:.4f} (std: {control_std:.4f})")

# Plot overlaid histograms of the distances (left panel of a 1x2 figure)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
# Compute ONE set of 30 bin edges from the pooled distances and reuse it for
# both groups. Passing bins=30 to each hist() call separately would derive the
# edges from each group's own min/max, giving bars of different width and
# position that cannot be compared visually.
bin_edges = np.histogram_bin_edges(
    np.concatenate([patient_distances, control_distances]), bins=30
)
plt.hist(patient_distances, bins=bin_edges, alpha=0.7, label='Patients', color='red')
plt.hist(control_distances, bins=bin_edges, alpha=0.7, label='Controls', color='blue')
plt.xlabel('Cosine Distance to Centroid')
plt.ylabel('Frequency')
plt.title('Distribution of Distances to Centroid')
plt.legend()

# UMAP plot with centroid (right panel of the 1x2 figure)
from matplotlib.lines import Line2D  # used only for the hand-built legend below

# Stack both groups into one matrix so a single UMAP model embeds them jointly;
# `labels` records the group of each row in the same order as the stack.
all_embeddings = np.vstack([patient_embeddings, control_embeddings])
labels = ['patient'] * len(patients) + ['control'] * len(controls)

# Fit a 2-component UMAP with the cosine metric and a fixed random_state of 42.
reducer = umap.UMAP(n_components=2, random_state=42, metric='cosine')
embeddings_2d = reducer.fit_transform(all_embeddings)

# Project centroid into 2D using the same (already fitted) reducer
centroid_2d = reducer.transform(centroid.reshape(1, -1))

plt.subplot(1, 2, 2)
# Per-point colours: red for patients, blue for controls.
colors = ['red' if label == 'patient' else 'blue' for label in labels]
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors, alpha=0.6)
# No `label=` here: the legend is built from explicit handles below, so an
# artist label on this scatter would never be displayed.
plt.scatter(centroid_2d[:, 0], centroid_2d[:, 1], c='green', s=100, marker='*')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('UMAP Embeddings with Centroid')

# Build the legend by hand: the utterance scatter is a single artist with
# per-point colours, so automatic legend entries could not show one entry
# per group.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Patient'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=10, label='Control'),
    Line2D([0], [0], marker='*', color='w', markerfacecolor='green', markersize=15, label='Centroid'),
]
plt.legend(handles=legend_elements)

plt.tight_layout()
plt.show()