In [1]:
from datasets import load_dataset, ClassLabel, load_from_disk
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoModelForAudioClassification
import transformers
from utils.dataset_utils import CremaDataset
import os
from tqdm.auto import tqdm
import wandb
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
crema_dataset = CremaDataset("distil-whisper/distil-medium.en")
crema_dataset.set_vector("actor-vector.hf", True)


In [4]:
BATCH_SIZE = 16

torch.manual_seed(42)
test_dataloader = DataLoader(crema_dataset.test_dataset, batch_size=BATCH_SIZE)


num_labels = len(crema_dataset.id2label)

transformers.set_seed(42)
model = AutoModelForAudioClassification.from_pretrained(
    crema_dataset.model_id, num_labels=num_labels, label2id=crema_dataset.label2id, id2label=crema_dataset.id2label
)


Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at distil-whisper/distil-medium.en and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
EPOCHS=1

num_training_steps = EPOCHS * len(test_dataloader)

wandb.init(project="crema_evaluation_with_fairness", 
           name="evaluation_run",
           config={})

validation_results = {"ActorID": [], "true_labels": [], "predictions": []}

In [27]:
model.to(device)
model.load_state_dict(torch.load("model_latest.pth", map_location=device))

<All keys matched successfully>

In [31]:
progress_bar = tqdm(range(num_training_steps))

model.eval()
for batch in test_dataloader:
    actor_ids = [np.int64(id) for id in batch["path"]]
    true_labels_batch = batch["labels"].numpy()
    
    with torch.no_grad():
        inputs = {"input_features": batch["input_features"].to(device), "labels": batch["labels"].to(device)}
        outputs = model(**inputs)
        preds = outputs.logits.argmax(dim=-1).cpu().numpy()
    # Append results to dictionary
    validation_results["ActorID"].extend(actor_ids)
    validation_results["true_labels"].extend(true_labels_batch)
    validation_results["predictions"].extend(preds)
    
    progress_bar.update(1)


  0%|          | 0/140 [00:00<?, ?it/s]

In [32]:
results_df = pd.DataFrame(validation_results)

demographics_path =  "VideoDemographics.csv"
demographics_df = pd.read_csv(demographics_path)

In [33]:
demographics_df.head()

Unnamed: 0,ActorID,Age,Sex,Race,Ethnicity
0,1001,51,Male,Caucasian,Not Hispanic
1,1002,21,Female,Caucasian,Not Hispanic
2,1003,21,Female,Caucasian,Not Hispanic
3,1004,42,Female,Caucasian,Not Hispanic
4,1005,29,Male,African American,Not Hispanic


In [34]:
results_df.head()

Unnamed: 0,ActorID,true_labels,predictions
0,1031,3,3
1,1004,3,3
2,1035,0,0
3,1044,1,4
4,1053,2,4


In [35]:
type(results_df.loc[0, "ActorID"]), type(demographics_df.loc[0, "ActorID"])

(numpy.int64, numpy.int64)

In [36]:
# Merge predictions with demographic info using actor ID
merged_data = results_df.merge(demographics_df, on="ActorID")

In [40]:
true_labels = merged_data["true_labels"]
predictions = merged_data["predictions"]


In [41]:
# Classification Report
report = classification_report(true_labels, predictions, output_dict=True)
wandb.log({"classification_report": report})

In [42]:
# Confusion Matrix
conf_matrix = confusion_matrix(true_labels, predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=np.unique(true_labels), yticklabels=np.unique(true_labels))
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
wandb.log({"confusion_matrix": wandb.Image(fig)})
plt.close(fig)

In [44]:
accuracy_score

<function sklearn.metrics._classification.accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None)>

In [63]:
sensitive_features = merged_data["Race"]  # Replace with demographic column
metric_frame = MetricFrame(
    metrics={
        'accuracy': accuracy_score,
        'f1_score': lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted', pos_label=1)
    },
    y_true=list(true_labels),
    y_pred=list(predictions),
    sensitive_features=list(sensitive_features),
)

In [60]:
demographic_parity_diff = demographic_parity_difference(
    true_labels, predictions, sensitive_features=sensitive_features
)
demographic_parity_diff

0.022237344346411364

In [55]:
print(list(true_labels)[:5], list(predictions)[:5])

[3, 3, 0, 1, 2] [3, 3, 0, 4, 4]


In [61]:
equalized_odds_diff = equalized_odds_difference(
    list(true_labels), list(predictions), sensitive_features=list(sensitive_features)
)

ValueError: If pos_label is not specified, values must be from {0, 1} or {-1, 1}

In [64]:
equalized_odds_diff = 0

In [65]:
wandb.log({
    "fairness_metrics": {
        "demographic_parity_difference": demographic_parity_diff,
        "equalized_odds_difference": equalized_odds_diff
    },
    "group_metrics": metric_frame.by_group.to_dict()
})

print("Final metrics and fairness assessment logged.")
wandb.finish()

Final metrics and fairness assessment logged.
