In [19]:
from datasets import load_dataset, load_from_disk
import torch
import os
import pandas as pd
from transformers import AutoFeatureExtractor
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from transformers import AutoModelForAudioClassification
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
dataset = load_dataset("myleslinder/crema-d", trust_remote_code=True, split='train')
id2label = {'0': 'neutral', '1': 'happy', '2': 'sad', '3': 'anger', '4': 'fear', '5': 'disgust'}
label2id = {v: k for k, v in id2label.items()}

# Select necessary columns
dataset = dataset.select_columns(['audio', 'label'])
dataset = dataset.train_test_split(test_size=0.3)

model_id = "distil-whisper/distil-medium.en"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

In [6]:
demographics_path = "VideoDemographics.csv"
demographics = pd.read_csv(demographics_path)
demographics.head()

Unnamed: 0,ActorID,Age,Sex,Race,Ethnicity
0,1001,51,Male,Caucasian,Not Hispanic
1,1002,21,Female,Caucasian,Not Hispanic
2,1003,21,Female,Caucasian,Not Hispanic
3,1004,42,Female,Caucasian,Not Hispanic
4,1005,29,Male,African American,Not Hispanic


In [9]:
max_duration = 30.0

if os.path.exists("vector.hf"):
    vectorized_dataset = load_from_disk("vector.hf")
else:
    def process(ds):
        audio_arrays = [x["array"] for x in ds["audio"]]
        inputs = feature_extractor(
            audio_arrays,
            sampling_rate=feature_extractor.sampling_rate,
            max_length=int(feature_extractor.sampling_rate * max_duration),
            truncation=True,
        )
        return inputs

    vectorized_dataset = dataset.map(process, remove_columns="audio", batched=True, batch_size=16, num_proc=1)
    vectorized_dataset.save_to_disk("vector.hf")

vectorized_dataset.set_format("torch")
vectorized_dataset = vectorized_dataset.rename_column("label", "labels")

train_dataset = vectorized_dataset["train"]
test_dataset = vectorized_dataset["test"]

In [10]:
BATCH_SIZE = 8

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = AutoModelForAudioClassification.from_pretrained(model_id, num_labels=len(id2label))
model.to(device)

# Load pre-trained model weights
model.load_state_dict(torch.load("model.pth"))

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at distil-whisper/distil-medium.en and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [11]:
def statistical_parity(labels, preds, sensitive_attr):
    groups = np.unique(sensitive_attr)
    rates = {}
    for group in groups:
        group_mask = (sensitive_attr == group)
        rates[group] = np.mean(preds[group_mask])
    return rates

def demographic_parity(labels, preds, sensitive_attr):
    positive_rate = np.mean(preds)
    parity = {}
    for group in np.unique(sensitive_attr):
        group_mask = (sensitive_attr == group)
        parity[group] = np.mean(preds[group_mask]) - positive_rate
    return parity

def equality_of_odds(labels, preds, sensitive_attr):
    metrics = {}
    for group in np.unique(sensitive_attr):
        group_mask = (sensitive_attr == group)
        tn, fp, fn, tp = confusion_matrix(labels[group_mask], preds[group_mask]).ravel()
        metrics[group] = {
            'tpr': tp / (tp + fn),
            'fpr': fp / (fp + tn)
        }
    return metrics


In [12]:
model.eval()
all_labels = []
all_preds = []
all_sensitive_attr = []


In [25]:
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        inputs = {key: val.to(device) for key, val in batch.items()}
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

        # Extract actor IDs and cross-reference with demographics
        actor_ids = [dataset['train'][i]['audio']['path'].split('/')[-1][:4] for i in range(len(batch['labels']))]
        print(actor_ids)
        actor_demographics = demographics[demographics['ActorID'].isin(actor_ids)]

        # Collect sensitive attribute (e.g., gender)
        all_sensitive_attr.extend(actor_demographics['Sex'].values)  # Adjust column name as necessary

Evaluating:   0%|          | 0/280 [00:00<?, ?it/s]

['1041', '1039', '1060', '1085', '1076', '1005', '1090', '1046']
['1041', '1039', '1060', '1085', '1076', '1005', '1090', '1046']


KeyboardInterrupt: 

In [16]:
report = classification_report(all_labels, all_preds, target_names=id2label.values())


In [17]:
print(report)


              precision    recall  f1-score   support

     neutral       0.87      0.86      0.86       324
       happy       0.84      0.86      0.85       375
         sad       0.71      0.72      0.72       391
       anger       0.92      0.85      0.89       368
        fear       0.72      0.74      0.73       373
     disgust       0.77      0.78      0.78       410

    accuracy                           0.80      2241
   macro avg       0.81      0.80      0.80      2241
weighted avg       0.80      0.80      0.80      2241



In [20]:
sp = statistical_parity(all_labels, all_preds, all_sensitive_attr)
dp = demographic_parity(all_labels, all_preds, all_sensitive_attr)
eoo = equality_of_odds(all_labels, all_preds, all_sensitive_attr)


In [21]:
print("Statistical Parity:", sp)
print("Demographic Parity:", dp)
print("Equality of Odds:", eoo)

Statistical Parity: {}
Demographic Parity: {}
Equality of Odds: {}


In [24]:
all_sensitive_attr

[]