In [13]:
from datasets import load_from_disk

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
from tqdm import tqdm
from collections import Counter

In [49]:
# Load the dataset from the folder path
dataset = load_from_disk("../results/gysbert_embs_300")

# Optionally convert to pandas DataFrame
df = dataset.to_pandas()

# Filter only MALE and FEMALE
df = df[df['gender'].isin(['MALE', 'FEMALE'])]

### TF-IDF

In [7]:
# Store per-class results
results = {
    'MALE': {'precision': [], 'recall': [], 'f1-score': []},
    'FEMALE': {'precision': [], 'recall': [], 'f1-score': []},
    'accuracy': []
}

In [27]:
import string

def clean_text(text):
    # Lowercase and remove punctuation
    text = text.lower()
    return text.translate(str.maketrans('', '', string.punctuation))

In [44]:
# Run 50 iterations
for i in tqdm(range(50), desc="Evaluations"):
    # Balance the dataset
    male_df = df[df['gender'] == 'MALE']
    female_df = df[df['gender'] == 'FEMALE']
    min_size = min(len(male_df), len(female_df))

    male_sample = resample(male_df, n_samples=min_size, random_state=i)
    female_sample = resample(female_df, n_samples=min_size, random_state=i)

    balanced_df = pd.concat([male_sample, female_sample]).sample(frac=1, random_state=i).reset_index(drop=True)

    # Features and labels
    X = balanced_df['speech_chunk']
    y = balanced_df['gender']

    # TF-IDF
    vectorizer = TfidfVectorizer(
    ngram_range=(2, 3),
    min_df=3,
    max_df=0.9,
    preprocessor=clean_text
)
    X_tfidf = vectorizer.fit_transform(X)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=i, stratify=y)

    # Train model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # Evaluate
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    for gender in ['MALE', 'FEMALE']:
        for metric in ['precision', 'recall', 'f1-score']:
            results[gender][metric].append(report[gender][metric])
    results['accuracy'].append(report['accuracy'])

# Calculate and display averages
print("\n✅ Average Results over 50 iterations:")
for gender in ['MALE', 'FEMALE']:
    print(f"\n{gender}:")
    for metric in ['precision', 'recall', 'f1-score']:
        avg = np.mean(results[gender][metric])
        print(f"  {metric.capitalize()}: {avg:.4f}")

print(f"\nOverall Accuracy: {np.mean(results['accuracy']):.4f}")

Evaluations: 100%|██████████| 50/50 [00:58<00:00,  1.16s/it]


✅ Average Results over 50 iterations:

MALE:
  Precision: 0.7543
  Recall: 0.7608
  F1-score: 0.7569

FEMALE:
  Precision: 0.7609
  Recall: 0.7527
  F1-score: 0.7561

Overall Accuracy: 0.7567





### Embeddings

In [50]:
# Run 50 iterations
for i in tqdm(range(50), desc="Evaluations"):
    # Balance the dataset
    male_df = df[df['gender'] == 'MALE']
    female_df = df[df['gender'] == 'FEMALE']
    min_size = min(len(male_df), len(female_df))

    male_sample = resample(male_df, n_samples=min_size, random_state=i)
    female_sample = resample(female_df, n_samples=min_size, random_state=i)

    balanced_df = pd.concat([male_sample, female_sample]).sample(frac=1, random_state=i).reset_index(drop=True)

    # Prepare features and labels
    X = np.vstack(balanced_df['embedding'].values)
    y = balanced_df['gender'].values

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i, stratify=y)

    # Print class sizes
    train_counts = Counter(y_train)
    test_counts = Counter(y_test)
    #print(f"\nIteration {i + 1}")
    #print(f"  Train set → MALE: {train_counts['MALE']}, FEMALE: {train_counts['FEMALE']}")
    #print(f"  Test set  → MALE: {test_counts['MALE']}, FEMALE: {test_counts['FEMALE']}")

    # Train classifier
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # Evaluate
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    for gender in ['MALE', 'FEMALE']:
        for metric in ['precision', 'recall', 'f1-score']:
            results[gender][metric].append(report[gender][metric])
    results['accuracy'].append(report['accuracy'])

# Aggregate and print results
print("\n✅ Average Results over 50 iterations:")
for gender in ['MALE', 'FEMALE']:
    print(f"\n{gender}:")
    for metric in ['precision', 'recall', 'f1-score']:
        avg = np.mean(results[gender][metric])
        print(f"  {metric.capitalize()}: {avg:.4f}")

print(f"\nOverall Accuracy: {np.mean(results['accuracy']):.4f}")

Evaluations: 100%|██████████| 50/50 [00:17<00:00,  2.92it/s]


✅ Average Results over 50 iterations:

MALE:
  Precision: 0.7516
  Recall: 0.7547
  F1-score: 0.7525

FEMALE:
  Precision: 0.7558
  Recall: 0.7510
  F1-score: 0.7527

Overall Accuracy: 0.7528



