In [None]:
import random
from collections import namedtuple

import numpy as np
import pandas as pd
from datasets import load_dataset

# Seed the stdlib and NumPy global random generators so that draws from them
# are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [None]:
# Constants

# Target size of the validation and test splits, as fractions of the full dataset.
VAL_FRAC = 0.002
TEST_FRAC = 0.05

# Genders that are counted when balancing a split; nonbinary speakers are added
# separately and are not counted.
GENDERS = ["female", "male"]


# Top-level dialects that sub-dialects are collapsed into.
DIALECTS = [
    "Bornholmsk",
    "Fynsk",
    "Københavnsk",
    "Nordjysk",
    "Sjællandsk",
    "Sydømål",
    "Sønderjysk",
    "Vestjysk",
    "Østjysk",
]


# Age groups as (min, max) bounds; `age_to_group` treats `min` as inclusive and
# `max` as exclusive. The last group uses a very large upper bound as "no limit".
age_group = namedtuple("age_group", ["min", "max"])
AGE_GROUPS = {
    "0-24": age_group(0, 25),
    "25-49": age_group(25, 50),
    "50-": age_group(50, int(1e6)),
}

# A speaker is "native" when their native language is Danish, otherwise "foreign".
ACCENTS = ["native", "foreign"]

# In the test set, we want to have samples that represent at least:
# 40% of each gender (ignore nonbinary?)
# 10% of each dialect
# 20% of each age group
# 5% with a foreign accent
# Currently, the code only enforces the dialect criterion.
# Speakers of each dialect are added until that dialect has at least
# `DIALECT_CRITERA * 100`% of the split's target size (dataset size * split fraction).
# NOTE: the value below is 0.08, i.e. lower than the 10% dialect goal listed above.
DIALECT_CRITERA = 0.08

# Map Dialects as given in the huggingface coral dataset to the values in the dialect excel sheet column "Underregionsprog".
# One note, in `Underregionsprog` we have `Amagermål`. Should we use `Københavnsk` instead?
# Using `Københavnsk` for now.
# Keys are the labels applied to the `dialect` column of the dataframe via
# `.map(...)`; values are entries of `DIALECTS`.
SUB_DIALECT_TO_DIALECT = {
    "Sydsjællandsk (sydligt sydsjællandsk)": "Sjællandsk",
    "Vestjysk": "Vestjysk",
    "Amagermål": "Københavnsk",
    "Sjællandsk": "Sjællandsk",
    "Fynsk": "Fynsk",
    "Midtøstjysk": "Østjysk",
    "Himmerlandsk": "Nordjysk",
    "Østjysk": "Østjysk",
    "Nørrejysk": "Nordjysk",
    "Thybomål": "Nordjysk",
    "Mellemslesvisk": "Sønderjysk",
    "Sønderjysk": "Sønderjysk",
    "Østligt sønderjysk (m. Als)": "Sønderjysk",
    "Djurslandsk (Nord-, Syddjurs m. Nord- og Sydsamsø, Anholt)": "Østjysk",
    "Sydvestjysk (m. Fanø)": "Vestjysk",
    "Vendsysselsk (m. Hanherred og Læsø)": "Nordjysk",
    "Nordfalstersk": "Sjællandsk",
    "Nordsjællandsk": "Sjællandsk",
    "Morsingmål": "Nordjysk",
    "Sallingmål": "Vestjysk",
    "Nordvestsjællandsk": "Sjællandsk",
    "Sydøstjysk": "Østjysk",
    "Vestlig sønderjysk (m. Mandø og Rømø)": "Sønderjysk",
    "Sydømål": "Sydømål",
    "Bornholmsk": "Bornholmsk",
    "Ommersysselsk": "Østjysk",
    "Lollandsk": "Sjællandsk",
    "Vestfynsk (nordvest-, sydvestfynsk)": "Fynsk",
    "Langelandsk": "Fynsk",
    "Sydfynsk": "Fynsk",
    "Sydvestsjællandsk": "Sjællandsk",
    "Østsjællandsk": "Sjællandsk",
}
# Sanity check: every entry of DIALECTS is the target of at least one sub-dialect,
# and no sub-dialect maps to something outside DIALECTS.
assert set(SUB_DIALECT_TO_DIALECT.values()) == set(DIALECTS)

In [None]:
# Some helper functions
def age_to_group(age: int) -> str:
    """Look up the age group that a speaker's age falls into.

    Args:
        age (int):
            Age of the speaker.

    Returns:
        group (str):
            Key of the matching entry in `AGE_GROUPS`.

    Raises:
        ValueError:
            If no entry in `AGE_GROUPS` covers the age.
    """
    # Lower bounds are inclusive, upper bounds are exclusive
    matches = (
        name for name, (lower, upper) in AGE_GROUPS.items() if lower <= age < upper
    )
    group = next(matches, None)
    if group is None:
        raise ValueError(f"Age {age} not in any group.")
    return group


def get_speakers(df: pd.DataFrame) -> list:
    """Collect the distinct speaker ids found in a dataframe.

    Args:
        df (pd.DataFrame):
            Dataframe with an `id_speaker` column.

    Returns:
        speakers (list):
            Each speaker id once, in order of first appearance.
    """
    return pd.unique(df["id_speaker"]).tolist()


def get_speaker_df(df: pd.DataFrame, dialect: str) -> pd.DataFrame:
    """Build a one-row-per-speaker dataframe for a single dialect.

    Args:
        df (pd.DataFrame):
            Dataframe with `dialect` and `id_speaker` columns.
        dialect (str):
            Dialect to keep.

    Returns:
        df_speaker (pd.DataFrame):
            The first row of every speaker that has the given dialect.
    """
    has_dialect = df["dialect"] == dialect
    return df.loc[has_dialect].drop_duplicates(subset=["id_speaker"])


def get_probs(scores: list[float]) -> list[float]:
    """Turn scores into probabilities with a numerically stable softmax.

    Args:
        scores (list[float]):
            List of scores.

    Returns:
        list[float]:
            List of probabilities, one per score, summing to 1. An empty input
            gives an empty list.
    """
    scores_arr = np.asarray(scores, dtype=float)
    if scores_arr.size == 0:
        return []

    # Softmax is invariant to adding a constant to every score. Subtracting the
    # maximum keeps every exponent <= 0, so `np.exp` cannot overflow to inf and
    # produce NaN probabilities for large scores.
    exp = np.exp(scores_arr - scores_arr.max())
    probs = exp / exp.sum()
    return probs.tolist()


def give_score(row: pd.Series, age_group_weights: dict, accent_weights: dict) -> float:
    """Compute the sampling score of a row from its age group and accent.

    Args:
        row (pd.Series):
            Row with an `age_group` and an `accent` entry.
        age_group_weights (dict):
            Weight for every age group.
        accent_weights (dict):
            Weight for every accent.

    Returns:
        score (float):
            Sum of the row's age group weight and accent weight.
    """
    age_group_score = age_group_weights[row["age_group"]]
    accent_score = accent_weights[row["accent"]]
    return age_group_score + accent_score


def random_sample(samples: list[str], seen: set[str], probs: list[float]) -> str:
    """Take a random weighted sample that has not been seen before.

    Draws from `samples` with NumPy's global random generator and redraws until
    the result is not in `seen`.

    Args:
        samples (list[str]):
            List of samples.
        seen (set[str]):
            Set of seen samples.
        probs (list[float]):
            Probability of each sample, or None for a uniform draw.

    Returns:
        sample (str):
            A sample that is not in `seen`.

    Raises:
        AssertionError:
            If every sample has already been seen.
        ValueError:
            If unseen samples exist but all of them have zero probability.
    """
    is_unseen = [sample not in seen for sample in samples]

    # Raised explicitly (same exception type as before) so the guard still works
    # when Python runs with assertions disabled; without it the loop below would
    # never terminate.
    if not any(is_unseen):
        raise AssertionError("No more samples to sample from")

    # The rejection loop below can only finish if an unseen sample can actually
    # be drawn.
    if probs is not None:
        unseen_mass = sum(p for p, unseen in zip(probs, is_unseen) if unseen)
        if not unseen_mass > 0:
            raise ValueError("All unseen samples have zero probability")

    sample = np.random.choice(samples, p=probs)
    while sample in seen:
        sample = np.random.choice(samples, p=probs)
    return sample

# Load the coral dataset

In [None]:
dataset_id = "alexandrainst/coral"

# Only the metadata is needed here, so the audio column is dropped right after loading
coral = load_dataset(path=dataset_id, split="train").remove_columns("audio")

df = pd.DataFrame(coral)
coral_length = len(coral)

## Data exploration

In [None]:
# Number of samples per gender
df["gender"].value_counts().plot.bar(title="Gender distribution");

In [None]:
# How many distinct speakers have the gender "nonbinary"
df.loc[df["gender"] == "nonbinary", "id_speaker"].nunique()

In [None]:
# Collapse sub-dialects into the top-level DIALECTS. Top-level names are also
# mapped to themselves so that re-running this cell is harmless: with the plain
# SUB_DIALECT_TO_DIALECT mapping, a second run would turn "Københavnsk" and
# "Nordjysk" (which are not keys of that mapping) into NaN.
# Labels that appear in neither mapping still become NaN, as before.
df["dialect"] = df["dialect"].map(
    {**SUB_DIALECT_TO_DIALECT, **{dialect: dialect for dialect in DIALECTS}}
)

df["dialect"].value_counts().plot(kind="bar", title="Dialect distribution");

In [None]:
# How many distinct speakers have the dialect "Sydømål"
df.loc[df["dialect"] == "Sydømål", "id_speaker"].nunique()

In [None]:
# Speakers whose native language is Danish ("da") are "native", all others "foreign"
df["accent"] = df["language_native"].map(
    lambda language: "native" if language == "da" else "foreign"
)
df["accent"].value_counts().plot.bar(title="Native language distribution");

In [None]:
# Bucket every speaker's age into one of the AGE_GROUPS
df["age_group"] = df["age"].map(age_to_group)
df["age_group"].value_counts().plot.bar(title="Age group distribution");

### Notes

- Kun 1 speaker med dialekt "Sydømål".
- Kun 2 speakers med "nonbinary" gender.

Plan for at bygge test- og valideringsdatasæt:
1. Inkluder samples med hensyn til "Sydømål" dialekt og "nonbinary" gender.
2. Inkluder samples for hver dialekt, én dialekt ad gangen.
   1. Find alle speakers for det køn som er underrepræsenteret.
   2. Lav sampling med vægtning baseret på aldersgruppe og accent (native/foreign).

## Make splits

In [None]:
# Speakers that have already been assigned to the test or validation split.
# The set is shared by both splits so that a speaker is never sampled twice.
seen_speakers = set()

In [None]:
class Dataset:
    """Dataset class to keep track of the samples in the dataset.

    Args:
        df (pd.DataFrame):
            Dataframe of the Coral dataset.
        frac (float):
            Approximate size of this dataset as a fraction of the Coral dataset.

    Attributes:
        df (pd.DataFrame):
            Dataframe of the Coral dataset.
        frac (float):
            Approximate size of this dataset as a fraction of the Coral dataset.
        indices (list):
            List of indices of the Coral dataset that will be included in the dataset.
        gender_count (dict):
            Keep track of the number of samples in the dataset for each gender.
        dialect_count (dict):
            Keep track of the number of samples in the dataset for each dialect.
        age_group_count (dict):
            Keep track of the number of samples in the dataset for each age group.
        accent_count (dict):
            Keep track of the number of samples in the dataset for each accent.
        age_group_weights (dict):
            Age group weights used to calculate the score of a sample.
        accent_weights (dict):
            Accent weights used to calculate the score of a sample.
        betas (dict):
            Per-feature boost applied to the weight of the least represented value.
    """

    def __init__(self, df: pd.DataFrame, frac: float) -> None:
        """Initialize the Dataset class."""
        self.df = df
        self.frac = frac
        self.indices = []

        # Initialize counts to 1 to avoid division by zero
        self.gender_count = {gender: 1 for gender in GENDERS}
        self.dialect_count = {dialect: 1 for dialect in DIALECTS}
        self.age_group_count = {group: 1 for group in AGE_GROUPS.keys()}
        self.accent_count = {accent: 1 for accent in ACCENTS}

        # The betas must exist before the first weights are built, so that the
        # initial weights use the same per-feature boost as every later update
        # (otherwise the accent weights would start out with the default beta).
        self.betas = {"age_group": 5.0, "accent": 0.5}
        self._update_weights()

    def add_speaker_samples(self, speaker: str) -> None:
        """Add all samples of a speaker to the dataset.

        Extends `indices` with the speaker's rows, increases the counts by the
        number of rows, and recomputes the weights.

        Args:
            speaker (str):
                The id of the speaker
        """
        speaker_samples = self.df[self.df["id_speaker"] == speaker]
        n_samples = len(speaker_samples)
        indices = speaker_samples.index.tolist()
        self.indices.extend(indices)

        # Assuming that all samples of a speaker have the
        # same gender, dialect, age_group, and native_language
        row = speaker_samples.iloc[0]
        gender = row["gender"]
        dialect = row["dialect"]
        age_group = age_to_group(age=row["age"])
        accent = row["accent"]

        # Don't count nonbinary
        if gender != "nonbinary":
            self.gender_count[gender] += n_samples
        self.dialect_count[dialect] += n_samples
        self.age_group_count[age_group] += n_samples
        self.accent_count[accent] += n_samples

        self._update_weights()

    def _update_weights(self) -> None:
        """Update the weights of the age group and accent."""
        self.age_group_weights = self._make_weights(
            count=self.age_group_count, beta=self.betas["age_group"]
        )
        self.accent_weights = self._make_weights(
            count=self.accent_count, beta=self.betas["accent"]
        )

    def _make_weights(self, count: dict, beta: float = 5.0) -> dict:
        """Make weights based on counts.

        Weights are the normalized inverse counts, so rarer values get larger
        weights. The largest weight is then multiplied by `1 + beta`; on ties the
        first key in `count` gets the boost.

        Args:
            count (dict):
                Counts for a feature.
            beta (float, optional):
                Shift the weights of the least represented feature.

        Returns:
            weights (dict):
                Weights for the feature.
        """
        inv_count = {key: 1 / value for key, value in count.items()}
        normalizer = sum(inv_count.values())
        weights = {key: value / normalizer for key, value in inv_count.items()}

        # Increase chance of sampling the least represented feature
        max_key = max(weights, key=weights.get)
        weights[max_key] += weights[max_key] * beta
        return weights

    def __repr__(self) -> str:
        """Representation of the Dataset class."""
        return (
            f"Gender count: {self.gender_count}\n"
            f"Dialect count: {self.dialect_count}\n"
            f"Age group count: {self.age_group_count}\n"
            f"Accent count: {self.accent_count}"
        )

    def __len__(self) -> int:
        """Length of the dataset."""
        return len(self.indices)

### Make test split

In [None]:
# Tracker for the test split, targeting roughly TEST_FRAC of the full dataset.
test_dataset = Dataset(df=df, frac=TEST_FRAC)

Tilføj samples for nonbinary gender (kun to speakers, så tag en speaker til test og en til validation).

In [None]:
df_nonbinary = df[df["gender"] == "nonbinary"]
speakers = get_speakers(df=df_nonbinary)
# Filter with a list comprehension instead of a set difference: a set of strings
# has no stable iteration order across kernel restarts, so the seeded draw below
# could otherwise pick a different speaker from run to run.
speakers = [candidate for candidate in speakers if candidate not in seen_speakers]

speaker = random_sample(samples=speakers, seen=seen_speakers, probs=None)
test_dataset.add_speaker_samples(speaker=speaker)
seen_speakers.add(speaker)

test_dataset

Tilføj samples for Sydømål (kun én speaker).

In [None]:
df_sydømål = df[df["dialect"] == "Sydømål"]

speakers = get_speakers(df=df_sydømål)
# Keep the order of first appearance (a set of strings has no stable order
# across kernel restarts), so `speakers[0]` is always the same speaker.
speakers = [candidate for candidate in speakers if candidate not in seen_speakers]

speaker = speakers[0]
test_dataset.add_speaker_samples(speaker=speaker)
seen_speakers.add(speaker)

test_dataset

Tilføj samples for de resterende dialekter, én ad gangen.

In [None]:
# Samples with the dialect `Sydømål` have already been added.
# Keep the order of DIALECTS: going through a set would process the dialects in
# a different order after every kernel restart, and the order influences which
# speakers get sampled.
dialects = [dialect for dialect in DIALECTS if dialect != "Sydømål"]


def get_dialect_samples(dataset: Dataset, dialects: list[str]) -> Dataset:
    """Add speakers to the dataset until every dialect is sufficiently represented.

    For each dialect, speakers are added one at a time until the dialect has at
    least `DIALECT_CRITERA` of the dataset's target size
    (`coral_length * dataset.frac`). Each speaker is drawn from the gender that is
    currently least frequent in the dataset, weighted by age group and accent.

    Relies on the notebook-level `coral_length`, `DIALECT_CRITERA` and
    `seen_speakers`; sampled speakers are added to `seen_speakers`.

    Args:
        dataset (Dataset):
            Dataset object (test or val). Modified in place.
        dialects (list[str]):
            List of dialects.

    Returns:
        dataset (Dataset):
            The same dataset object, with samples of each dialect added.

    Raises:
        AssertionError:
            If a dialect has no speakers of the underrepresented gender, or all
            of them have already been sampled.
    """
    n_samples_required = int(coral_length * dataset.frac * DIALECT_CRITERA)
    for dialect in dialects:
        # One row per speaker of this dialect, taken from the same dataframe that
        # `dataset.add_speaker_samples` looks speakers up in. This does not change
        # while speakers are being added, so it is computed once per dialect.
        df_dialect = get_speaker_df(df=dataset.df, dialect=dialect)

        # Remove rows that have nonbinary gender
        df_dialect = df_dialect[df_dialect["gender"] != "nonbinary"]

        while dataset.dialect_count[dialect] < n_samples_required:
            # Remove speakers of the gender that is most frequent in the dataset.
            most_frequent_gender = max(
                dataset.gender_count, key=dataset.gender_count.get
            )
            df_speaker = df_dialect[df_dialect["gender"] != most_frequent_gender]

            assert len(df_speaker) > 0, "No speakers left"

            # Scores are kept in a list rather than written back as a column:
            # `df_speaker` is a filtered slice, and assigning to it triggers
            # pandas' SettingWithCopyWarning.
            scores = df_speaker.apply(
                lambda x: give_score(
                    row=x,
                    age_group_weights=dataset.age_group_weights,
                    accent_weights=dataset.accent_weights,
                ),
                axis=1,
            ).tolist()
            speakers = df_speaker["id_speaker"].tolist()
            probs = get_probs(scores=scores)

            speaker = random_sample(samples=speakers, seen=seen_speakers, probs=probs)
            dataset.add_speaker_samples(speaker=speaker)
            seen_speakers.add(speaker)

    return dataset


# Fill the test split dialect by dialect, then display its counts.
test_dataset = get_dialect_samples(dataset=test_dataset, dialects=dialects)
test_dataset

### Make validation set

In [None]:
# Tracker for the validation split, targeting roughly VAL_FRAC of the full dataset.
val_dataset = Dataset(df=df, frac=VAL_FRAC)

Tilføj sidste nonbinary speaker.

In [None]:
df_nonbinary = df[df["gender"] == "nonbinary"]
speakers = get_speakers(df=df_nonbinary)
# Filter with a list comprehension instead of a set difference: a set of strings
# has no stable iteration order across kernel restarts, so the seeded draw below
# could otherwise pick a different speaker from run to run.
speakers = [candidate for candidate in speakers if candidate not in seen_speakers]

speaker = random_sample(samples=speakers, seen=seen_speakers, probs=None)
val_dataset.add_speaker_samples(speaker=speaker)
seen_speakers.add(speaker)

val_dataset

Tilføj dialekter (ingen speaker med Sydømål tilbage)

In [None]:
# Fill the validation split dialect by dialect, then display its counts.
# Speakers already placed in the test split are in `seen_speakers` and are skipped.
val_dataset = get_dialect_samples(dataset=val_dataset, dialects=dialects)
val_dataset

In [None]:
# Display the test split counts again, next to the validation split counts above.
test_dataset

### HF splits

In [None]:
test_indices = test_dataset.indices
val_indices = val_dataset.indices

# Every row that is not held out for test or validation goes to train. Sorting
# makes the row order of the train split explicit instead of leaving it to the
# iteration order of a set.
train_indices = sorted(
    set(range(len(coral))).difference(test_indices, val_indices)
)

hf_test_dataset = coral.select(indices=test_indices)
hf_val_dataset = coral.select(indices=val_indices)
hf_train_dataset = coral.select(indices=train_indices)

In [None]:
# The three splits must partition the dataset. Because the train indices are
# "everything except test and validation", this only holds when the test and
# validation indices are disjoint and contain no duplicates.
assert len(train_indices) + len(val_indices) + len(test_indices) == len(coral)
len(train_indices), len(val_indices), len(test_indices)