In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import random
import os

# Seed both Python's built-in RNG and NumPy's global RNG with the same value.
# NOTE(review): presumably the dataset construction in `labelling` draws from
# these global RNGs; confirm, otherwise the seeds have no effect on the output.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Directory the surname list is read from.
DATA_PATH = "./data"
# Root directory under which the exported dataset folders are created.
OUTPUT_PATH = "../dataset"

In [3]:
from labelling import Labelling

# Location of the labelling file, relative to the notebook's working directory.
data_path = "./labelling.json"
# Build the Labelling object from that file.
labelling = Labelling(data_path)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/hjal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/hjal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Changes to the names and surnames
+ For the surnames, I deleted the ones that contain an apostrophe ("'"), such as O'Brien (that's why there are only 248 surnames and not 250).
+ For both names and surnames, I deleted the entries that end in "s".

In [4]:
# Load the surname list: one surname per line, no header row.
surnames_raw = pd.read_csv(
    os.path.join(DATA_PATH, 'Top250Surnames1991-2000.txt'),
    lineterminator="\n",
    header=None,
    # Keep every entry as text; by default pandas would turn values such as
    # "NA" or "Null" into NaN, which would then break the string checks below.
    keep_default_na=False,
)

# Keep a surname only if it neither ends in "s" nor contains an apostrophe,
# matching the rules described in the markdown cell above.
# The previous condition (`name[-1] != "s" or name.find("'") != -1`) kept
# apostrophe names instead of excluding them.
surnames = [
    name
    for name in surnames_raw[0]
    if not name.endswith("s") and "'" not in name
]

In [5]:
# Build the train and test splits from the labelled sentences and the filtered
# surnames. Each split is later consumed by `save_dataset`, which iterates it
# with `.items()` (version key -> iterable of entries exposing `model_dump()`).
train_ds, test_ds = labelling.to_dataset(surnames)

In [6]:
import json
def to_jsonl_file(output_file: str, lines: list[dict]):
    """Write a list of dicts to ``output_file`` in JSON Lines format.

    Each dict is serialised with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line, followed by a newline. An existing file at
    ``output_file`` is overwritten; an empty ``lines`` produces an empty file.

    Args:
        output_file: Path of the file to create or overwrite.
        lines: JSON-serialisable dicts, one per output line.

    Raises:
        TypeError: If an entry is not JSON-serialisable (from ``json.dumps``).
        OSError: If the file cannot be opened for writing.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding is
    # pinned to UTF-8 instead of relying on the platform's default encoding
    # (which may be unable to encode them, or produce a non-UTF-8 file).
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(
            json.dumps(line, ensure_ascii=False) + '\n' for line in lines
        )

def save_dataset(train_ds, test_ds, version_mapping):
    """Export the train and test splits as JSONL files under ``OUTPUT_PATH``.

    For every version key in a split, the key is translated to a folder name
    through ``version_mapping``, the folder ``OUTPUT_PATH/<folder>`` is created
    if missing, and the entries are written to ``train.jsonl`` or
    ``test.jsonl`` inside it.

    Args:
        train_ds: Mapping of version key -> iterable of entries for training.
        test_ds: Mapping of version key -> iterable of entries for testing.
        version_mapping: Mapping of version key -> output folder name.

    Raises:
        KeyError: If a version key of a split is missing from ``version_mapping``.
    """
    splits = (("train", train_ds), ("test", test_ds))
    for split_name, split in splits:
        for version_key, entries in split.items():
            folder_name = version_mapping[version_key]
            target_dir = os.path.join(OUTPUT_PATH, folder_name)
            os.makedirs(target_dir, exist_ok=True)

            target_file = os.path.join(target_dir, f"{split_name}.jsonl")
            # Each entry is converted to a plain dict via its model_dump() method.
            records = []
            for entry in entries:
                records.append(entry.model_dump())
            to_jsonl_file(target_file, records)

# Export the splits built above, before the "neutral" gender label is removed
# further down. Keys are the version keys of the splits; values are the output
# folder names created under OUTPUT_PATH.
save_dataset(train_ds, test_ds, {
    "all": "non_binary_gender_all",
    "subj": "non_binary_gender_subj"
})

In [7]:
# Create binary version
#
# Drop the "neutral" entry from the gender mapping of every version of every
# sentence. This mutates `labelling` in place, so this cell must run after the
# non-binary export above.
# NOTE(review): re-running the earlier `to_dataset` cell after this one would
# presumably no longer see the "neutral" label; confirm against Labelling.

for sentence in labelling:
    for labelled_version in sentence.versions.values():
        labelled_version.gender.pop("neutral", None)

# Rebuild the splits from the stripped labelling and export them.
train_ds_binary, test_ds_binary = labelling.to_dataset(surnames)

binary_version_mapping = {
    "all": "binary_gender_all",
    "subj": "binary_gender_subj"
}
save_dataset(train_ds_binary, test_ds_binary, binary_version_mapping)