In [None]:
import pandas as pd
import numpy as np
import random
import pickle
import os

# Reproducibility: seed both the NumPy and the stdlib global generators up front.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

# Directory that holds the input files and receives the generated pickles.
DATA_PATH = "./data"
os.makedirs(DATA_PATH, exist_ok=True)

In [None]:
# Load the labelled phrases workbook. No header row is declared, so the
# columns receive pandas' default integer labels (0, 1, 2, ...).
df_original = pd.read_excel(
    os.path.join(DATA_PATH, "all_phrases.xlsx"),
    header=None,
    index_col=None,
)

In [None]:
def _select_phrases(frame, kind):
    """
    Return the rows of `frame` whose column 2 equals `kind`, without the three
    leading bookkeeping columns (0, 1, 2) and with a fresh 0..n-1 row index.

    The remaining column labels are left untouched, so they still start at 3.
    """
    return (
        frame[frame[2] == kind]
        .drop(columns=[0, 1, 2])
        .reset_index(drop=True)
    )


# Keep only the rows whose column 0 is empty: a value there marks a phrase
# that still has to be checked (per the original author's note).
df = df_original.copy()
df = df[df[0].isna()]

original_phrases = _select_phrases(df, "Original")

# Ground truths: one frame per variant. Later cells pair row i of each frame
# with row i of `original_phrases`, which assumes every variant is listed in
# the same order and the same number of times -- TODO confirm on new data.
all_female_phrases_gt = _select_phrases(df, "All Female")
all_male_phrases_gt = _select_phrases(df, "All Male")
female_sub_phrases_gt = _select_phrases(df, "Subject Female")
male_sub_phrases_gt = _select_phrases(df, "Subject Male")

# ===================


In [None]:
def change_words(original_phrase, ground_truth):
    """
    Overlay the ground-truth words on a copy of the original phrase.

    Every non-missing entry of `ground_truth` overwrites the entry with the
    same label in the copy; missing entries leave the original word in place.
    The input Series is not modified. Returns the merged words as a list.
    """
    merged = original_phrase.copy()

    for label, replacement in ground_truth.items():
        if pd.isna(replacement):
            # Nothing to substitute at this position.
            continue
        merged.loc[label] = replacement

    return list(merged)

In [None]:
def _merge_with_ground_truth(gt_frame):
    """
    Apply `change_words` to every original phrase and the same-numbered row of
    `gt_frame`; return one merged phrase per row as a DataFrame.
    """
    return pd.DataFrame(
        [
            change_words(original_phrases.loc[row], gt_frame.loc[row])
            for row in original_phrases.index
        ]
    )


# One full phrase table per variant (columns are renumbered from 0 because
# the rows are rebuilt from plain lists).
all_fem = _merge_with_ground_truth(all_female_phrases_gt)
all_male = _merge_with_ground_truth(all_male_phrases_gt)
subj_fem = _merge_with_ground_truth(female_sub_phrases_gt)
subj_male = _merge_with_ground_truth(male_sub_phrases_gt)

In [None]:
# Example of what the phrases look like, using the first row: the original
# words, the "All Female" ground-truth row, and the merged result.
example_rows = {
    "Original Phrase": original_phrases.loc[0],
    "+ Female All Ground Truth": all_female_phrases_gt.loc[0],
    "= Female Phrase": all_fem.loc[0],
}

example_df = pd.DataFrame(
    [list(row) for row in example_rows.values()],
    index=list(example_rows),
)

example_df

## Changes to the names and surnames
+ For the surnames, I removed the ones containing an apostrophe ("'"), such as O'Brien (which is why there are only 248 surnames instead of 250).
+ For both first names and surnames, I removed the entries ending in "s".

In [None]:
def _read_name_list(filename):
    """Read a headerless, newline-terminated text file from DATA_PATH into a DataFrame."""
    return pd.read_csv(
        os.path.join(DATA_PATH, filename),
        lineterminator="\n",
        header=None,
    )


female_names = _read_name_list('Top250Female1996-2019.txt')
male_names = _read_name_list('Top250Male1996-2019.txt')
surnames = _read_name_list('Top250Surnames1991-2000.txt')

print(female_names.shape,male_names.shape,surnames.shape)

In [None]:
# Turn the single-column frames into plain Python lists of usable names.
#
# Names ending in "s" are dropped -- presumably because the plural surname
# placeholder is rendered by appending "s" to the chosen name.
female_names = [name for name in female_names[0] if not name.endswith("s")]
male_names = [name for name in male_names[0] if not name.endswith("s")]

# Surnames must pass BOTH checks: no trailing "s" and no apostrophe.
# (Joining the two checks with `or` would let an apostrophe surname through
# even when it ends in "s".)
surnames = [
    name
    for name in surnames[0]
    if not name.endswith("s") and "'" not in name
]

print(len(female_names), len(male_names), len(surnames))

In [None]:
def _trailing_number(token):
    """
    Return the integer formed by the run of ASCII digits at the end of `token`.

    Raises ValueError when `token` does not end in a digit.
    """
    digits = token[len(token.rstrip("0123456789")):]
    if not digits:
        raise ValueError(f"placeholder {token!r} does not end in a number")
    return int(digits)


def change_names(phrase):
    """
    Replace "name_male_<n>", "name_female_<n>" and "surname_<n>" / "surname_<n>_pl"
    placeholders with entries of the module-level name lists.

    <n> is a 1-based position in the corresponding list (`male_names`,
    `female_names`, `surnames`) and may have several digits. The "_pl" form is
    the plural of the surname, rendered by appending "s". Because the lists are
    read at call time, the chosen names depend on their current order.

    Works on a copy: `phrase` itself is not modified. Returns the new Series.
    Raises ValueError for a placeholder without a trailing number and
    IndexError when <n> exceeds the length of the list.
    """
    p = phrase.copy()

    for word_idx in list(p.index):
        # str() so that missing cells (NaN) are simply skipped by the prefix tests.
        token = str(p.loc[word_idx])

        if token.startswith("name_male"):
            p.loc[word_idx] = male_names[_trailing_number(token) - 1]
        elif token.startswith("name_female"):
            p.loc[word_idx] = female_names[_trailing_number(token) - 1]
        elif token.startswith("surname"):
            if token.endswith("pl"):
                # Strip the plural marker ("_pl") before reading the number.
                stem = token[:-2].rstrip("_")
                p.loc[word_idx] = surnames[_trailing_number(stem) - 1] + "s"
            else:
                p.loc[word_idx] = surnames[_trailing_number(token) - 1]
    return p

In [None]:
final_all_fem, final_all_male, final_subj_fem, final_subj_male = [], [], [], []

# Shuffled in this fixed order before every phrase, so the random sequence
# (and hence the names picked) is reproducible for a given seed.
name_pools = (female_names, male_names, surnames)

# (source table, destination list) pairs; all four variants of a phrase are
# filled in from the same shuffle, so they share the same names.
variant_targets = (
    (all_fem, final_all_fem),
    (all_male, final_all_male),
    (subj_fem, final_subj_fem),
    (subj_male, final_subj_male),
)

for idx in all_fem.index:
    for pool in name_pools:
        random.shuffle(pool)

    for source, destination in variant_targets:
        destination.append(change_names(source.iloc[idx]))

In [None]:
# Turn each list of per-phrase Series into a DataFrame (one row per phrase).
final_all_fem, final_all_male, final_subj_fem, final_subj_male = [
    pd.DataFrame(rows)
    for rows in (final_all_fem, final_all_male, final_subj_fem, final_subj_male)
]

In [None]:
def create_train_val_datasets(final_male, final_female, frac=0.8, seed=1234):
    """
    Split paired male/female phrase tables into a training and a validation part.

    The global `random` generator is re-seeded with `seed` on every call, so
    two calls on tables of the same length produce the same split.

    Parameters
    ----------
    final_male, final_female : pd.DataFrame
        Row-aligned tables (row i of both is the same phrase). Index labels are
        used interchangeably with row positions, so a default 0..n-1 index is
        assumed.
    frac : float, default 0.8
        Fraction of the rows used for training.
    seed : int, default 1234
        Seed passed to `random.seed`.

    Returns
    -------
    training_df : pd.DataFrame
        One row per training phrase, sorted by index: a random half taken from
        `final_male` (target 1), the rest from `final_female` (target 0).
    df_validation_male, df_validation_female : pd.DataFrame
        The validation rows of each table with a float "target" column
        (1.0 for male, 0.0 for female).
    val_idx, train_idx : list
        Row numbers of the validation rows (ascending) and of the training
        rows (in sampling order).
    """
    random.seed(seed)

    # Draw the training rows; everything not drawn is validation.
    indexes = list(final_male.index)
    train_idx = random.sample(indexes, k=int(len(indexes) * frac))
    train_lookup = set(train_idx)  # O(1) membership instead of scanning the list
    val_idx = [i for i in range(len(final_male)) if i not in train_lookup]

    # Training candidates from both tables (same rows, same order).
    df_training_male = final_male.iloc[train_idx]
    df_training_female = final_female.iloc[train_idx]

    # Half of the training phrases are taken in their male form,
    # the remaining ones in their female form.
    indexes_male = random.sample(
        list(df_training_male.index), k=int(len(df_training_male.index) / 2)
    )
    male_lookup = set(indexes_male)
    indexes_female = [i for i in train_idx if i not in male_lookup]

    df_male = df_training_male.loc[indexes_male]
    df_female = df_training_female.loc[indexes_female]

    df_male.insert(0, "target", 1)
    df_female.insert(0, "target", 0)

    training_df = pd.concat([df_male, df_female]).sort_index()

    # Validation keeps BOTH forms of every held-out phrase.
    df_validation_male = final_male.iloc[val_idx]
    df_validation_female = final_female.iloc[val_idx]

    df_validation_male.insert(0, "target", np.ones(len(df_validation_male)))
    df_validation_female.insert(0, "target", np.zeros(len(df_validation_female)))

    return training_df, df_validation_male, df_validation_female, val_idx, train_idx

In [None]:
# Train/validation split for the "all" variants.
split_all = create_train_val_datasets(final_all_male, final_all_fem)
training_df_all = split_all[0]
df_validation_male_all = split_all[1]
df_validation_female_all = split_all[2]
val_idx_all = split_all[3]
train_idx_all = split_all[4]

# Train/validation split for the "subject" variants.
split_subj = create_train_val_datasets(final_subj_male, final_subj_fem)
training_df_subj = split_subj[0]
df_validation_male_subj = split_subj[1]
df_validation_female_subj = split_subj[2]
val_idx_subj = split_subj[3]
train_idx_subj = split_subj[4]

In [None]:
# Output file name -> dataset to pickle into DATA_PATH.
files = {
    "training_df_all.pkl": training_df_all,
    "df_validation_male_all.pkl": df_validation_male_all,
    "df_validation_female_all.pkl": df_validation_female_all,
    "training_df_subj.pkl": training_df_subj,
    "df_validation_male_subj.pkl": df_validation_male_subj,
    "df_validation_female_subj.pkl": df_validation_female_subj
}

# The loop variable is deliberately not called `df`: at notebook level that
# would overwrite the filtered phrase table of the same name.
for filename, dataset in files.items():
    with open(os.path.join(DATA_PATH, filename), "wb") as f:
        pickle.dump(dataset, f)

In [None]:
def create_gt(original_phrases, female_gt, male_gt):
    """
    Build a 0/1 mask of the word positions where the female and the male
    ground truth differ.

    Parameters
    ----------
    original_phrases : sized
        Only its length is used: the first `len(original_phrases)` rows are
        compared; any further rows of the mask stay 0.
    female_gt, male_gt : pd.DataFrame
        Ground-truth tables of identical shape, compared cell by cell by
        position (their column labels are ignored).

    Returns
    -------
    pd.DataFrame
        Float frame with the shape of `female_gt` and default 0..n-1 row and
        column labels; cell (i, k) is 1.0 when the k-th word column of row i
        differs between the two tables, else 0.0.

    Raises
    ------
    ValueError
        If `female_gt` and `male_gt` have different shapes.
    IndexError
        If `original_phrases` is longer than the ground-truth tables.
    """
    female_words = female_gt.to_numpy(dtype=object)
    male_words = male_gt.to_numpy(dtype=object)
    if female_words.shape != male_words.shape:
        raise ValueError(
            f"female_gt {female_words.shape} and male_gt {male_words.shape} must have the same shape"
        )

    # Filled through a plain array: writing via `frame.iloc[i][k] = 1` is
    # chained assignment and is not guaranteed to reach the frame.
    mask = np.zeros(female_words.shape)

    for row in range(len(original_phrases)):
        for col in range(female_words.shape[1]):
            # Compared as strings so two missing cells ("nan" vs "nan") are equal.
            if str(female_words[row, col]) != str(male_words[row, col]):
                mask[row, col] = 1

    return pd.DataFrame(mask)


# Ground truth for change type "all", split with the same row numbers as the
# corresponding train/validation datasets.
gt_all = create_gt(original_phrases, all_female_phrases_gt, all_male_phrases_gt)
gt_all_val, gt_all_train = gt_all.loc[val_idx_all], gt_all.loc[train_idx_all]


# Ground truth for change type "subj", split the same way.
gt_subj = create_gt(original_phrases, female_sub_phrases_gt, male_sub_phrases_gt)
gt_subj_val, gt_subj_train = gt_subj.loc[val_idx_subj], gt_subj.loc[train_idx_subj]

In [None]:
# Output file name -> ground-truth mask to pickle into DATA_PATH.
gt_files = {
    "gt_subj_val.pkl": gt_subj_val,
    "gt_subj_train.pkl": gt_subj_train,
    "gt_all_val.pkl": gt_all_val,
    "gt_all_train.pkl": gt_all_train
}

# The loop variable is deliberately not called `df`: at notebook level that
# would overwrite the filtered phrase table of the same name.
for filename, ground_truth in gt_files.items():
    with open(os.path.join(DATA_PATH, filename), "wb") as f:
        pickle.dump(ground_truth, f)