In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import os

# Seed both the stdlib and the NumPy generators so repeated runs give the same data.
SEED = 1234
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

# Input files are read from DATA_PATH; the generated pickles are written to OUTPUT_PATH.
DATA_PATH = "./data"
OUTPUT_PATH = "../raw"
for directory in (DATA_PATH, OUTPUT_PATH):
    os.makedirs(directory, exist_ok=True)

In [2]:
# Start from the labeled phrases
# Raw read of the labelled workbook, without header row or index column.
# NOTE(review): load_excel() below reads the same file again, and `df_original`
# is not referenced anywhere else in the visible notebook.
df_original = pd.read_excel(os.path.join(DATA_PATH, "all_phrases.xlsx"), index_col=None, header=None)

  warn(msg)


In [3]:
from dataclasses import dataclass

@dataclass
class Phrases:
    """Token tables extracted from one labelled workbook.

    Instances are built by ``load_excel``. Each of the first five fields holds
    the rows carrying one label in column 2 of the sheet, with the three
    leading columns (0, 1, 2) removed and the rows renumbered from zero.
    """

    # Rows labelled "Original".
    original: pd.DataFrame
    # Rows labelled "All Female".
    all_female: pd.DataFrame
    # Rows labelled "All Male".
    all_male: pd.DataFrame
    # Rows labelled "Subject Female".
    subj_female: pd.DataFrame
    # Rows labelled "Subject Male".
    subj_male: pd.DataFrame
    # Column 0 of the "Original" rows; later used as positional row numbers into
    # the main phrase table.
    # NOTE(review): load_excel passes a single column (a Series) here, not a DataFrame.
    idx: pd.DataFrame = None
    # Column 1 of the "Original" rows; later compared against annotator names.
    # NOTE(review): also a Series in practice.
    resp: pd.DataFrame = None

def filter_row_drop(df: pd.DataFrame, key: str):
    """Return the rows labelled ``key`` without the three leading columns.

    Rows are selected where column ``2`` equals ``key``. Columns ``0``, ``1``
    and ``2`` are then removed and the remaining rows are renumbered from zero.
    """
    matching = df.loc[df[2] == key]
    tokens_only = matching.drop(columns=[0, 1, 2])
    renumbered = tokens_only.reset_index()
    return renumbered.drop(columns=["index"])

def filter_row(df: pd.DataFrame, key: str):
    """Return the rows labelled ``key`` with every column kept.

    Rows are selected where column ``2`` equals ``key`` and are renumbered
    from zero; unlike ``filter_row_drop`` no column is removed.
    """
    matching = df.loc[df[2] == key]
    renumbered = matching.reset_index()
    return renumbered.drop(columns=["index"])

def load_excel(path: str, new_format: bool):
    """Read a labelled workbook and split it into a ``Phrases`` bundle.

    Parameters
    ----------
    path : str
        Location of the Excel file; it is read without header row or index column.
    new_format : bool
        If true, the row with index label ``0`` (the first row) is dropped.
        Otherwise only rows whose column ``0`` is empty are kept; according to
        the original author's note, rows with a value there are phrases that
        still have to be checked.

    Returns
    -------
    Phrases
        One token table per label, plus columns ``0`` and ``1`` of the
        "Original" rows as ``idx`` and ``resp``.
    """
    sheet = pd.read_excel(path, index_col=None, header=None)

    if not new_format:
        # Keep only the rows that are not flagged in column 0.
        sheet = sheet[sheet[0].isna()]
    else:
        # Drop first row
        sheet = sheet.drop([0], axis=0)

    label_of_field = (
        ("original", "Original"),
        ("all_female", "All Female"),
        ("all_male", "All Male"),
        ("subj_female", "Subject Female"),
        ("subj_male", "Subject Male"),
    )
    token_tables = {
        field: filter_row_drop(sheet, label) for field, label in label_of_field
    }

    # The metadata columns only survive in the unstripped "Original" rows.
    originals_with_meta = filter_row(sheet, "Original")

    return Phrases(
        idx=originals_with_meta[0],
        resp=originals_with_meta[1],
        **token_tables,
    )

# Parse the main workbook in the old layout: rows with a value in column 0 are
# discarded by load_excel before the per-label tables are built.
phrases = load_excel(os.path.join(DATA_PATH, "all_phrases.xlsx"), new_format=False)

  warn(msg)


In [4]:
# Fix labelling errors

# Annotator name -> path of that annotator's re-labelled workbook.
new_dfs = {
    "Rick": "./data/newly_labelled/intersection_template_rick.xlsx",
    "Hjalmar": "./data/newly_labelled/intersection_template_hjalmar.xlsx",
    "Artur": "./data/newly_labelled/intersection_template_Artur_v2.xlsx",
}

# Replace every path by the parsed Phrases bundle (new layout: first row dropped).
# After this loop the dict values are Phrases objects, no longer strings.
for resp in new_dfs.keys():
    new_dfs[resp] = load_excel(new_dfs[resp], new_format=True)

# Snapshot of the "Original" rows of Rick's workbook, taken before
# `base.original` is overwritten below. flip_if_needed() reads this global.
all_male_sentences = new_dfs["Rick"].original.copy()

# Merge the new data
# `base` is the very same object as new_dfs["Rick"], so it is modified in place.
base = new_dfs["Rick"]

# For every other annotator, overwrite the rows assigned to them (resp == name)
# with the labels from their own workbook.
# NOTE(review): assigning a DataFrame through a boolean row mask relies on
# pandas aligning both sides on their index labels - confirm the workbooks
# share the same row order.
for resp in ["Hjalmar", "Artur"]:
    base.all_female[base.resp == resp] = new_dfs[resp].all_female[
        new_dfs[resp].resp == resp
    ]
    base.all_male[base.resp == resp] = new_dfs[resp].all_male[
        new_dfs[resp].resp == resp
    ]
    base.subj_female[base.resp == resp] = new_dfs[resp].subj_female[
        new_dfs[resp].resp == resp
    ]
    base.subj_male[base.resp == resp] = new_dfs[resp].subj_male[
        new_dfs[resp].resp == resp
    ]

# Replace with the "real" original sentences, not the all_male one
# `base.idx` is used as positional row numbers into the main phrase table.
base.original = phrases.original.iloc[base.idx]

def flip_if_needed(
    original_phrases: pd.DataFrame,
    labelling_female: pd.DataFrame,
    labelling_male: pd.DataFrame,
    male_sentences: pd.DataFrame = None,
):
    """Fill in the male labelling wherever the original token already is the female one.

    The three tables are walked cell by cell, by position. When the female
    labelling of a cell is present and identical to the original token, the
    same cell of ``labelling_male`` is overwritten with the token found at
    that position in ``male_sentences``.

    Parameters
    ----------
    original_phrases : pd.DataFrame
        Original tokens; its shape defines the cells that are visited.
    labelling_female : pd.DataFrame
        Female labelling; missing cells (``None`` / NaN) are skipped.
    labelling_male : pd.DataFrame
        Male labelling, **modified in place**.
    male_sentences : pd.DataFrame, optional
        Source of the replacement tokens. When omitted, the notebook-level
        ``all_male_sentences`` table is used, which keeps existing calls working.

    Returns
    -------
    None
    """
    if male_sentences is None:
        # Backward-compatible fallback to the table defined earlier in the cell.
        male_sentences = all_male_sentences

    n_rows, n_cols = original_phrases.shape
    for i in range(n_rows):
        for j in range(n_cols):
            female_token = labelling_female.iloc[i, j]
            # pd.notna covers both None and NaN; empty Excel cells arrive as NaN.
            if pd.notna(female_token) and original_phrases.iloc[i, j] == female_token:
                labelling_male.iloc[i, j] = male_sentences.iloc[i, j]

# Patch the male labellings in place, once per change type ("all" and "subj").
flip_if_needed(
    base.original,
    base.all_female,
    base.all_male
)

flip_if_needed(
    base.original,
    base.subj_female,
    base.subj_male
)

# Set the index to the original phrases
# After this the row labels of the corrected tables are the values of `base.idx`.
base.all_female.set_index(base.idx, inplace=True)
base.all_male.set_index(base.idx, inplace=True)
base.subj_female.set_index(base.idx, inplace=True)
base.subj_male.set_index(base.idx, inplace=True)

# Replace the original phrases with the new ones
# NOTE(review): the right-hand side is a DataFrame, so the result depends on how
# pandas aligns its index/columns with the selected rows - confirm that the
# corrected rows land where expected (the spot check below looks at one row).
phrases.all_female.iloc[base.idx.to_list()] = base.all_female
phrases.all_male.iloc[base.idx.to_list()] = base.all_male
phrases.subj_female.iloc[base.idx.to_list()] = base.subj_female
phrases.subj_male.iloc[base.idx.to_list()] = base.subj_male

# Spot check: show one row of every table side by side (one column per table).
idx = 1275
test_df = pd.DataFrame(
    {
        "original": phrases.original.iloc[idx],
        "all_female": phrases.all_female.iloc[idx],
        "all_male": phrases.all_male.iloc[idx],
        "subj_female": phrases.subj_female.iloc[idx],
        "subj_male": phrases.subj_male.iloc[idx],
    }
)
test_df

Unnamed: 0,original,all_female,all_male,subj_female,subj_male
3,On,,,,
4,the,,,,
5,other,,,,
6,hand,,,,
7,",",,,,
8,Oliver,name_female_1,name_male_1,name_female_1,name_male_1
9,proves,,,,
10,to,,,,
11,be,,,,
12,of,,,,


# ===================


In [5]:
def change_words(original_phrase, ground_truth):
    """
    Overlay the ground-truth tokens on a copy of the original phrase.

    Every entry of ``ground_truth`` that is not missing replaces the token
    with the same label in the copy. The result is returned as a plain list;
    ``original_phrase`` itself is left untouched.
    """
    merged = original_phrase.copy()

    # Only the labelled positions carry a replacement token.
    labelled = ground_truth[ground_truth.notna()]
    for label, token in labelled.items():
        merged.loc[label] = token

    return list(merged)

In [6]:
def _overlay_on_originals(ground_truth_table):
    """Apply ``change_words`` to every original phrase and its ground-truth row."""
    return [
        change_words(phrases.original.loc[row], ground_truth_table.loc[row])
        for row in list(phrases.original.index)
    ]


# One substituted table per change type and gender.
all_fem = pd.DataFrame(_overlay_on_originals(phrases.all_female))
all_male = pd.DataFrame(_overlay_on_originals(phrases.all_male))
subj_fem = pd.DataFrame(_overlay_on_originals(phrases.subj_female))
subj_male = pd.DataFrame(_overlay_on_originals(phrases.subj_male))

In [7]:
# Example of what the phrases look like

example_rows = {
    "Original Phrase": phrases.original.loc[0],
    "+ Female All Ground Truth": phrases.all_female.loc[0],
    "= Female Phrase": all_fem.loc[0],
}
example_df = pd.DataFrame(
    [list(row) for row in example_rows.values()],
    index=list(example_rows),
)

example_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
Original Phrase,Shortly,afterwards,",",he,visits,Netherfield,",",Mr.,Bingley,'s,...,.,,,,,,,,,
+ Female All Ground Truth,,,,she,,,,Mrs.,surname_1,,...,,,,,,,,,,
= Female Phrase,Shortly,afterwards,",",she,visits,Netherfield,",",Mrs.,surname_1,'s,...,.,,,,,,,,,


## Changes to the names and surnames
+ For the surnames, I deleted the ones containing an apostrophe (`'`), such as O'Brien (that is why there are only 248 surnames and not 250).
+ For both first names and surnames, I deleted the ones ending in "s".

In [8]:
def _read_name_list(filename):
    """Read a header-less name file from DATA_PATH into a DataFrame."""
    return pd.read_csv(
        os.path.join(DATA_PATH, filename), lineterminator="\n", header=None
    )


female_names = _read_name_list('Top250Female1996-2019.txt')
male_names = _read_name_list('Top250Male1996-2019.txt')
surnames = _read_name_list('Top250Surnames1991-2000.txt')

print(female_names.shape,male_names.shape,surnames.shape)

(250, 1) (250, 1) (248, 1)


In [9]:
def _filter_names(names, drop_apostrophes=False):
    """Return the names usable as placeholders, as a plain list.

    Names ending in "s" are removed. With ``drop_apostrophes=True`` names
    containing an apostrophe are removed as well.
    """
    return [
        name
        for name in names
        if not name.endswith("s") and not (drop_apostrophes and "'" in name)
    ]


# Column 0 of each frame holds the names read from the text files. From here on
# the three variables are plain Python lists, which is what random.shuffle and
# change_names expect.
female_names = _filter_names(female_names[0])
male_names = _filter_names(male_names[0])
# Surnames with an apostrophe are excluded too, as stated in the notes above.
# The earlier condition (`... or name.find("'") != -1`) kept such names instead.
# If the file really contains no apostrophes, the result is unchanged.
surnames = _filter_names(surnames[0], drop_apostrophes=True)

print(len(female_names), len(male_names), len(surnames))

245 231 193


In [10]:
def change_names(phrase, male_pool=None, female_pool=None, surname_pool=None):
    """
    Replace "name_male_1", "name_female_1" and "surname_1" / "surname_1_pl" with names from the lists.

    The number is the 1-based position of the name in the corresponding list
    and may have several digits ("name_male_12"). "surname_1_pl" is the plural
    form of the surname: the surname followed by "s".

    Parameters
    ----------
    phrase : pd.Series
        Tokens of one sentence; it is copied, never modified.
    male_pool, female_pool, surname_pool : list of str, optional
        Name lists to draw from. When omitted, the notebook-level
        ``male_names``, ``female_names`` and ``surnames`` lists are used, so
        existing calls keep working.

    Returns
    -------
    pd.Series
        Copy of ``phrase`` with every placeholder replaced. Tokens that are
        not strings, or that merely start like a placeholder without carrying
        a number (e.g. the ordinary word "surnames"), are left unchanged.

    Raises
    ------
    IndexError
        If a placeholder number exceeds the length of its list.
    """
    male_pool = male_names if male_pool is None else male_pool
    female_pool = female_names if female_pool is None else female_pool
    surname_pool = surnames if surname_pool is None else surname_pool

    p = phrase.copy()

    for word_idx in list(p.index):
        token = p.loc[word_idx]
        if not isinstance(token, str):
            # Missing values (None / NaN) are never placeholders.
            continue

        if token.startswith("name_male"):
            pool, may_be_plural = male_pool, False
        elif token.startswith("name_female"):
            pool, may_be_plural = female_pool, False
        elif token.startswith("surname"):
            pool, may_be_plural = surname_pool, True
        else:
            continue

        pieces = token.split("_")
        plural = may_be_plural and pieces[-1] == "pl"
        # Read the whole number, not just its last character.
        number = pieces[-2] if plural else pieces[-1]
        if not number.isdecimal():
            continue

        replacement = pool[int(number) - 1]
        p.loc[word_idx] = replacement + "s" if plural else replacement
    return p

In [11]:
final_all_fem, final_all_male, final_subj_fem, final_subj_male = [], [], [], []

# Each template table is paired with the list collecting its filled-in rows.
template_and_output = (
    (all_fem, final_all_fem),
    (all_male, final_all_male),
    (subj_fem, final_subj_fem),
    (subj_male, final_subj_male),
)

for row in all_fem.index:
    # Reshuffle once per sentence, so the four variants of a sentence draw from
    # the same ordering while different sentences get different names.
    for name_pool in (female_names, male_names, surnames):
        random.shuffle(name_pool)

    for template, filled_rows in template_and_output:
        filled_rows.append(change_names(template.iloc[row]))

In [12]:
# Turn the four lists of filled-in phrases into DataFrames (one row per sentence).
(final_all_fem, final_all_male, final_subj_fem, final_subj_male) = (
    pd.DataFrame(filled_rows)
    for filled_rows in (final_all_fem, final_all_male, final_subj_fem, final_subj_male)
)

In [13]:
def create_train_val_datasets(final_male, final_female, frac=0.8, seed=1234):
    """Split the male/female sentence tables into training and validation sets.

    Both tables are split on the same rows. Every returned frame gets two
    leading columns: ``target`` (1 for male, 0 for female) and
    ``sentence_idx`` (running number of the sentence within its split).

    Parameters
    ----------
    final_male, final_female : pd.DataFrame
        Sentence tables with one row per sentence. Rows are drawn from
        ``final_male.index`` and then used as positions, so both frames are
        expected to carry the default ``0..n-1`` index.
    frac : float, default 0.8
        Fraction of the rows that goes into the training set.
    seed : int, default 1234
        Seed for the global ``random`` generator. It is re-applied on every
        call, so calls with equally sized inputs select the same rows.

    Returns
    -------
    tuple
        ``(training_df, df_validation_male, df_validation_female, val_idx,
        train_idx)``. ``training_df`` is the male training rows followed by the
        female ones.

    Notes
    -----
    ``target`` is an integer column in the training frame and a float column
    in the validation frames; this is kept as is because the saved files are
    consumed elsewhere.
    """
    random.seed(seed)

    # creating training and validation indexes
    indexes = list(final_male.index)
    train_idx = random.sample(indexes, k=int(len(indexes) * frac))
    in_training = set(train_idx)  # constant-time membership test
    val_idx = [i for i in range(len(final_male)) if i not in in_training]

    def _labelled(frame, rows, target):
        """Select ``rows`` and prepend the ``target`` / ``sentence_idx`` columns."""
        subset = frame.iloc[rows].copy()
        subset.insert(0, "target", target)
        subset.insert(1, "sentence_idx", np.arange(len(rows)))
        return subset

    # training dataset: male rows first, then the female rows
    training_df = pd.concat(
        [
            _labelled(final_male, train_idx, 1),
            _labelled(final_female, train_idx, 0),
        ]
    )

    # validation dataset: kept as two separate frames
    df_validation_male = _labelled(final_male, val_idx, np.ones(len(val_idx)))
    df_validation_female = _labelled(final_female, val_idx, np.zeros(len(val_idx)))

    return training_df, df_validation_male, df_validation_female, val_idx, train_idx

In [14]:
# Change type "all": every gendered word is swapped.
split_all = create_train_val_datasets(final_all_male, final_all_fem)
training_df_all, df_validation_male_all, df_validation_female_all = split_all[:3]
val_idx_all, train_idx_all = split_all[3:]

# Change type "subj": only the subject is swapped.
split_subj = create_train_val_datasets(final_subj_male, final_subj_fem)
training_df_subj, df_validation_male_subj, df_validation_female_subj = split_subj[:3]
val_idx_subj, train_idx_subj = split_subj[3:]

In [15]:
# Training and validation frames to persist, keyed by output file name.
files = dict(
    [
        ("training_df_all.pkl", training_df_all),
        ("df_validation_male_all.pkl", df_validation_male_all),
        ("df_validation_female_all.pkl", df_validation_female_all),
        ("training_df_subj.pkl", training_df_subj),
        ("df_validation_male_subj.pkl", df_validation_male_subj),
        ("df_validation_female_subj.pkl", df_validation_female_subj),
    ]
)

for filename in files:
    target_path = os.path.join(OUTPUT_PATH, filename)
    with open(target_path, "wb") as f:
        pickle.dump(files[filename], f)

In [16]:
def create_gt(original_phrases, female_gt, male_gt):
    """Mark the token positions where the female and male labellings differ.

    Parameters
    ----------
    original_phrases : pd.DataFrame
        Only its number of rows is used: it bounds the rows that are compared.
    female_gt, male_gt : pd.DataFrame
        Labelling tables with the same layout. Cells are compared by position
        after conversion with ``str``, so two missing cells count as equal.

    Returns
    -------
    pd.DataFrame
        Float table with the shape of ``female_gt`` and default row/column
        labels; a cell is 1.0 where the two labellings differ and 0.0 otherwise.
    """
    female_tokens = female_gt.to_numpy(dtype=object)
    male_tokens = male_gt.to_numpy(dtype=object)

    # The shape comes from the arguments rather than from a notebook global.
    flags = np.zeros(female_gt.shape)

    for i in range(len(original_phrases)):
        for j in range(female_tokens.shape[1]):
            if str(female_tokens[i, j]) != str(male_tokens[i, j]):
                # Write into the array directly: a chained `frame.iloc[i][j] = 1`
                # may update a temporary copy and leave the result all zeros.
                flags[i, j] = 1

    return pd.DataFrame(flags)


def _split_ground_truth(gt, train_rows, val_rows):
    """Return ``(train, val)`` parts of ``gt``; the train part is stacked twice.

    The duplication matches the training set, which holds the male rows
    followed by the female rows.
    """
    train_part = gt.loc[train_rows]
    return pd.concat([train_part, train_part]), gt.loc[val_rows]


# Ground truth for change type "all"
gt_all = create_gt(phrases.original, phrases.all_female, phrases.all_male)
gt_all_train, gt_all_val = _split_ground_truth(gt_all, train_idx_all, val_idx_all)

# Ground truth for change type "subj"
gt_subj = create_gt(phrases.original, phrases.subj_female, phrases.subj_male)
gt_subj_train, gt_subj_val = _split_ground_truth(gt_subj, train_idx_subj, val_idx_subj)

In [17]:
# Ground-truth tables to persist, keyed by output file name.
gt_files = dict(
    [
        ("gt_subj_val.pkl", gt_subj_val),
        ("gt_subj_train.pkl", gt_subj_train),
        ("gt_all_val.pkl", gt_all_val),
        ("gt_all_train.pkl", gt_all_train),
    ]
)

for filename in gt_files:
    target_path = os.path.join(OUTPUT_PATH, filename)
    with open(target_path, "wb") as f:
        pickle.dump(gt_files[filename], f)


In [18]:
# Sanity check

# Boolean mask over the validation sentences whose token in column 1 is "highlight".
idx = df_validation_male_all[1] == "highlight"
# NOTE(review): applying the same mask to gt_all_val assumes it carries the same
# row labels as the validation frames (both are selected with val_idx_all) - confirm.
cur_gt = gt_all_val[idx].iloc[0]
# First matching sentence; [2:] skips the two inserted leading columns
# ("target" and "sentence_idx") so that only the tokens remain.
cur_sentence_male = df_validation_male_all[idx].iloc[0][2:]
cur_sentence_female = df_validation_female_all[idx].iloc[0][2:]

# One row per token: male variant, female variant and the ground-truth flag.
pd.DataFrame([
    cur_sentence_male,
    cur_sentence_female,
    cur_gt
], index=["Male", "Female", "Ground Truth"]).T

Unnamed: 0,Male,Female,Ground Truth
0,To,To,0.0
1,highlight,highlight,0.0
2,the,the,0.0
3,hypocrisy,hypocrisy,0.0
4,required,required,0.0
5,to,to,0.0
6,condone,condone,0.0
7,slavery,slavery,0.0
8,within,within,0.0
9,an,an,0.0
