In [1]:
import os
import random
from typing import Optional

import numpy as np
import pandas as pd

# Single seed shared by Python's `random` module and NumPy's global generator,
# so anything drawn from either source repeats on a fresh kernel run.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

In [2]:
from dataclasses import dataclass

@dataclass
class Phrases:
    """Parallel views of one labelled workbook, one frame per sentence version.

    ``load_excel`` fills every field by selecting the rows whose column ``2``
    carries the matching label ("Original", "All Female", "All Male",
    "Subject Female", "Subject Male"). The five frames hold the remaining
    columns of those rows after columns 0-2 are removed.
    """

    # Rows labelled "Original".
    original: pd.DataFrame
    # Rows labelled "All Female" / "All Male".
    all_female: pd.DataFrame
    all_male: pd.DataFrame
    # Rows labelled "Subject Female" / "Subject Male".
    subj_female: pd.DataFrame
    subj_male: pd.DataFrame
    # Column 0 of the "Original" rows (a single column, hence a Series).
    # Later cells use it as row positions into the baseline phrases.
    idx: Optional[pd.Series] = None
    # Column 1 of the "Original" rows; later cells compare it with annotator names.
    resp: Optional[pd.Series] = None

def filter_row_drop(df: pd.DataFrame, key: str):
    """Select the rows labelled ``key`` and strip the three leading columns.

    Args:
        df: Frame read with integer column labels; column ``2`` holds the
            row label that is compared against ``key``.
        key: Label to keep (exact equality).

    Returns:
        A new frame with the matching rows, columns ``0``, ``1`` and ``2``
        removed, and a fresh 0..n-1 index.
    """
    return (
        df[df[2] == key]
        .drop(columns=[0, 1, 2])
        # drop=True discards the old index directly instead of materialising it
        # as an "index" column and deleting that again, which broke as soon as
        # the index had a name (the helper column is then not called "index").
        .reset_index(drop=True)
    )

def filter_row(df: pd.DataFrame, key: str):
    """Select the rows labelled ``key``, keeping every column.

    Args:
        df: Frame read with integer column labels; column ``2`` holds the
            row label that is compared against ``key``.
        key: Label to keep (exact equality).

    Returns:
        A new frame with the matching rows and a fresh 0..n-1 index.
    """
    # drop=True avoids the temporary "index" column, so a named index no longer
    # triggers a KeyError and a genuine column called "index" is not removed.
    return df[df[2] == key].reset_index(drop=True)

def load_excel(path: str, new_format: bool):
    """Read a labelling workbook and split it into its sentence versions.

    Args:
        path: Location of the Excel file; it is read without a header row.
        new_format: When true, the row labelled ``0`` (the first row read) is
            discarded. Otherwise only rows whose column ``0`` is empty are
            kept; per the original author's note, rows with a value there are
            the phrases that still had to be checked.

    Returns:
        A ``Phrases`` bundle with one frame per version, plus columns ``0``
        and ``1`` of the "Original" rows as ``idx`` and ``resp``.
    """
    sheet = pd.read_excel(path, index_col=None, header=None)
    sheet = sheet.drop([0], axis=0) if new_format else sheet[sheet[0].isna()]

    # The unstripped "Original" rows supply both bookkeeping columns.
    original_rows = filter_row(sheet, "Original")

    return Phrases(
        original=filter_row_drop(sheet, "Original"),
        all_female=filter_row_drop(sheet, "All Female"),
        all_male=filter_row_drop(sheet, "All Male"),
        subj_female=filter_row_drop(sheet, "Subject Female"),
        subj_male=filter_row_drop(sheet, "Subject Male"),
        idx=original_rows[0],
        resp=original_rows[1],
    )

# Baseline workbook in the old layout (new_format=False).
phrases = load_excel("./all_phrases.xlsx", new_format=False)

  warn(msg)


In [3]:
# Fix labelling errors: pull the re-labelled rows from three annotator
# workbooks and combine them into a single set of corrections (`base`).

# Annotator name -> workbook path. The loop below overwrites each path with the
# loaded Phrases object, so after it `new_dfs` maps name -> Phrases.
new_dfs = {
    "Rick": "./newly_labelled/intersection_template_rick.xlsx",
    "Hjalmar": "./newly_labelled/intersection_template_hjalmar.xlsx",
    "Artur": "./newly_labelled/intersection_template_Artur_v2.xlsx",
}

for resp in new_dfs.keys():
    new_dfs[resp] = load_excel(new_dfs[resp], new_format=True)

# Snapshot of the "Original" rows from Rick's workbook, taken BEFORE
# base.original is replaced at the end of this block. flip_if_needed reads it.
# NOTE(review): the comment further down implies these rows hold the all-male
# wording rather than the true original sentences - confirm.
all_male_sentences = new_dfs["Rick"].original.copy()

# Merge the new data
# `base` is the same object as new_dfs["Rick"] (no copy), so the assignments
# below modify Rick's frames in place.
base = new_dfs["Rick"]

# For every row whose `resp` value names another annotator, take that
# annotator's labelling for all four versions.
# NOTE(review): assumes the three workbooks list the same rows in the same
# order, so both masks select matching rows - TODO confirm.
for resp in ["Hjalmar", "Artur"]:
    base.all_female[base.resp == resp] = new_dfs[resp].all_female[
        new_dfs[resp].resp == resp
    ]
    base.all_male[base.resp == resp] = new_dfs[resp].all_male[
        new_dfs[resp].resp == resp
    ]
    base.subj_female[base.resp == resp] = new_dfs[resp].subj_female[
        new_dfs[resp].resp == resp
    ]
    base.subj_male[base.resp == resp] = new_dfs[resp].subj_male[
        new_dfs[resp].resp == resp
    ]

# Replace with the "real" original sentences, not the all_male one
# base.idx is used as row positions into the baseline `phrases.original`.
base.original = phrases.original.iloc[base.idx]

def flip_if_needed(
    original_phrases: pd.DataFrame,
    labelling_female: pd.DataFrame,
    labelling_male: pd.DataFrame,
    male_reference: pd.DataFrame = None,
):
    """Copy the reference male wording wherever the female label equals the original.

    Cells are compared by position. For every cell where ``labelling_female``
    holds a value identical to the one in ``original_phrases``, the
    corresponding cell of ``labelling_male`` is overwritten in place with the
    value from ``male_reference``. Missing cells never match.

    Args:
        original_phrases: Original tokens; its shape defines the cells visited.
        labelling_female: Female labelling, same layout as ``original_phrases``.
        labelling_male: Male labelling, modified in place.
        male_reference: Frame supplying the replacement values. When omitted,
            the notebook-level ``all_male_sentences`` is used, which keeps
            existing three-argument calls working.

    Returns:
        None; ``labelling_male`` is updated in place.
    """
    if male_reference is None:
        male_reference = all_male_sentences

    n_rows, n_cols = original_phrases.shape
    for i in range(n_rows):
        for j in range(n_cols):
            female_label = labelling_female.iloc[i, j]
            original_word = original_phrases.iloc[i, j]
            # Empty cells read from Excel are NaN rather than None, so an
            # `is not None` test lets them through; pd.isna covers None, NaN
            # and pd.NA (whose comparison result cannot be used in an `if`).
            if pd.isna(female_label) or pd.isna(original_word):
                continue
            if original_word == female_label:
                labelling_male.iloc[i, j] = male_reference.iloc[i, j]

# Run the flip check for both labelling variants: "all" first, then "subj".
for female_labels, male_labels in (
    (base.all_female, base.all_male),
    (base.subj_female, base.subj_male),
):
    flip_if_needed(base.original, female_labels, male_labels)

# Give each corrected frame the original phrase numbers as its index, then
# write it into the baseline rows at those positions.
for version in ("all_female", "all_male", "subj_female", "subj_male"):
    corrected = getattr(base, version)
    corrected.set_index(base.idx, inplace=True)
    getattr(phrases, version).iloc[base.idx.to_list()] = corrected

# Spot check: show every version of one phrase side by side, one row per cell.
idx = 1275
test_df = pd.DataFrame(
    {
        column: getattr(phrases, column).iloc[idx]
        for column in (
            "original",
            "all_female",
            "all_male",
            "subj_female",
            "subj_male",
        )
    }
)
test_df

Unnamed: 0,original,all_female,all_male,subj_female,subj_male
3,On,,,,
4,the,,,,
5,other,,,,
6,hand,,,,
7,",",,,,
8,Oliver,name_female_1,name_male_1,name_female_1,name_male_1
9,proves,,,,
10,to,,,,
11,be,,,,
12,of,,,,


In [6]:
# Placeholder tags keyed by numeric id. Declaration order matters: detect_tag()
# returns the first entry whose name is a prefix of the word, so the generic
# "name" entry has to come after "name_female" and "name_male".
tags = {
    tag_id: {"id": tag_id, "tag": tag_name, "color": color}
    for tag_id, (tag_name, color) in enumerate(
        [
            ("name_female", "#c1fba4"),
            ("name_male", "#ffef9f"),
            ("surname", "#90f1ef"),
            ("surname_pl", "#ffd6e0"),
            ("name", "#c1fba4"),
        ],
        start=1,
    )
}


def get_template():
    """Return a new export skeleton: the shared ``tags`` table and an empty sentence map."""
    return {"tags": tags, "sentences": {}}


def detect_tag(word, tag_table=None):
    """Return the id of the placeholder tag that ``word`` stands for, or ``None``.

    Args:
        word: A cell value from a labelling row.
        tag_table: Mapping of id -> ``{"id", "tag", ...}`` entries to search.
            Defaults to the notebook-level ``tags`` table.

    Returns:
        The ``"id"`` of the matching entry, or ``None`` when ``word`` is not a
        string or matches no tag.

    Matching is by prefix and returns the first hit in table order, so more
    specific names must be listed before the generic ones they start with.
    NOTE(review): an ordinary word that merely begins with a tag name (e.g.
    "names") is also reported as that tag - confirm this cannot occur in the data.
    """
    if tag_table is None:
        tag_table = tags

    # Cells read from Excel may be numbers or other non-text values; those can
    # never be placeholders and have no .startswith().
    if not isinstance(word, str):
        return None

    # Plural surnames look like "surname_<n>_pl": the marker is a suffix, so the
    # prefix scan below would classify them as plain "surname".
    if word.startswith("surname_") and word.endswith("_pl"):
        return next(
            (tag["id"] for tag in tag_table.values() if tag["tag"] == "surname_pl"),
            None,
        )

    for tag in tag_table.values():
        if word.startswith(tag["tag"]):
            return tag["id"]
    return None


def to_pos_format(sentence):
    """Convert one labelling row into a position -> annotation mapping.

    Missing cells are skipped. A cell recognised by ``detect_tag`` becomes
    ``{"tag": <id>}``; any other cell becomes ``{"replacement": <cell>}``.
    Keys are the cell's position within ``sentence``.
    """
    labelling = {}
    for position, token in enumerate(list(sentence)):
        if pd.isna(token):
            continue

        tag_id = detect_tag(token)
        labelling[position] = (
            {"replacement": token} if tag_id is None else {"tag": tag_id}
        )

    return labelling


def get_label_base(type: str, male_sentence, female_sentence):
    """Build the per-gender labelling record for one version of a sentence.

    ``male_sentence`` and ``female_sentence`` are converted with
    ``to_pos_format``; the "neutral" entry always carries an empty labelling.
    """
    male_labelling = to_pos_format(male_sentence)
    female_labelling = to_pos_format(female_sentence)

    gender = {
        "male": {"type": "male", "labelling": male_labelling},
        "female": {"type": "female", "labelling": female_labelling},
        "neutral": {"type": "neutral", "labelling": {}},
    }
    return {"type": type, "gender": gender}


def sentence_to_format(p_idx, phrases: Phrases, template):
    """Assemble the export record for the phrase at row position ``p_idx``.

    The record holds the original tokens (missing cells removed, each converted
    to ``str``), an empty ``"tags"`` dict, and the "all" and "subj" labelling
    versions. ``template`` is accepted but not read here.
    """
    tokens = [
        str(token)
        for token in phrases.original.iloc[p_idx]
        if not pd.isna(token)
    ]

    versions = {
        "all": get_label_base(
            "all",
            male_sentence=phrases.all_male.iloc[p_idx],
            female_sentence=phrases.all_female.iloc[p_idx],
        ),
        "subj": get_label_base(
            "subj",
            male_sentence=phrases.subj_male.iloc[p_idx],
            female_sentence=phrases.subj_female.iloc[p_idx],
        ),
    }

    return {
        "idx": p_idx,
        "sentence": tokens,
        "tags": {},
        "versions": versions,
    }


def phrases_to_labelling_format(phrases):
    """Convert every phrase into its export record and return the filled template.

    Records are stored under ``"sentences"``, keyed by row position in
    ``phrases.original``.
    """
    labelling = get_template()
    sentences = labelling["sentences"]

    for position in range(len(phrases.original)):
        sentences[position] = sentence_to_format(position, phrases, labelling)

    return labelling

In [7]:
import json 
  
# Convert all phrases and write the result next to the notebook as JSON.
out = phrases_to_labelling_format(phrases)

with open("./all_phrases.json", "w") as f:
    json.dump(out, f)