In [22]:
import pandas as pd
import re
import pickle

In [6]:
# via https://stackoverflow.com/questions/24893977/whats-the-best-way-to-regex-replace-a-string-in-python-but-keep-its-case

def replace_keep_case(word, replacement, text):
    """Substitute every case-insensitive match of ``word`` in ``text``.

    ``word`` is handed to ``re.sub`` as a regular-expression pattern. Each
    match is replaced by ``replacement`` re-cased to mirror the matched text:
    an all-lowercase, title-case or all-uppercase match gets the same casing
    applied to ``replacement``; any other mix of cases gets ``replacement``
    exactly as given.
    """
    # (predicate on the matched text, transform applied to the replacement);
    # evaluated in this order, first predicate that holds wins.
    casings = (
        ("islower", "lower"),
        ("istitle", "title"),
        ("isupper", "upper"),
    )

    def recase(match):
        found = match.group()
        for predicate, transform in casings:
            if getattr(found, predicate)():
                return getattr(replacement, transform)()
        return replacement

    return re.sub(word, recase, text, flags=re.I)

In [176]:
# Load the tab-separated table of gendered terms; each row holds the
# equivalent wording per column (the preview below shows F-Pro, M-Pro, Non-Pro).
repl = pd.read_csv("gender_options_words.txt", sep="\t")

In [177]:
# Preview the replacement table loaded above.
repl

Unnamed: 0,F-Pro,M-Pro,Non-Pro
0,she,he,they
1,her,him,them
2,his second daughter,his second son,his second child
3,a lady,a gentleman,a gentleperson
4,this young lady,this young man,this young person
...,...,...,...
115,mamma,pappa,parent
116,mother,father,parent
117,Mrs.,Mr.,Msr.
118,Mrs. Bennet,Mr. Bennet,Msr. Bennet


In [416]:
# Load the entity-id -> {'label', 'orig_gender'} lookup.
# The context manager closes the file handle once the data is read.
# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
with open("ids_to_names_genders.p", "rb") as handle:
    idlookup = pickle.load(handle)

In [417]:
# Inspect the entity lookup loaded above.
idlookup

{-1: {'label': 'NONE', 'orig_gender': 'O'},
 0: {'label': 'Elizabeth', 'orig_gender': 'F'},
 1: {'label': 'Darcy', 'orig_gender': 'M'},
 2: {'label': 'Jane', 'orig_gender': 'F'},
 3: {'label': 'Mrs. Bennet', 'orig_gender': 'F'},
 4: {'label': 'Mr. Bennet', 'orig_gender': 'M'},
 5: {'label': 'Lydia', 'orig_gender': 'F'},
 6: {'label': 'Bingley', 'orig_gender': 'M'},
 7: {'label': 'Wickham', 'orig_gender': 'M'},
 8: {'label': 'Miss Bingley', 'orig_gender': 'F'},
 9: {'label': 'Charlotte', 'orig_gender': 'F'},
 10: {'label': 'Gardiner', 'orig_gender': 'M'},
 11: {'label': 'Mrs. Gardiner', 'orig_gender': 'F'},
 12: {'label': 'Lady Lucas', 'orig_gender': 'F'},
 13: {'label': 'Denny', 'orig_gender': 'M'},
 14: {'label': 'Sir William Lucas', 'orig_gender': 'M'},
 15: {'label': 'Maria Lucas', 'orig_gender': 'F'},
 16: {'label': 'Georgiana', 'orig_gender': 'F'},
 17: {'label': 'Lady Catherine', 'orig_gender': 'F'},
 18: {'label': 'Kitty', 'orig_gender': 'F'},
 19: {'label': 'Mary', 'orig_gender

In [180]:
# Surface forms under which each entity id may be mentioned in the text.
# Keys are entity ids; the per-key comments give the label shown for that id
# in the lookup preview.
name_variants = {
    # Elizabeth
    0: [
        "Elizabeth",
        "Lizzy",
        "Eliza",
        "Miss Bennet",
        "Elizabeth Bennet",
        "Miss Elizabeth Bennet",
        "Miss Eliza",
        "Miss Elizabeth",
        "Miss Lizzy",
    ],
    # Darcy
    1: [
        "Darcy",
        "Mr. Darcy",
        "Fitzwilliam Darcy",
    ],
    # Jane
    2: [
        "Jane",
        "Miss Jane Bennet",
        "Miss Bennet",
        "Jane Bennet",
    ],
    # Bingley
    6: [
        "Bingley",
        "Charles Bingley",
        "Mr. Bingley",
    ],
    # Mrs. Bennet
    3: ["Mrs. Bennet"],
    # Mr. Bennet
    4: ["Mr. Bennet"],
    # Lydia
    5: ["Lydia"],
    # Kitty
    18: [
        "Kitty",
        "Catherine",
    ],
    # Mary
    19: ["Mary"],
    # Group references (label for id 24 is not visible in the lookup preview)
    24: [
        "The Bennets",
        "daughters",
        "girls",
        "ladies",
        "sisters",
    ],
}

## Would Do This Via Web Form or Something

Also, it's confusing not to fix Miss Bingley - she collides with the new one here

In [419]:
# Per-entity name substitutions: entity id -> {original form: swapped form}.
# Insertion order is significant for any code that iterates these mappings.
name_replacements = {
    0: {
        "Elizabeth": "Edward",
        "LIZZY": "EDDIE",
        "Lizzy": "Eddie",
        "Eliza": "Ed",
        "Miss Bennet": "Mr. Bennet",
        "Elizabeth Bennet": "Edward Bennet",
        "Miss Eliza": "Mr. Ed",
        "Miss Elizabeth": "Mr. Edward",
        "Miss Lizzy": "Mr. Eddie",
    },
    1: {
        "Darcy": "Darcy",
        "Mr. Darcy": "Miss Darcy",
        "Fitzwilliam Darcy": "Philomena Darcy",
    },
    2: {
        "Jane": "John",
        "Jane Bennet": "John Bennet",
        "Miss Bennet": "Mr. Bennet",
    },
    6: {
        "Bingley": "Bingley",
        "Charles": "Charlotte",
        "Mr. Bingley": "Miss Bingley",
        "Charles Bingley": "Charlotte Bingley",
    },
    3: {"Mrs. Bennet": "Mr. Bennet"},
    4: {"Mr. Bennet": "Mrs. Bennet"},
    5: {"Lydia": "Luke"},
    18: {
        "Kitty": "Connie",
        "Catherine": "Connor",
    },
    19: {"Mary": "Morris"},
    24: {
        "daughters": "sons",
        "girls": "boys",
        "ladies": "gentlemen",
        "sisters": "brothers",
    },
}

# Entity ids whose mentions get rewritten.
swaps = [
    0, 1, 2, 3, 4,
    6, 5, 18, 19, 24,
]

### Code For It

In [320]:
def get_best_repl(inputtext, inputcol, outputcol):
    """Swap the longest term from ``repl[inputcol]`` that occurs in ``inputtext``.

    Every string in the ``inputcol`` column of the module-level ``repl`` table
    is looked for in ``inputtext`` as a whole word or phrase (case-sensitive).
    Among the terms found, the longest is chosen (the earliest row wins a
    tie) and each whole-word occurrence of it is replaced, via
    ``replace_keep_case``, by the value in the same row of ``outputcol``.

    Parameters
    ----------
    inputtext : str
        Text to rewrite.
    inputcol : str
        Name of the ``repl`` column holding the terms to look for.
    outputcol : str
        Name of the ``repl`` column holding the replacement terms.

    Returns
    -------
    str
        The rewritten text, or ``inputtext`` unchanged when no term matches.
    """
    best = None  # (term length, whole-word pattern, replacement text)
    for term, replacement in zip(repl[inputcol], repl[outputcol]):
        # Blank cells are read as NaN (a float); skip them and empty strings
        # instead of failing on string concatenation or matching everywhere.
        if not isinstance(term, str) or not term:
            continue
        if not isinstance(replacement, str):
            continue
        # Escape the term so characters such as "." match literally, and use
        # lookarounds rather than \b so a term that ends in punctuation
        # (e.g. "Mrs.") can still match when followed by a space.
        pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
        # Search the text itself: wrapping the *text* in r"\b" would prepend a
        # literal backslash and "b", hiding any term at the very start.
        if re.search(pattern, inputtext) is None:
            continue
        if best is None or len(term) > best[0]:
            best = (len(term), pattern, replacement)

    if best is None:
        return inputtext

    _, pattern, replacement = best
    # Reuse the bounded pattern so that e.g. "he" is not rewritten inside "the".
    return replace_keep_case(pattern, replacement, inputtext)

def check_substring(id, name):
    """Rewrite a known name that appears inside ``name``.

    Looks at the keys of ``name_replacements[id]`` that occur as substrings of
    ``name`` (case-sensitive) and applies the replacement for the longest one,
    so a specific entry such as "Mr. Darcy" is not shadowed by a shorter entry
    such as "Darcy" that happens to come first in the mapping. Keys of equal
    length keep their mapping order.

    Parameters
    ----------
    id : int
        Entity id; must be a key of ``name_replacements``.
    name : str
        The mention text to rewrite.

    Returns
    -------
    str or None
        ``name`` with the chosen key replaced (case preserved by
        ``replace_keep_case``), or ``None`` when no key occurs in ``name``.
    """
    options = name_replacements[id]
    found = [key for key in options if key in name]
    if not found:
        return None
    # max() returns the first of several equally long keys.
    key = max(found, key=len)
    # The key was located as a literal substring, so replace it literally too
    # ("." in "Mr." must not act as a regex wildcard).
    return replace_keep_case(re.escape(key), options[key], name)

def try_name(id, string):
    """Look up the replacement for a name mention of entity ``id``.

    Returns the mapped value when ``string`` is itself a key of
    ``name_replacements[id]``; otherwise returns whatever
    ``check_substring(id, string)`` produces.
    """
    known = name_replacements[id]
    if string not in known:
        return check_substring(id, string)
    return known[string]

def get_exact(repl, col, match, returncol):
    """Return ``returncol`` from the first row of ``repl`` whose ``col`` equals ``match``.

    Gives ``None`` when no row has ``repl[col] == match``.
    """
    candidates = repl.loc[repl[col] == match, returncol]
    if len(candidates) == 0:
        return None
    return candidates.values[0]

In [247]:
def do_replace(entid, oldstring):
    """Return the gender-swapped form of one mention of entity ``entid``.

    Mentions of entities not listed in ``swaps`` are returned unchanged. For
    the others, the following are tried in order and the first non-empty
    result is returned:

    1. ``try_name`` - the entity's own name substitutions;
    2. ``get_exact`` - a whole-string match in the ``repl`` table;
    3. ``get_best_repl`` - the longest ``repl`` term found inside the mention.

    Steps 2 and 3 need to know which ``repl`` column to read and which to
    write, which depends on the entity's ``orig_gender``. When that value is
    neither "M" nor "F", only the name substitution is attempted.

    Parameters
    ----------
    entid : int
        Entity id of the mention.
    oldstring : str
        Mention text as it appears in the source.

    Returns
    -------
    str
        The rewritten mention, or ``oldstring`` when nothing applies.
    """
    if entid not in swaps:
        return oldstring

    # NOTE(review): `newlookup` is not defined in the cells visible here
    # (only `idlookup` is loaded) - confirm where it comes from.
    gender = newlookup[entid]['orig_gender']
    # orig_gender -> (column to match against, column to take the result from)
    pronoun_columns = {
        "M": ("M-Pro", "F-Pro"),
        "F": ("F-Pro", "M-Pro"),
    }.get(gender)

    new = try_name(entid, oldstring)
    if not new and pronoun_columns is not None:
        sourcepron, newpron = pronoun_columns
        new = get_exact(repl, sourcepron, oldstring, newpron)
        if not new:
            new = get_best_repl(oldstring, sourcepron, newpron)

    return new if new else oldstring

In [420]:
def special_case_check(i, df, start, oldstring, newstring):
    """Turn a "her" -> "him" swap into "his" when "her" is possessive.

    "her" is treated as possessive when the row that follows it in ``df``
    starts at the same offset, i.e. the next annotation is a longer span that
    begins with this "her".

    Parameters
    ----------
    i : hashable
        Index *label* of the current row in ``df`` (as yielded by
        ``df.iterrows()``).
    df : pandas.DataFrame
        Annotation table with a ``start`` column, in processing order.
    start : int
        Start offset of the current annotation.
    oldstring : str
        Original mention text.
    newstring : str
        Replacement chosen so far.

    Returns
    -------
    str
        "his" for the special case, otherwise ``newstring`` unchanged.
    """
    if oldstring != "her" or newstring != "him":
        return newstring

    # `i` is an index label. The frame is sorted before it is iterated, so
    # label i + 1 is not necessarily the following row and may not exist at
    # all; locate the neighbour by position instead.
    try:
        position = df.index.get_loc(i)
    except KeyError:
        return newstring
    # get_loc only returns an int for a unique label; the last row has no
    # follower to compare with.
    if not isinstance(position, int) or position + 1 >= len(df):
        return newstring

    if df['start'].iloc[position + 1] == start:
        print("special case match, returning his", oldstring, newstring)
        return "his"
    return newstring

In [326]:
!mkdir drafts

mkdir: drafts: File exists


In [290]:
!mkdir final

In [421]:
# Directory holding the per-chapter annotation tables (merged_data_<n>.tsv).
ann_sourcepath = "cl-coref-annotator/newchaps"
# Directory holding the per-chapter source texts (austen_chap<n>_brat.txt).
text_path = "cl-coref-annotator/chaps"

In [422]:
def write_chapter_draft(num):
    """Write the gender-swapped draft of chapter ``num`` to ``./drafts``.

    Reads the annotation table ``{ann_sourcepath}/merged_data_{num}.tsv`` and
    the chapter text ``{text_path}/austen_chap{num}_brat.txt``, walks the
    annotations ordered by start offset (shorter spans first on ties), and
    writes the text with each annotated mention replaced by ``do_replace``
    (plus ``special_case_check`` for non-overlapping mentions) to
    ``./drafts/chapter_{num}.txt``.

    Parameters
    ----------
    num : int
        Chapter number used to build the input and output file names.
    """
    mergedf = pd.read_csv(f"{ann_sourcepath}/merged_data_{num}.tsv", sep="\t")
    mergedf['len'] = mergedf['string'].apply(len)
    # Non-mutating sort; same ordering as before.
    mergedf = mergedf.sort_values(by=["start", "len"], ascending=True)
    filename = f"{text_path}/austen_chap{num}_brat.txt"
    with open(filename) as handle:
        text = handle.read()
    with open(f"./drafts/chapter_{str(num)}.txt", "w") as handle:
        current = 0
        # Initialised up front: if the very first annotation starts at offset
        # 0 the overlap branch runs first and reads both of these.
        replaced = False
        lastlen = 0
        for i, row in mergedf.iterrows():
            start = row['start']
            ent = row['id']
            oldstring = row['string']
            end = row['end']
            if row['start'] > current:
                # Copy the untouched text up to the mention.
                # NOTE(review): `start-1` / `end + 1` appear to assume exactly
                # one separator character on each side of a mention - confirm.
                handle.write(text[current: start-1] + " ")
                replacement = do_replace(ent, oldstring)
                replacement = special_case_check(i, mergedf, start, oldstring, replacement)
                print("new replacement", replacement)
                handle.write(replacement + " ")
                if replacement != oldstring:
                    replaced = True
                else:
                    replaced = False
                lastlen = end - start
                current = row['end'] + 1
            else:
                # This annotation begins at or before the write position.
                print("overlapping entity", oldstring, oldstring[lastlen:])
                if replaced:
                    # Drop the leading part already written by the previous mention.
                    print("replaced is true", oldstring[lastlen:])
                    oldstring = oldstring[lastlen:]
                # NOTE(review): `replaced` is not updated in this branch, and
                # when the previous mention was left unchanged the full string
                # is written again - verify this is intended.
                replacement = do_replace(ent, oldstring)
                print("after replacement2", replacement)
                handle.write(replacement + " ")
                lastlen = end - start
                current = row['end'] + 1
        # Copy whatever follows the last mention.
        handle.write(text[current:])

In [423]:
def clean_draft(text):
    """Undo tokenizer spacing and markup in a draft chapter.

    Applies a fixed sequence of literal substitutions (quote markers,
    underscores, spaces before punctuation and clitics, bracket tokens,
    double spaces) and finally puts a blank line after each
    "Chapter <number>" heading. Returns the cleaned text.
    """
    # Applied strictly in this order; some entries repeat on purpose because
    # an earlier pass can leave the same pattern behind.
    literal_fixes = (
        ("`` ", '"'),
        (" ''", '"'),
        (" _ ", " "),
        ("_ ", ""),
        (" ,", ","),
        (" .", "."),
        (" ,", ","),
        (" !", "!"),
        (" ?", "?"),
        (" ;", ";"),
        (" :", ":"),
        ("  ", " "),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        ("-LRB-", " "),
        ("-RRB-", " "),
        ("  ", " "),
    )
    for before, after in literal_fixes:
        text = text.replace(before, after)
    return re.sub(r"(Chapter [0-9]+) ", r"\1\n\n", text)

In [424]:
def write_clean_chapter(i):
    """Clean the draft of chapter ``i`` and save it under ``./final``.

    Reads ``./drafts/chapter_{i}.txt``, runs it through ``clean_draft`` and
    writes the result to ``./final/chapter_{i}.txt``.
    """
    draft_path = f"./drafts/chapter_{str(i)}.txt"
    final_path = f"./final/chapter_{str(i)}.txt"
    with open(draft_path, "r") as draft_file:
        cleaned = clean_draft(draft_file.read())
    with open(final_path, "w") as final_file:
        final_file.write(cleaned)

In [None]:
# Build the draft, then the cleaned final text, for chapters 0 through 59.
for i in range(60):
    write_chapter_draft(i)
    write_clean_chapter(i)