In [1]:
import re
import os
import json
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

In [116]:
#     def get_first_trans_defs(self):
#         try:
#             first_trans = self.soup.find_all(class_="trans")[0]
#             first_trans_soup = BeautifulSoup(str(first_trans))
#             for d in first_trans_soup.find_all(class_="def"):
#                 self.first_trans_defs += d.text.split(', ')
#                 self.first_trans_defs = [re.sub(r"[;:`,\(?\)\[\]\n]+", "", d) for d in self.first_trans_defs]
#                 self.first_trans_defs = [d.lower().strip() for d in self.first_trans_defs]
#         except IndexError:
#             pass
#         return self.first_trans_defs

In [10]:
class Entry():

    headword = '<h3.*?>((\?|\d)\s)*(.*?)</h3>'
    formae = 'Forms:\s*(.*?)</p>'
    link = 'see.*?href=\".*?\">((\?|\d)\s)*(.*?)</a>'
    alph = 'abcdefghijklmnopqrstuvwxyzáóúíéṡḟōäïāūæēṅǽüöβīḯ'
    bad_forms = ["n", "m", "f", "a", "in", "is", "na", "con", "co", "ra", "ar", "ol", "bar", "for", "far", "an", "ro", "i"]
    punctuation = " ?!,.:;†*—/\-%'$~1234567890̆ "
    prefixes = ['co', 'con', 'for', 'do', 'at', 'as', 'ad', 'ní', 'ro', 'ra', 'a', 'ar', 'ath', 'aith',
                'd', 'da', 'dan', 'der', 'derb', 'di', 'dob', 'dom', 'don', 'dot', 'é', 'fo', 'id', 'in',
                'ind', 'imm', 'm', 'mí', 'n', 'nd', 'no', 'prím', 's', 't', 'to']


    def __init__(self, file):
        self.text = file.read()
        self.soup = BeautifulSoup(self.text)
        self.forms = []
        self.defs = []
        self.trans = []
        self.first_trans = []
        self.filtered_defs = []
        self.first_trans_defs = []
        self.lemma = ''
        self.border = ''
        self.stem = ''
        
        
    def get_header(self):
        if re.search(self.headword, self.text):
            self.header = re.search(self.headword, self.text).group(3)
        else:
            self.header = ""
        return self.header
        
    def get_trans(self):
        for d in self.soup.find_all(class_="trans"):
            self.trans.append(d.text.replace("\n", ""))
            self.trans = [re.sub(r"[:`]", "", t) for t in self.trans]
            self.trans = [t.lower().strip(", \n:;.") for t in self.trans]
        return self.trans
    
    
    def get_first_trans(self):
        try:
            self.first_trans = self.trans[0].split(',')
            self.first_trans = [re.sub(r"[;:`,\(?\)\[\]\n]+", "", d) for d in self.first_trans]
            self.first_trans = [d.lower().strip() for d in self.first_trans]
        except IndexError:
            pass
        return self.first_trans
    
            
    def get_defs(self):
        for d in self.soup.find_all(class_="def"):
            self.defs += d.text.split(', ')
            self.defs = [re.sub(r"[;:`,\(?\)\[\]\n]+", "", d) for d in self.defs]
            self.defs = [d.lower().strip() for d in self.defs]
        return self.defs
    
    def intersect_defs_trans(self):
        self.filtered_defs = list(set(self.first_trans) & set(self.defs))
        return self.filtered_defs
    
    
    def get_first_trans_defs(self):
        try:
            first_trans = self.soup.find_all(class_="trans")[0]
            first_trans_soup = BeautifulSoup(str(first_trans))
            for d in first_trans_soup.find_all(class_="def"):
                self.first_trans_defs += d.text.split(', ')
                self.first_trans_defs = [re.sub(r"[;:`,\(?\)\[\]\n]+", "", d) for d in self.first_trans_defs]
                self.first_trans_defs = [d.lower().strip() for d in self.first_trans_defs]
                self.first_trans_defs = list(set(self.first_trans_defs))
        except IndexError:
            pass
        return self.first_trans_defs
        

    def get_forms(self):
        res_headword = re.search(self.headword, self.text)
        res_forms = re.search(self.formae, self.text)
        res_link = re.search(self.link, self.text)
        if res_headword:
            if res_forms:
                self.forms, self.lemma = self.process_forms(res_forms.group(1), res_headword.group(3))
            elif res_link and '(' not in res_link.group(3):
                self.forms, self.lemma = self.process_forms(res_headword.group(3), res_link.group(3))
            else:
                self.forms, self.lemma = self.process_forms(res_headword.group(3), res_headword.group(3))
        return self.forms, self.lemma


    def process_forms(self, forms, lemma):
        """
        :param forms: string with forms
        :param lemma: string with lemmas
        """
        self.lemma = lemma.split(",")[0].strip(self.punctuation)
        if '(?) ' in self.lemma:
            self.lemma = self.lemma[self.lemma.index(" ")+1:]
        if self.lemma not in self.prefixes:
            self.forms = forms.split(",") + lemma.split(",")
            self.forms = [form.strip("1234567890?†* ") for form in self.forms]
            self.forms = self.remove_junk()
            self.forms = [form for form in self.forms if len(form) != 0]
            for form in self.forms:
                form = self.check_brackets(form)
            self.border = self.find_border()
            self.stem = self.find_stem()
            for form in self.forms:
                self.normalize(form)
            self.forms = [form for form in self.forms if len(form) > 0 and form[0] != "-"]
            self.forms = [form.strip(self.punctuation) for form in self.forms]
        else:
            pass
        return self.forms, self.lemma

    def remove_junk(self):
            """
            :return: a list of forms without junk like zero-length forms and hardly restorable
            variations in the middle of the form ("-rrt(h)-" etc.)
            """
            for form in self.forms:
                if len(form) != 0:
                    if len(form) == 1 and form in self.punctuation:
                        self.forms.pop(self.forms.index(form))
                    elif form[0] == "-" and form[-1] == "-":
                        self.forms.pop(self.forms.index(form))
                    elif form[-1] == "." and len(form) <= 3:
                        self.forms.pop(self.forms.index(form))
                    elif form in self.bad_forms:
                        self.forms.pop(self.forms.index(form))
                    elif form[0] == '(' and form[-1] == ')':
                        self.forms.pop(self.forms.index(form))
                else:
                    self.forms.pop(self.forms.index(form))
            return self.forms

    def check_brackets(self, form):
        """
        Checks if there are multiple variants of the form indicated by "()" and makes
        two different forms from one form with brackets
        """
        if "(" in form and ")" in form:
            i = form.index("(")
            j = form.index(")")
            extraForm = form[:i] + form[i+1:j] + form[j+1:]
            newForm = form[:i] + form[j+1:]
            self.forms.append(extraForm)
            self.forms.append(newForm)
        elif "[" in form and "]" in form:
            i = form.index("[")
            j = form.index("]")
            extraForm = form[:i] + form[i+1:j] + form[j+1:]
            newForm = form[:i] + form[j+1:]
            self.forms.append(extraForm)
            self.forms.append(newForm)

    def find_border(self):
        for form in self.forms:
            if len(form) >=2 and form[0] == "-":
                self.border = form[1]
                break
        return self.border

    def find_stem(self):
        if len(self.forms) > 1:
            for form in self.forms:
                if len(form) > 1:
                    if form[0] != '-' and self.border != '' and self.border in form:
                        parts = form.split(self.border)
                        self.stem = self.border.join(parts[:-1])
                        break
                    elif form[0] != '-' and self.border != '' and self.border in self.lemma:
                        parts = self.lemma.split(self.border)
                        self.stem = self.border.join(parts[:-1])
                        break
        else:
            self.stem = self.lemma
        return self.stem

    def normalize(self, form):
        """Normalizes contracted forms"""
        try:
            if len(form) >= 2 and form[0] == "-":
                if self.stem[-1] == 'i' and self.border in ['l', 'm', 'n', 'r']:
                    form = self.stem[:-1] + form[1:]
                    self.forms.append(form)
                else:
                    form = self.stem + form[1:]
                    self.forms.append(form)
        except IndexError:
            pass

    def make_lemmadict(self, words):
        self.lemma = self.lemma.lower()
        for form in self.forms:
            form = form.lower()
            if len(form) != 0 and form not in self.punctuation and form not in self.bad_forms:
                if form not in words.keys():
                    words[form] = (self.lemma,)
                if self.lemma not in words.keys():
                    words[self.lemma] = (self.lemma,)
                else:
                    if self.lemma not in words[form]:
                        words[form] += (self.lemma,)
        for k, v in words.items():
            if len(v) == 0:
                words[k] = k
        words = {k:words[k] for k in words if len(k) != 0}
        return words
    
    def make_dil(self, dil):
        if len(self.defs) != 0:
            self.lemma = self.lemma.lower()
            self.lemma = re.sub("[\[\]\(\)?]+", "", self.lemma)
            self.forms = [re.sub(r"[()[],:?]", "", f) for f in self.forms]
            filtered_forms = []
            filtered_forms.append(self.lemma)
            for form in self.forms:
                form = form.lower()
                if len(form) != 0 and form not in self.punctuation and form not in self.bad_forms:
                    filtered_forms.append(form)
            dil[self.lemma] = {'lemma': self.lemma, 
                               'forms': list(set(filtered_forms)),
                               'defs': list(set(self.defs)),
                               'first_trans_defs': self.first_trans_defs,
                               'filtered_defs': self.filtered_defs,
                              'trans': list(set(self.trans))}
        return dil
        

def write_data(data, filename="parsed_dil.json"):
    with open(filename, "w", encoding = "utf-8") as f:
        json.dump(data, f, sort_keys = True, ensure_ascii = False)


## Testing

In [124]:
with open("./dil/22696.txt", 'r', encoding='utf-8') as f:
    entry = Entry(f)
    forms, lemma = entry.get_forms()
    defs = entry.get_defs()
    trans = entry.get_trans()
    first_trans = entry.get_first_trans()
    first_trans_defs = entry.get_first_trans_defs()
    filtered_defs = entry.intersect_defs_trans()

In [125]:
trans

['finds, gets',
 'finds, meets with, discovers (in a place)',
 'discovers',
 'finds',
 'gets, gains, obtains, procures',
 'gets, induces (some one to do something) causes (something to be done)',
 'meets with, experiences, undergoes',
 'dies',
 'gets (an opportunity of doing), is able (usually with vn. obj.; cf. c, d)',
 'gets = spends, lives (through)',
 'invents, devises',
 "finds in oneself,' dares, presumes (late)",
 'imparts, communicates']

In [126]:
first_trans_defs

['gets', 'finds']

In [127]:
filtered_defs

['gets', 'finds']

In [128]:
defs

['finds',
 'gets',
 'finds',
 'meets with',
 'discovers',
 'discovers',
 'finds',
 'gets',
 'gains',
 'obtains',
 'procures',
 'gets',
 'induces',
 'causes',
 'meets with',
 'experiences',
 'undergoes',
 'dies',
 'gets',
 'is able',
 'gets',
 'spends',
 'lives',
 'invents',
 'devises',
 'finds in oneself',
 'dares',
 'presumes',
 'imparts',
 'communicates']

## Parsing the eDIL

In [134]:
dil = {}

for root, dirs, files in os.walk("./dil"):
    for file in tqdm(files):
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            try:
                entry = Entry(f)
                forms, lemma = entry.get_forms()
                defs = entry.get_defs()
                trans = entry.get_trans()
                first_trans = entry.get_first_trans()
                first_trans_defs = entry.get_first_trans_defs()
                filtered_defs = entry.intersect_defs_trans()
                dil = entry.make_dil(dil)
            except (AttributeError) as e:
                print("%s: %s" % (file, str(e)))
                continue
            
    write_data(dil)

100%|██████████| 43345/43345 [05:22<00:00, 134.31it/s]


In [135]:
dil['catt']

{'lemma': 'catt',
 'forms': ['cat', 'catt'],
 'defs': ['cat-heads', 'cat', 'vulva', 'kitten'],
 'first_trans_defs': ['cat'],
 'filtered_defs': ['cat'],
 'trans': ['cat-heads', 'cat', 'vulva (?)', 'kitten']}

In [136]:
with open('parsed_dil.json', 'w', encoding='utf-8') as f:
    json.dump(dil, f, ensure_ascii = False, sort_keys=True)

## Inverting translations

In [137]:
inverted_dil = {}
for k, v in dil.items():
    for d in v['filtered_defs']:
        if d in inverted_dil:
            inverted_dil[d].append(k)
        else:
            inverted_dil[d] = [k]

In [138]:
inverted_dil['cat']

['catt']

In [139]:
with open('filtered_inverted_dil.json', 'w', encoding='utf-8') as f:
    json.dump(inverted_dil, f, ensure_ascii = False, sort_keys=True)

## Saving as a DataFrame

In [140]:
df = pd.DataFrame(dil.values())
df.head()

Unnamed: 0,lemma,forms,defs,first_trans_defs,filtered_defs,trans
0,immid,"[imme, ime, immid]","[encloses, surrounds, makes a dam, fences]","[encloses, surrounds]","[encloses, surrounds]","[encloses, surrounds, fences (a field) ; makes..."
1,seirgne,"[seirgne, seirgni]",[shrivelled],[shrivelled],[],[state of being shrivelled (?)]
2,trácht,[trácht],[breadth],[breadth],[breadth],[breadth (?)]
3,doscaílte,"[doscaílte, do-scóiltighe]","[loosen, impenetrability]","[loosen, impenetrability]",[impenetrability],"[impenetrability, state of being hard to loosen]"
4,bairille,[bairille],[barrel],[barrel],[barrel],[barrel]


In [141]:
df.to_csv("parced_dil.tsv", sep="\t", index=False, encoding="utf-8")

## Extracting variation from headwords

In [13]:
reg = re.compile(",|\(|\)|\[|\]")
headers = []

for root, dirs, files in os.walk("./dil"):
    for file in tqdm(files):
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            entry = Entry(f)
            header = entry.get_header()
            if reg.search(header):
                headers.append(header)
            

100%|██████████| 43345/43345 [03:22<00:00, 214.21it/s]


In [33]:
headers = [i.replace('?', '') for i in headers]
headers = [i.replace(':', '') for i in headers]

In [39]:
def open_brackets(form, start, end):
    i = form.index(start)
    j = form.index(end)
    form1 = form[:i] + form[i+1:j] + form[j+1:]
    form2 = form[:i] + form[j+1:]
    return [form1, form2]

def clean_headers(headers):
    cleaned_headers = []
    for header in headers:
        new_forms = []
        forms = header.split(",")
        for h in forms:
            if '(' in h:
                new_h = open_brackets(h.strip(), "(", ")")
            elif '[' in h:
                new_h = open_brackets(h.strip(), "[", "]")
            else:
                new_h = [h.strip()]
            new_forms += new_h
        cleaned_headers.append(new_forms)
    return cleaned_headers
            

In [40]:
cleaned_headers = clean_headers(headers)

In [41]:
cleaned_headers[:50]

[['ellte', 'elltes'],
 ['foimmrimm', 'foimrimm'],
 ['toirche', 'toirchid'],
 ['breóaid', 'breóid'],
 ['rígdae', 'rígda', 'rída'],
 ['oíbne', 'oíbinne', ''],
 ['elefaint', 'elefint'],
 ['indsaigthech', 'innsaigech'],
 ['fáitbiud', 'fáitbe'],
 ['ind-asaig', 'in-asaig'],
 ['fríth', 'fríthe'],
 ['meild', 'meill'],
 ['smál', 'smól', 'smúal'],
 ['timthirecht', 'timpirecht'],
 ['éilmid', 'éilmigid'],
 ['greiss', 'greis'],
 ['anmothaigthige', 'anmothaige'],
 ['riclean', 'riclen'],
 ['léagad', 'léaga'],
 ['gnó', 'gnáe'],
 ['moned', 'monad'],
 ['in-ellaig', 'inellgither'],
 ['dimbág', 'dimbáig'],
 ['ullmachad', 'ullmugad'],
 ['ná', 'na'],
 ['dá n-', 'da n-'],
 ['íarngáesach', 'íargáesach'],
 ['ionnaid', 'innaid'],
 ['lúathugad', 'lúathad'],
 ['tobach', 'tabach'],
 ['metta', 'meta'],
 ['fuimiter', 'fumiterra'],
 ['íarcain', 'íarcáin'],
 ['teinntidecht', 'teintidecht'],
 ['plannta', 'planta'],
 ['ulchobchán', 'ulchobcán', 'ulchubchán', 'ulchubcán'],
 ['córaigid', 'cóirigid', 'cóirgid'],
 ['túaraim

In [42]:
with open("header_variation.txt", "w", encoding="utf-8") as f:
    for i in cleaned_headers:
        f.write("/".join(i) + "\n")