# A crawler for Irish text from UCC CELT

Requires a list of pages to download as input.

In [2]:
import re
import time
import requests
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib3.exceptions import MaxRetryError, NewConnectionError
from requests.exceptions import ConnectionError

In [4]:
# Mount Google Drive at /content/drive; the google.colab package is provided
# by the Colab runtime, so this cell is specific to that environment.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd drive/My Drive/Colab Notebooks/medieval-embeddings

/content/drive/My Drive/Colab Notebooks/medieval-embeddings


In [51]:
# Character class of punctuation marks. Not referenced by any other code
# visible in this notebook.
# NOTE(review): this is a non-raw string, so "\|", "\[" and "\]" only work
# because Python keeps unknown escapes as-is (recent versions warn about it).
punc = "[—‘’:;.,!?–&°£§$\"^«»<>/\|%=\[\]()'*+]+"

# Template for one dataset row: a dict literal with 13 %s placeholders that
# add_entry fills with `pattern % params` and then evaluates. "Group" and
# "Comment" are fixed placeholders; the three text columns start as an en dash
# and are overwritten by crawl() after the row has been added.
entry_pattern = """
{"Title":"%s", "Editor":"%s", "Size (words)":"%s", "Real size (tokens)":"%s", "Unique forms":"%s", "Dates":"%s", "Irish":"%s", "Other languages":"%s", "Form":"%s", "Genre":"%s", "Subgenre":"%s", "Group":"—", "Author":"%s", "Link":"%s", "Comment":"—", "Raw Text":"–", "Text":"–", "Preprocessed Text":"–"}
"""

# Columns of the DataFrame created in crawl(); same names as the keys of
# entry_pattern.
columns = ["Title", "Editor", "Size (words)", "Real size (tokens)", "Unique forms", "Dates", "Irish", 
           "Other languages", "Form", "Genre", "Subgenre", "Group", "Author", "Link", "Comment", "Raw Text",
           "Text", "Preprocessed Text"]

# Word-initial spellings and what demutate() replaces them with. demutate()
# looks up the first two characters of a word and, failing that, the first one.
# NOTE(review): with that lookup the three-character keys ('bhf', "mh'") can
# never be hit. 'n': 'e' turns any otherwise unmatched initial "n" into "e" and
# looks like a typo for 'ne': 'e' - confirm.
mutations = {
                'bh': 'b', 'mb': 'b', 'ch': 'c', 'gc': 'c', 'gh': 'g',
                'ng': 'g', 'dh': 'd', 'nd': 'd', 'fh': 'f', 'ḟ': 'f',
                'ḟh': 'f', 'bhf': 'f', 'mm': 'm', 'll': 'l', 'nn': 'n',
                'ph': 'p', 'bp': 'p', 'rr': 'r', 'sh': 's', 'ṡ': 's',
                'th': 't', 'dt': 't', 'he': 'e', 'hé': 'é', 'ha': 'a',
                'há': 'á', 'hi': 'i', 'hí': 'í', 'ho': 'o', 'hó': 'ó',
                'hu': 'u', 'hú': 'ú', 'n-': '', 'h-': '', 'ss': 's',
                'ts': 's', 'n': 'e', 'né': 'é', 'na': 'a', 'ná': 'á',
                'ni': 'i', 'ní': 'í', 'no': 'o', 'nó': 'ó', 'nu': 'u',
                'nú': 'ú', 'm-': '', 't-': '', 't\'': ' ', 'm\'': '',
                'd\'': '', 'l-': '', 'mh': 'm', 'r-': '', 's-': '',
                'cc': 'c', 'mh\'': '', 'g-': 'g'}


# Multiword names, one per line, mapped to their underscore-joined form for
# join_names(). Blank and whitespace-only lines are skipped: an empty key is
# useless, and an all-space key would map to '' and make join_names() delete
# every space in the text.
with open('./data/composite_names.txt', 'r', encoding='utf-8') as f:
    names = [name for name in f.read().split('\n') if name.strip()]
    namedict = {name: '_'.join(name.split()) for name in names}

# Tab-separated lines "<page URL>\t<title>" -> {URL: title}; empty lines are
# ignored. Each line is split only once.
with open('./data/celt_links.txt', 'r', encoding='utf-8') as f:
    lst = f.read().split('\n')
    links = {fields[0]: fields[1]
             for fields in (entry.split('\t') for entry in lst if entry)}

In [8]:
print(len(links))

485


In [52]:
# Lookup tables keyed by page URL. get_params() falls back to a placeholder
# for pages that are missing from a table.
genres = {}
subgenres = {}
forms = {}

# Pages are assigned to a group purely by their position in the sorted URL
# list, so these index ranges are tied to the current contents of `links`.
sorted_links = sorted(links.keys())
verse = [sorted_links[24]] + sorted_links[195:459] + [sorted_links[462]] + [sorted_links[464]]
tracts = sorted_links[459:462] + [sorted_links[463]] + sorted_links[466:]
annals = sorted_links[:24] + [sorted_links[25]]

# (pages, form, genre, subgenre); None means "leave that table untouched".
for pages, form_label, genre_label, subgenre_label in (
        (verse, 'Verse', None, None),
        (tracts, 'Prose', 'Tract', None),
        (annals, 'Prose', 'Annals', 'History'),
):
    for link in pages:
        forms[link] = form_label
        if genre_label is not None:
            genres[link] = genre_label
        if subgenre_label is not None:
            subgenres[link] = subgenre_label

In [53]:
def clean(text):
    """
    Cleans text from all sorts of rubbish.

    Applies a fixed sequence of regex substitutions. Every step works on the
    output of the previous one, so the order of the lines matters.

    :param text: str, plain text extracted from a page
    :return: str, cleaned text
    """
    text = re.sub(r"\nU\d+ ", r" ", text) # removes error codes
    text = re.sub(r"(p|P)(age)?\.? ?\d+", r" ", text)  # removes page numbers
    text = re.sub(r"\{(.+?)\}\s*?", r" ", text)  # removes MS folios
    text = re.sub(r"(\.|[a-z])\d*(\n|\s)", r"\1\2", text)  # removes footnote numbers
    text = re.sub(r"\n(\s*?)¶*(\d)+~*\]", r"\n", text)  # removes line numbers
    text = re.sub(r"[A-Z]{0,4}\d+[A-Z]{0,3}\.?( |\n)", r" ", text) # removes random numbers, text ids and sections
    # NOTE(review): the first alternative is optional, so it always succeeds
    # (possibly with an empty match) and the "Kl" branch is never tried; this
    # step only deletes capital-letters-plus-digits codes.
    text = re.sub(r"([A-Z]+\d+)?|Kl\.?(\n| )|", "", text) # removes section numbers in annals
    text= re.sub(r" \d+ ", " ", text) #removes numbers
    text = re.sub(r"\.i\.", r"id_est", text) # changes .i. to 'id_est'
    text = re.sub(r"\.l\.", r"vel", text) # changes .l. to 'vel'
    text = re.sub(r" (&|s)?rl\.?", r" etc", text) # changes &rl to 'etc'
    text = re.sub(r"(\[.*?\])", "", text) # removes restored letters and comments in []
    text = re.sub(r"\n[XVICDML]{1,6}\.", "", text) # removes roman numbers
    text = re.sub(r"(Text\s.*?\n)|((\.|\s)[ivxUuírcfl]+?\.)|sic\.|(Incipit|Finit|Finis|FINIT|FINIS|Amen|anno\sdomini|dixit|Author:?|Unknown|folio|Folio)[.,\n\s]*", r"", text)  # removes Latin & editor's comments
    # NOTE(review): inside the character classes ",-?" is a range (comma up to
    # question mark), so digits, "/", "<", "=" and ">" are matched as well.
    text = re.sub(r"(\w|[.:;,-?!’])\s([.:;,-?!’])", r"\1\2", text) # removes whitespaces between words and punctuation
    # Without re.MULTILINE "^" only matches at the very start of the text, so
    # this drops a leading "l"; the "l^" alternative can never match.
    text = re.sub(r"^l|l^", "", text) # removes these weird things
    text = re.sub(r"(\.|\.’|’|,)(‘|\w)", r"\1 \2", text)  # adds necessary whitespaces
    text = re.sub(r"([.?!]’*)(\s|‘)", r"\1\n\2", text)  # moves each sentence to a new line
    text = re.sub(r"(\w|,|&|-|\s|;)\n", r"\1 ", text)  # joins sentence parts that were on different lines
    # NOTE(review): same ",-?" range as above.
    text = re.sub(r"(\n|\s|-|‘)\s+(\w|&|[’'.:;,-?!]*|\n)", r"\1\2", text)  # removes unnecessary whitespaces
    text = re.sub(r"~|§|\[|\]|\(|\)|\|", r" ",text)  # replaces leftover symbols with a space
    text = re.sub(r"\s-(\w+)", r"-\1", text) # removes whitespaces before dashes
    text = re.sub(r"(\w+)-\s", r"\1-", text) # removes whitespaces after dashes
    text = re.sub(r"\t", " ", text) # changes tabs to spaces
    text = re.sub(r"[.,]", r" ", text) # replaces every full stop and comma with a space
    text = re.sub(r" +", r" ", text) # replaces multiple whitespaces with one
    text = re.sub(r"\n+", r"\n", text) # replaces multiple newlines with one
    text = re.sub(r" \n", r"\n", text) # removes whitespaces before newlines
    text = re.sub(r"\n ", r"\n", text) # removes whitespaces after newlines
    
    return text

In [112]:
# Patterns that find_info() runs over the HTML source of a page.
# title = re.compile(r'(<h1>|<title>)(.*?)(</h1>|</title>)')
# Group 2: text after the "File Description" heading (skipping an optional
# "... compiled by " lead-in) up to the next <p> or </p>. No DOTALL here, so
# the whole match has to sit on a single line.
editor = re.compile(r'<h3>File Description</h3>(.*?compiled by )?(.*?)</?p>')
# Group 1: content of the "<h2>Author: ...</h2>" heading (may span lines).
author = re.compile(r'<h2>Author:\s+?(.*?)</h2>', re.DOTALL)
# Group 1: the number in front of " words" or " gmd".
words = re.compile(r'(\d+) (words|gmd)')
# Group 1: the text after "Date range: ".
# NOTE(review): the upper bound must start with a digit, so a range such as
# "c. 550-c.1588" is cut down to "c. 550" (see the Annals of Ulster rows in
# the output below).
dates = re.compile(r'Date range: (.*?\d+(–|-)?\d+(.*?century)?)')
# irish = re.compile(r'Language.*?((Old |Middle |(Early Modern )|Classical | )\S*?Irish).*?</h5>', re.DOTALL)
# From the first "Language" through the next "Irish" to the following </h5>.
# With DOTALL this can run across several <h5> language entries.
irish_text = re.compile(r'Language.*?Irish.*?</h5>', re.DOTALL)
# Period names inside the span found by irish_text. "Old", "Middle" and
# "Early Modern" match on their own; only "Classical" has to be followed by
# "Irish", and that match then extends up to "Irish".
# NOTE(review): "(Old|Middle|Early Modern|Classical)" may have been intended
# as a group - confirm before changing, since findall() output would change.
irish = re.compile(r'Old|Middle|Early Modern|Classical.*?Irish', re.DOTALL)
# Group 1: two-letter language code in square brackets after "Language".
langs = re.compile(r'Language.*?\[([A-Z]{2})\]', re.DOTALL)

In [90]:
# Sample "Language" entries for trying out the irish_text / irish patterns.
# Only test2 is used by the cells that follow.
test1 = "Language: [GA] More than 99% is in Middle Irish and Early Modern Irish.</h5>"
test2 = "Language: [GA] Most of the later annals are in Old, Middle, and Early Modern Irish with very many Latin words and formulae.</h5>"

In [109]:
t = irish_text.search(test2).group(0)

In [110]:
irish.findall(t)

['Old', 'Middle', 'Early Modern']

In [120]:
def demutate(word, table=None):
    """
    Checks if the word is mutated and restores its original form.

    The longest prefix of the word that is a key of the mutation table is
    replaced by the table's value. Trying longer prefixes first lets
    three-character keys such as 'bhf' win over their two-character
    beginnings ('bh'), which a two-then-one character lookup would never
    reach.

    :param word: str
    :param table: dict mapping mutated prefixes to their replacements;
        defaults to the module-level ``mutations``
    :return: str, the word unchanged if no prefix is in the table
    """
    if table is None:
        table = mutations
    # Never look at prefixes longer than the longest key or the word itself.
    longest = max(map(len, table), default=0)
    for size in range(min(longest, len(word)), 0, -1):
        prefix = word[:size]
        if prefix in table:
            return table[prefix] + word[size:]
    return word


def join_names(text, names=None):
    """
    Joins multiword names with underscores.

    Names are replaced literally (they are plain strings, not regular
    expressions, so characters like '.' or '(' need no escaping) and longer
    names are handled first, so a short name cannot break up a longer name
    that contains it.

    :param text: str
    :param names: dict mapping each name to its underscore-joined form;
        defaults to the module-level ``namedict``
    :return: str
    """
    if names is None:
        names = namedict
    for name in sorted(names, key=len, reverse=True):
        if name:  # an empty key has nothing to replace
            text = text.replace(name, names[name])
    return text

def regex_wrapper(regex, gr):
    """
    Returns one group of a search result, or a placeholder if nothing
    was found.

    :param regex: match object returned by ``pattern.search(...)``, or None
    :param gr: number of the group to return
    :return: the group's text, or "–" when ``regex`` is None
    """
    if regex is None:
        return "–"
    return regex.group(gr)

def find_info(text):
    """
    Finds metadata on the text's webpage.

    Every field goes through regex_wrapper(), so a page that lacks one of
    them yields the "–" placeholder for that field instead of raising
    AttributeError. Irish periods and language codes are de-duplicated in
    order of first appearance, which keeps the result identical between runs.

    :param text: str, HTML source of the page
    :return: tuple of str (editor, author, size in words, dates,
        Irish periods, language codes)
    """
    ed = regex_wrapper(editor.search(text), 2).replace("\n", " ").strip(' ')
    au = regex_wrapper(author.search(text), 1).replace("\n", " ").strip('[] ').capitalize()
    wo = regex_wrapper(words.search(text), 1).replace("\n", "").strip(' ')
    da = regex_wrapper(dates.search(text), 1).replace("\n", " ").strip(' ')
    # When no Irish language entry exists the placeholder contains no period
    # name, so `ir` simply ends up empty.
    ir_text = regex_wrapper(irish_text.search(text), 0)
    ir = ', '.join(dict.fromkeys(irish.findall(ir_text)))
    la = ', '.join(dict.fromkeys(langs.findall(text)))
    return ed, au, wo, da, ir, la


def preprocess_and_count(text):
    """
    Normalises a cleaned text and counts its tokens.

    Punctuation is blanked out, runs of newlines or spaces collapse into a
    single space, multiword names are joined and everything is lowercased.
    Tokens are whatever str.split() yields.
    NB! Naive tokenization!

    :param text: str
    :return: (preprocessed text, number of tokens, number of distinct tokens)
    """
    without_punctuation = re.sub(r'[!"#$%&()\*+,\.:;<=>?@\[\]^_`{|}‘’~„“«»†*—]', ' ', text)
    single_spaced = re.sub(r'((\r?\n)+)|( +)', ' ', without_punctuation)
    # Names are joined before lowercasing.
    normalised = join_names(single_spaced).lower()
    pieces = normalised.split()
    return normalised, len(pieces), len(set(pieces))

    
def download_page(link, timeout=30):
    """
    Downloads a webpage and parses it.

    :param link: str, URL of the page
    :param timeout: seconds to wait for the server, passed to requests.get();
        without it a stalled connection would block the crawl indefinitely
    :return: BeautifulSoup object, or None if the page could not be fetched
    """
    try:
        response = requests.get(link, timeout=timeout)
    # ConnectionError is the requests exception imported at the top of the
    # notebook; requests.exceptions.Timeout covers the timeouts that the
    # `timeout` argument can now raise.
    except (ConnectionError, MaxRetryError, NewConnectionError,
            TimeoutError, requests.exceptions.Timeout):
        print("Connection Error: %s" % link)
        return None
    return BeautifulSoup(response.text, "html.parser")


def get_text(soup):
    """
    Returns the plain text of a parsed page.

    Only what follows the last "Corpus of Electronic Texts Edition: " marker
    is kept; if the marker does not occur, the whole text is returned.
    """
    marker = "Corpus of Electronic Texts Edition: "
    _, _, body = soup.get_text().rpartition(marker)
    return body


def get_params(name, link):
    """
    Gets parameters for a single entry.

    :param name: str, title of the text
    :param link: str, URL of the page
    :return: tuple (params, text, raw_text, preprocessed), where params is
        the tuple that fills the placeholders of entry_pattern; None if the
        page is unavailable or processing hit a RecursionError
    """
    page = download_page(link)
    # A document without a <title> element has page.title set to None.
    has_title = page is not None and page.title is not None
    if page is None or (has_title and page.title.text == '404 Not Found'):
        print("Bad link: %s" % link)
        return None

    try:
        raw_text = get_text(page)
        text = clean(raw_text)
        # Local names differ from the module-level patterns on purpose.
        ed, au, size, date_range, periods, languages = find_info(str(page))
        preprocessed, tokens, unique = preprocess_and_count(text)
    except RecursionError:
        # Seen on a few pages in the crawl log (presumably very deeply
        # nested markup - not verified).
        print("Recursion error: %s" % link)
        return None

    params = (name, ed, size, tokens, unique, date_range, periods, languages,
              forms.get(link, '–'), genres.get(link, '–'),
              subgenres.get(link, '–'), au, link)
    return (params, text, raw_text, preprocessed)

    
def add_entry(df, params, pattern):
    """
    Adds a row to the dataset.

    The values are escaped before they are put into the template, and the
    result is parsed with ast.literal_eval() instead of eval(). Scraped text
    that contains quotes, backslashes or line breaks therefore ends up in the
    row unchanged and can never be executed as code.

    :param df: pandas.DataFrame to extend (not modified)
    :param params: tuple with one value per %s placeholder of ``pattern``
    :param pattern: dict-literal template; the placeholders are expected to
        sit inside double-quoted strings, as in entry_pattern
    :return: new pandas.DataFrame with the row appended
    """
    import ast
    import json

    # json.dumps() yields a double-quoted literal whose escapes Python reads
    # the same way; [1:-1] drops the quotes, the template has its own.
    escaped = tuple(json.dumps(str(value), ensure_ascii=False)[1:-1]
                    for value in params)
    entry = ast.literal_eval((pattern % escaped).strip())

    # DataFrame.append() no longer exists in pandas 2; build a one-row frame.
    row = pd.DataFrame([entry])
    if len(df) == 0:
        # Keep the column order of the (still empty) dataset.
        extra = [col for col in row.columns if col not in df.columns]
        return row.reindex(columns=list(df.columns) + extra)
    return pd.concat([df, row], ignore_index=True)
    

def crawl(links, delay=0):
    """
    Downloads web pages from the list, processes them
    and creates a dataset with raw texts, cleaned texts
    and metadata.

    :param links: dict {page URL: title}
    :param delay: seconds to pause after each page (0 = no pause)
    :return: pandas.DataFrame with one row per successfully processed page
    """
    data = pd.DataFrame(columns=columns)
    for link, name in links.items():
        print(link)
        res = get_params(name, link)
        if not res:
            print("No result")
        else:
            params, text, raw_text, preprocessed = res
            data = add_entry(data, params, entry_pattern)
            # The new row is the last one. Addressing it by its label avoids
            # passing an Index to .at (which expects a scalar) and cannot hit
            # an older row that happens to carry the same title.
            row = data.index[-1]
            data.at[row, "Raw Text"] = raw_text
            data.at[row, "Text"] = text
            data.at[row, "Preprocessed Text"] = preprocessed
        if delay:
            time.sleep(delay)
    return data


def save_data(data, filename):
    """
    Writes the dataset to disk twice: as a tab-separated UTF-8 file without
    the index (<filename>.csv) and as a pickle (<filename>.pkl).

    :param data: pandas.DataFrame
    :param filename: str, output path without extension
    """
    csv_path = filename + '.csv'
    pickle_path = filename + '.pkl'
    data.to_csv(csv_path, sep='\t', index=False, encoding='utf-8')
    with open(pickle_path, 'wb') as handle:
        pkl.dump(data, handle)

In [121]:
%time data = crawl(links)

https://celt.ucc.ie//published/G100001A.html
https://celt.ucc.ie//published/G100001B.html
https://celt.ucc.ie//published/G100002.html
https://celt.ucc.ie//published/G100004.html
https://celt.ucc.ie//published/G100004P.html
https://celt.ucc.ie//published/G100005A.html
Recursion error: https://celt.ucc.ie//published/G100005A.html
No result
https://celt.ucc.ie//published/G100005B.html
https://celt.ucc.ie//published/G100005C.html
https://celt.ucc.ie//published/G100005D.html
https://celt.ucc.ie//published/G100005E.html
https://celt.ucc.ie//published/G100005F.html
https://celt.ucc.ie//published/G100010A.html
https://celt.ucc.ie//published/G100011.html
https://celt.ucc.ie//published/G100013.html
https://celt.ucc.ie//published/G100014.html
https://celt.ucc.ie//published/G100015.html
https://celt.ucc.ie//published/G100016.html
Recursion error: https://celt.ucc.ie//published/G100016.html
No result
https://celt.ucc.ie//published/G100017.html
https://celt.ucc.ie//published/G100018.html
https://cel

In [122]:
len(data)

483

In [123]:
data.head()

Unnamed: 0,Title,Editor,Size (words),Real size (tokens),Unique forms,Dates,Irish,Other languages,Form,Genre,Subgenre,Group,Author,Link,Comment,Raw Text,Text,Preprocessed Text
0,The Annals of Ulster (to AD 1201),"Donnchadh Ó Corráin, Mavis Cournane",118200,65982,12514,c. 550,,"LA, EN, GA",Prose,Annals,History,—,Unknown,https://celt.ucc.ie//published/G100001A.html,—,G100001A\nThe Annals of Ulster: Author:\t[unkn...,The Annals of Ulster: List of witnesses H: Du...,the annals of ulster list of witnesses h dubl...
1,The Annals of Ulster (to AD 1378),Donnchadh Ó Corráin,760,28984,5329,c. 550,,,Prose,Annals,History,—,Unknown,https://celt.ucc.ie//published/G100001B.html,—,G100001B\nAnnala Uladh: Annals of Ulster other...,Annala Uladh: Annals of Ulster otherwise Anna...,annala uladh annals of ulster otherwise annal...
2,Annals of Tigernach,Donnchadh Ó Corráin,870,42546,9266,Mid-6th century to 1178,Early Modern,,Prose,Annals,History,—,Unknown,https://celt.ucc.ie//published/G100002.html,—,G100002\nAnnals of Tigernach: Author:\t[unknow...,Annals of Tigernach: K\nui Quies sancti Ciann...,annals of tigernach k ui quies sancti ciannai...
3,Annals of Inisfallen,Seán Mac Airt,65500,35811,8868,550-1326,"Old, Middle, Early Modern","LA, EN, GA",Prose,Annals,History,—,Unknown,https://celt.ucc.ie//published/G100004.html,—,G100004\nAnnals of Inisfallen: Author:\tunknow...,Annals of Inisfallen: unknown Prima feria\nCo...,annals of inisfallen unknown prima feria conu...
4,"Annals of Inisfallen, Pre-Patrician section",,12450,7342,3199,AD 550-1326,Middle,"LA, EN, GA",Prose,Annals,History,—,Unknown,https://celt.ucc.ie//published/G100004P.html,—,"G100004P\nAnnals of Inisfallen, Pre-Patrician ...",Annals of Inisfallen Pre-Patrician Section: u...,annals of inisfallen pre-patrician section un...


In [124]:
data.tail()

Unnamed: 0,Title,Editor,Size (words),Real size (tokens),Unique forms,Dates,Irish,Other languages,Form,Genre,Subgenre,Group,Author,Link,Comment,Raw Text,Text,Preprocessed Text
478,A Treatise on Fevers,Beatrix Färber,19540,12946,2017,–,Early Modern,"LA, AR, GA, EN",Prose,Tract,–,—,Unknown,https://celt.ucc.ie//published/G600020.html,—,G600020\nA Treatise on Fevers: Author:\tUnknow...,A Treatise on Fevers: List of witnesses A: Du...,a treatise on fevers list of witnesses a dubl...
479,De Dosibus medicinarum,Shawn Sheahan,19950,12241,2784,–,"Old, Middle","LA, EN, GR, GA",Prose,Tract,–,—,Walter de agilon/galterius agilinus,https://celt.ucc.ie//published/G600021.html,—,G600021\nAn Irish Version of Gualterus de Dosi...,An Irish Version of Gualterus de Dosibus: Wal...,an irish version of gualterus de dosibus walt...
480,An Irish Astronomical Tract,"Beatrix Färber, Ruth Murphy",98800,79748,6417,c.1300–1350,,"LA, GA, AR, FR, EN",Prose,Tract,–,—,Unknown,https://celt.ucc.ie//published/G600030.html,—,G600030\nAn Irish Astronomical Tract: Author:\...,An Irish Astronomical Tract: Acknowlegements ...,an irish astronomical tract acknowlegements c...
481,The Meaning of Birthdays,Beatrix Färber,1135,559,254,c.1000–1300,Middle,"EN, GA",Prose,Tract,–,—,Unknown,https://celt.ucc.ie//published/G602001.html,—,G602001\nThe Meaning of Birthdays: Author:\t[u...,The Meaning of Birthdays: List of witnesses :...,the meaning of birthdays list of witnesses tr...
482,Ranna an aeir,A. O. Anderson,5650,4411,1382,c.1500–1550,Early Modern,"LA, EN, GA",Prose,Tract,–,—,Unknown,https://celt.ucc.ie//published/G602002.html,—,G602002\nRanna an aeir: Author:\t[unknown]\n1[...,Ranna an aeir:1 Fiarfaighthear and so ca lin ...,ranna an aeir 1 fiarfaighthear and so ca lin ...


In [125]:
save_data(data, "crawled_celt")

In [None]:
import os

# Write every cleaned text to its own file, named after the title. Rows that
# share a title end up in the same file (the later one wins).
os.makedirs("./crawled_celt", exist_ok=True)
for title, text in zip(data['Title'], data['Text']):
    # A "/" in a title would be read as a directory separator.
    fname = title.strip('?').replace('/', '_') + ".txt"
    with open("./crawled_celt/" + fname, "w", encoding="utf-8") as f:
        f.write(text)

## Tests

In [None]:
str(download_page('https://celt.ucc.ie//published/G201014.html').title.text)

In [None]:
# Manual check of the metadata extraction on a single page.
link = 'https://celt.ucc.ie//published/G402111.html'
name = links[link]
page = download_page(link)
# NOTE(review): at module level this rebinds editor, author, words, dates,
# irish and langs - the compiled patterns find_info() relies on - to plain
# strings. Any later call to find_info() fails until the cell defining the
# patterns is run again.
editor, author, words, dates, irish, langs = find_info(str(page))

In [None]:
print(name, editor, author, words, dates, irish, langs, sep="\n")

In [None]:
raw_text = get_text(page)
text = clean(raw_text)

In [None]:
raw_text[:500]

In [None]:
text[:500]

In [None]:
preprocessed, tokens, unique = preprocess_and_count(text)

In [None]:
tokens

In [None]:
unique

In [None]:
preprocessed[:500]

In [None]:
# One dummy value per %s placeholder of entry_pattern, in placeholder order,
# so that each key of the resulting entry shows its own field name.
params = ("title", "editor", "words", "tokens", "unique", "dates", "irish", "langs",
          "form", "genre", "subgenre", "author", "link")
json_string = entry_pattern % params

In [None]:
# json is not imported anywhere else in the notebook.
import json

# The filled-in template must parse as JSON.
json.loads(json_string)

In [None]:
test = """
<h4>Canonical References</h4>
<p>This text uses the <tt>DIV1</tt> element to represent the Year. <p>Refs: EVENT (<tt>&lt;DIV2&gt;</tt>)<p>This text uses the <tt>DIV1</tt> element to represent the Year. <h3>Profile Description</h3>
<p>Created: By mostly unknown authors in Irish monastic scriptoria
and medieval Irish clerics.
 Date range: c. 550-c.1588, various and mostly unknown.<h4>Use of language</h4>
<h5>Language: [LA] A large portion of the earlier part of the annals is in Latin.</h5>
<h5>Language: [GA] Most of the later annals are in Old, Middle, and Early Modern Irish with very many Latin words and formulae.</h5>
<h5>Language: [EN] Witness-list in the front matter is in English.</h5>
<h4>List of hands</h4>
<p><b>H</b> [main] Ruaidhri &Oacute; Luin&iacute;n</p><p><b>H1</b> [interpolator/glossator] unknown</p><p><b>H2</b> [interpolator/glossator] Cathal Mac Maghnusa</p><p><b>H3</b> [interpolator/glossator] unknown</p><p><b>H4</b> [interpolator/glossator] unknown</p><p><b>LATE</b> [otherwise unidentified interpolator/glossator (vols 2-3)] unknown</p><h3>Revision History</h3>
"""

langs.findall(test)