In [28]:
import os
import pickle as pkl
import re
import time

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from tqdm import tqdm
from urllib3.exceptions import MaxRetryError, NewConnectionError

In [12]:
def siteopen(url):
    """Download ``url`` and return its HTML parsed by BeautifulSoup.

    The response text is parsed with the "html.parser" backend.

    Returns the resulting BeautifulSoup object, or None when the download
    fails with one of the connection-related errors listed below; the error
    itself is swallowed silently, so callers have to check for None.
    """
    try:
        response = requests.get(url)
        return BeautifulSoup(response.text, "html.parser")
    except (ConnectionError, MaxRetryError, NewConnectionError, TimeoutError):
        # Best effort: an unreachable page yields None instead of an exception.
        return None

In [3]:
"""Opens the web page entered and returns parsed HTML code."""

def sitetext(url):
    """Download ``url`` and return the response body as a plain string.

    No parsing is done and no exception is caught here, so any error raised
    by ``requests.get`` propagates to the caller.
    """
    return requests.get(url).text

In [4]:
"""Finds url beginning from entered url, combines it with href to make full hyperlink to new page"""

def combinelink(url, ref):
    """Build a full hyperlink from the site root of ``url`` and ``ref``.

    The site root is everything in ``url`` up to and including the first
    occurrence of a dot, optional word characters and a slash (for example
    ".ie/"). ``ref`` is appended to that root unchanged.

    Returns the combined link, or the string "Unable to find website." when
    ``url`` contains no such pattern.
    """
    root_end = re.search(r"\.[\w]*/", url)
    if root_end is None:
        return "Unable to find website."
    return url[:root_end.end()] + ref

In [5]:
"""Finds the href for each link on a page"""

def findrefs(url):
    """Return the href of every ``<a href=...>`` element on the page at ``url``.

    The page is opened with ``siteopen``. Anchors whose href is empty are
    treated as broken links where the href was entered without an "=" sign:
    the raw page source is searched for the link text, and the text between
    the last 'href"' before it and the last ".html" is used as the href.
    When that repair is not possible the href stays "".

    Returns a list of href strings in document order; the list is empty when
    the page could not be opened.
    """
    html = siteopen(url)
    if html is None:
        # siteopen returns None when the page is unreachable.
        return []
    reflist = []
    rawsite = None  # raw page source, downloaded at most once and only if needed
    for link in html.findAll('a', href=True):
        ref = link.get('href')
        if ref == "":
            # The parsed link text contains "á" where the raw source has the entity.
            title = link.text.replace("á", "&aacute;")
            if rawsite is None:
                rawsite = sitetext(url)
            titleplace = rawsite.find(title)
            if titleplace != -1:
                searchstring = rawsite[:titleplace + len(title)]
                marker = searchstring.rfind('href"')
                endoflink = searchstring.rfind(".html")
                # Only slice when both delimiters were really found; a -1 from
                # rfind would otherwise produce an arbitrary fragment.
                if marker != -1 and endoflink != -1:
                    ref = searchstring[marker + 5:endoflink + 5]
        reflist.append(ref)
    return reflist

In [6]:
"""Finds all hyperlinks on main Irish page of CELT.
   Separates Irish texts from translations and non-text links."""

def textlinks(url):
    """Sort every link found on the page at ``url`` into four groups.

    The hrefs come from ``findrefs``. A href is classified by its first
    eleven characters:

    * "published/G" - Irish texts
    * "published/T" - English translations
    * "published/F" - French translations

    Everything else counts as a non-text link. Hrefs starting with "http"
    are kept as they are; all other hrefs are turned into full links with
    ``combinelink``.

    Returns ``[ga_links, eng_links, fr_links, otherlinks]``.
    """
    by_prefix = {"published/G": [], "published/T": [], "published/F": []}
    otherlinks = []
    for ref in findrefs(url):
        prefix = ref[0:11]
        if prefix in by_prefix:
            by_prefix[prefix].append(combinelink(url, ref))
        elif ref[:4] == "http":
            # Already absolute, keep untouched.
            otherlinks.append(ref)
        else:
            # Includes empty (broken) hrefs, which combine to the bare site root.
            otherlinks.append(combinelink(url, ref))
    return [by_prefix["published/G"], by_prefix["published/T"], by_prefix["published/F"], otherlinks]


In [17]:
"""Opens links in a list and retrieves information 
(Title, Author, Irish Form(s), Other Languages, Text-Link)"""

def _irish_forms(irishinfo):
    """Return the forms of Irish mentioned in the description ``irishinfo``."""
    irishtypes = []
    if "Old" in irishinfo:
        irishtypes.append("Old Irish")
    if "Middle" in irishinfo:
        irishtypes.append("Middle Irish")
    emi_list = ["Early Modern", "Early modern", "(Early) Modern", "Early-Modern", "early Modern",
                "Classical Modern"]
    if any(variant in irishinfo for variant in emi_list):
        irishtypes.append("Early Modern Irish")
    if not irishtypes:
        irishtypes.append("not specified")
    return irishtypes


def textinfo(urllist):
    """Collect title, author and language information for a list of texts.

    Every link in ``urllist`` is expected to contain "/index.html". For such
    a link the sibling "/header.html" page is opened with ``siteopen`` and
    its h1 (title), h2 (author) and h5 (language) elements are read. The
    main text link (root + ".html") is printed as a progress message, and
    the function sleeps two seconds after every page request.

    Links without "/index.html" and header pages that cannot be opened are
    skipped and reported with a printed message at the end.

    Returns a list with one list per text that was read. Each inner list
    holds the main text link, then one "Text: ..." item per h1, one
    "Author: ..." item per h2, one "Form(s) of Irish: [...]" item per Irish
    language header, and finally either the list of other language tags or
    the string "None".
    """
    textsinfo = []
    no_baselink = []
    unreachable = []
    infotag = "/header.html"
    maintag = ".html"
    basetag = "/index.html"
    authunknowns = ["unknown", "Unknown", "[unknown]"]
    for link in urllist:
        # Finds any base links which do not conform to the expected format.
        # Without this skip there is no header link to open for the entry.
        if basetag not in link:
            no_baselink.append(link)
            continue
        linkroot = link[:link.find(basetag)]
        infolink = linkroot + infotag
        mainlink = linkroot + maintag
        textdata = [mainlink]
        print(mainlink)
        infohtml = siteopen(infolink)
        if infohtml is None:
            # siteopen returns None for unreachable pages; skip the entry
            # rather than store one with missing fields.
            unreachable.append(infolink)
            time.sleep(2)
            continue
        # Title: the fixed slice drops the 19-character opening tag and "</h1>".
        for title in infohtml.findAll("h1"):
            textname = str(title)[19:-5]
            changeables = {"f�n": "fün"}
            for item, fixed in changeables.items():
                textname = textname.replace(item, fixed, 1)
            for item in ["<u>", "</u>", "\n"]:
                textname = textname.replace(item, "")
            textdata.append("Text: " + textname)
        # Author: the fixed slice drops the first 27 characters and "</h2>".
        for author in infohtml.findAll("h2"):
            authorname = str(author)[27:-5]
            if authorname in authunknowns:
                textdata.append("Author: Unknown")
            else:
                # NOTE(review): only the first occurrence of each character is
                # removed (unlike the title clean-up); kept as in the original.
                for item in ["[", "]", "\n"]:
                    authorname = authorname.replace(item, "", 1)
                textdata.append("Author: " + authorname)
        # Languages: Irish headers describe the form(s) of Irish, all other
        # language headers contribute a two-letter tag.
        langtags = []
        for header in infohtml.findAll("h5"):
            headerstring = str(header)
            if "Language: [GA]" in headerstring:
                # Expected format: the description follows inside the header.
                irishtypes = _irish_forms(headerstring[19:-5])
                textdata.append("Form(s) of Irish: " + str(irishtypes))
            elif "Language: GA" in headerstring:
                # Unexpected format: the description is taken from the page
                # source after the header, up to the next "</p>".
                infohtmlstring = str(infohtml)
                headerend = infohtmlstring.find(headerstring) + len(headerstring)
                splitstring = infohtmlstring[headerend:]
                teststring = splitstring[4:splitstring.find("</p>")]
                irishtypes = _irish_forms(teststring.replace("\n", ""))
                textdata.append("Form(s) of Irish: " + str(irishtypes))
            elif "Language: [" in headerstring:
                langtags.append(headerstring[15:17])
            elif "Language: " in headerstring:
                langtags.append(headerstring[14:16])
        textdata.append(langtags if langtags else "None")
        textsinfo.append(textdata)
        time.sleep(2)
    # Prints an error message if any base links are in an unexpected format.
    if no_baselink:
        print("The following links are broken: " + str(no_baselink))
    if unreachable:
        print("The following header pages could not be opened: " + str(unreachable))
    return textsinfo

In [8]:
""""Counts (Irish) texts on main page of CELT, translations, Categories, 
Subcategories, subcategories per category and lists them all."""

def getstats(url):
    """Count texts, translations, categories and subcategories on a page.

    The page at ``url`` is opened with ``siteopen``. Anchors are counted by
    the "published/G", "published/T" and "published/F" markers found in the
    anchor's HTML; anchors with an empty href count as broken. Every h4 is a
    category. Every p element whose text does not contain "Corpus of
    Electronic Texts" is a subcategory.

    Subcategories are assigned to a category by looking at the page source
    between the category heading and the next category heading and counting
    the "<p>...</p>" contents that equal a subcategory name.

    Returns a list of printable lines: eight summary lines followed by every
    category and, where it has any, its subcategory count and subcategories.
    Returns an empty list when the page could not be opened.
    """
    html = siteopen(url)
    if html is None:
        # siteopen returns None for unreachable pages.
        return []
    pagestats = []
    textcount = transcount = engcount = frcount = breakcount = 0
    # Finds and counts texts and translations.
    for link in html.findAll('a', href=True):
        linktext = str(link)
        if link.get("href") == "":
            breakcount += 1
        elif "published/G" in linktext:
            textcount += 1
        elif "published/T" in linktext:
            transcount += 1
            engcount += 1
        elif "published/F" in linktext:
            transcount += 1
            frcount += 1
    # Finds and counts categories.
    catnames = [str(category.text) for category in html.find_all('h4')]
    categories = ["   " + str(number) + ". " + catname
                  for number, catname in enumerate(catnames, 1)]
    catcount = len(catnames)
    # Finds and counts subcategories.
    subcategories = []
    subcatnames = []
    for subcat in html.find_all('p'):
        subcatname = str(subcat.text)
        if "Corpus of Electronic Texts" in subcatname:
            continue
        subcatnames.append(subcatname)
        flattened = subcatname.replace("\r", "").replace("\n", " ")
        subcategories.append("      " + str(len(subcatnames)) + ". " + flattened)
    subcatcount = len(subcatnames)
    # Finds and counts subcategories per category. The counts are kept in a
    # list parallel to catnames, so the result does not depend on how a name
    # is escaped or on names being unique.
    pagesource = str(html)
    # The serialised page has "&amp;" where the parsed heading text has "&".
    escapednames = [catname.replace("&", "&amp;") for catname in catnames]
    subcatcounts = []
    for position, escapedname in enumerate(escapednames):
        startpos = pagesource.find(escapedname)
        if startpos == -1:
            # Heading not found in the page source: nothing to count.
            subcatcounts.append(0)
            continue
        endpos = None
        if position + 1 < len(escapednames):
            endpos = pagesource.find(escapednames[position + 1])
            if endpos == -1:
                endpos = None
        section = pagesource[startpos:endpos]
        subcatchecks = []
        for _ in range(section.count("<p>")):
            beginpos = section.find("<p>") + 3
            closepos = section.find("</p>")
            subcatchecks.append(section[beginpos:closepos])
            # Cut the paragraph just read out of the section.
            section = section[:beginpos - 3] + section[closepos + 4:]
        subcatcounts.append(sum(1 for subcatcheck in subcatchecks if subcatcheck in subcatnames))
    totaltcount = textcount + transcount + breakcount
    pagestats.append("Number of Texts (total): " + str(totaltcount))
    pagestats.append("Number of Texts (links working): " + str(textcount))
    pagestats.append("Number of Texts (links broken): " + str(breakcount))
    pagestats.append("Number of Translations: " + str(transcount))
    pagestats.append("   English Translations: " + str(engcount))
    pagestats.append("   French Translations: " + str(frcount))
    pagestats.append("Number of Categories: " + str(catcount))
    pagestats.append("Number of Subcategories: " + str(subcatcount))
    # Appends categories and, where applicable, their subcategories in order.
    # Subcategories are consumed from the front of the list in page order.
    remaining = list(subcategories)
    for cat, catsubcats in zip(categories, subcatcounts):
        pagestats.append(cat)
        if catsubcats > 0:
            pagestats.append("      Subcategories: " + str(catsubcats))
            pagestats.extend(remaining[:catsubcats])
            del remaining[:catsubcats]
    return pagestats

In [9]:
""""Counts crawled texts by authors, language etc. and lists them all."""

def getmorestats(crawldata):
    """Summarise crawled entries by author, form of Irish and other languages.

    Each entry of ``crawldata`` is read by position: item 1 is the
    "Text: ..." string, item 2 the "Author: ..." string, item 3 the
    "Form(s) of Irish: ..." string and item 4 either the string "None" or a
    collection of language tags.

    Returns a list of eight printable summary lines. The per-group title
    lists built along the way are not part of the result.
    """
    distinct_authors = []
    titles_known_author = []
    titles_unknown_author = []
    titles_by_form = {"Old Irish": [], "Middle Irish": [], "Early Modern Irish": []}
    titles_irish_only = []
    distinct_langs = []
    for record in crawldata:
        title = record[1][6:]  # drop the "Text: " prefix
        author_field = record[2]
        if author_field == "Author: Unknown":
            titles_unknown_author.append(title)
        else:
            titles_known_author.append(title)
            author = author_field[8:]  # drop the "Author: " prefix
            if author not in distinct_authors:
                distinct_authors.append(author)
        forms_field = record[3]
        for form, titles in titles_by_form.items():
            if form in forms_field:
                titles.append(title)
        langs_field = record[4]
        if langs_field == "None":
            titles_irish_only.append(title)
        else:
            for tag in langs_field:
                if tag not in distinct_langs:
                    distinct_langs.append(tag)
    return [
        "Number of Authors: " + str(len(distinct_authors)),
        "Number of Texts (author known): " + str(len(titles_known_author)),
        "Number of Texts (author unknown): " + str(len(titles_unknown_author)),
        "Number of Texts (Old Irish): " + str(len(titles_by_form["Old Irish"])),
        "Number of Texts (Middle Irish): " + str(len(titles_by_form["Middle Irish"])),
        "Number of Texts (Early Modern Irish): " + str(len(titles_by_form["Early Modern Irish"])),
        "Number of Texts (Irish Only): " + str(len(titles_irish_only)),
        "Number of Alternative Languages: " + str(len(distinct_langs)),
    ]

In [10]:
def celtcrawl(url):
    """Crawl the Irish texts linked from the page at ``url``.

    Only the first group returned by ``textlinks`` (the Irish text links) is
    passed on to ``textinfo``; translations and other links are ignored.

    Returns whatever ``textinfo`` returns for those links.
    """
    irish_links = textlinks(url)[0]
    return textinfo(irish_links)

In [18]:
# Run the crawl on the CELT Irish page; %time reports how long the statement took.
%time information = celtcrawl('http://celt.ucc.ie/irlpage.html')

http://celt.ucc.ie/published/G100001A.html
http://celt.ucc.ie/published/G100001B.html
http://celt.ucc.ie/published/G100002.html
http://celt.ucc.ie/published/G100004.html
http://celt.ucc.ie/published/G100004P.html
http://celt.ucc.ie/published/G100005A.html
http://celt.ucc.ie/published/G100005B.html
http://celt.ucc.ie/published/G100005C.html
http://celt.ucc.ie/published/G100005D.html
http://celt.ucc.ie/published/G100005E.html
http://celt.ucc.ie/published/G100005F.html
http://celt.ucc.ie/published/G100010A.html
http://celt.ucc.ie/published/G100011.html
http://celt.ucc.ie/published/G100013.html
http://celt.ucc.ie/published/G100014.html
http://celt.ucc.ie/published/G100015.html
http://celt.ucc.ie/published/G100016.html
http://celt.ucc.ie/published/G100017.html
http://celt.ucc.ie/published/G100018.html
http://celt.ucc.ie/published/G100019.html
http://celt.ucc.ie/published/G100020.html
http://celt.ucc.ie/published/G100021.html
http://celt.ucc.ie/published/G100022.html
http://celt.ucc.ie/publi

http://celt.ucc.ie/published/G600030.html
http://celt.ucc.ie/published/G600005.html
http://celt.ucc.ie/published/G600005.html
http://celt.ucc.ie/published/G600008.html
http://celt.ucc.ie/published/G600009A.html
http://celt.ucc.ie/published/G600009B.html
http://celt.ucc.ie/published/G600009C.html
http://celt.ucc.ie/published/G600010.html
http://celt.ucc.ie/published/G600011.html
http://celt.ucc.ie/published/G600012.html
http://celt.ucc.ie/published/G600013.html
http://celt.ucc.ie/published/G600014.html
http://celt.ucc.ie/published/G600015.html
http://celt.ucc.ie/published/G600016.html
http://celt.ucc.ie/published/G600018.html
http://celt.ucc.ie/published/G600019.html
http://celt.ucc.ie/published/G600020.html
http://celt.ucc.ie/published/G600021.html
http://celt.ucc.ie/published/G500001.html
http://celt.ucc.ie/published/G502003.html
http://celt.ucc.ie/published/G502006.html
http://celt.ucc.ie/published/G502007.html
http://celt.ucc.ie/published/G503000.html
http://celt.ucc.ie/published/G5

In [19]:
# Inspect the container type returned by celtcrawl.
type(information)

list

In [20]:
# Number of entries collected by the crawl.
len(information)

223

In [21]:
# Print the summary lines computed from the crawled entries, one per line.
for stat in getmorestats(information):
    print(stat)

Number of Authors: 18
Number of Texts (author known): 19
Number of Texts (author unknown): 204
Number of Texts (Old Irish): 55
Number of Texts (Middle Irish): 129
Number of Texts (Early Modern Irish): 61
Number of Texts (Irish Only): 17
Number of Alternative Languages: 17


In [23]:
# Spot check: show the first crawled entry.
information[0]

['http://celt.ucc.ie/published/G100001A.html',
 'Text: The Annals of Ulster',
 'Author: Unknown',
 "Form(s) of Irish: ['Old Irish', 'Middle Irish', 'Early Modern Irish']",
 ['LA', 'EN']]

In [34]:
def get_text(link):
    """Return the visible text of the page at ``link``.

    The page is opened with ``siteopen``. When calling ``get_text`` on the
    result raises AttributeError (for example because ``siteopen`` returned
    None), the empty string is returned instead.
    """
    page = siteopen(link)
    try:
        return page.get_text()
    except AttributeError:
        return ""

In [35]:
# Download the page behind every entry's link and append its text to the entry,
# pausing two seconds between requests.
# NOTE: this mutates the entries in place, so running the cell a second time
# appends another text item to every entry.
for entry in information:
    text = get_text(entry[0])
    entry.append(text)
    time.sleep(2)

In [36]:
# Spot check: the first entry now ends with the downloaded page text.
information[0]

['http://celt.ucc.ie/published/G100001A.html',
 'Text: The Annals of Ulster',
 'Author: Unknown',
 "Form(s) of Irish: ['Old Irish', 'Middle Irish', 'Early Modern Irish']",
 ['LA', 'EN'],
 '\n\nThe Annals of Ulster\n\n\n<!-- // \n    function footNote(link) {\n\topenpopup = window.open(link,"openpopup","width=512,height=128,left=256,top=256,resizable=no,scrollbars=1,menubar=1,statusbar=0,toolbar=0");\n}\n// -->\n\n\n\nCorpus of Electronic Texts Edition\nBackground details and bibliographic information\nThe Annals of Ulster\nAuthor:\t[unknown]\nFile DescriptionElectronic edition compiled by Donnchadh Ó Corráin, Mavis CournaneFunded by University College, Cork andProfessor Marianne McDonald via the CELT Project. 3. Third draft, revised and corrected.Proof corrections by Donnchadh Ó Corráin, Mavis CournaneExtent of text: 118200 words; 2 volumesPublication\nCELT: Corpus of Electronic Texts: a project of University College CorkCollege Road, Cork, Ireland — http://www.ucc.ie/celt (2005)  (200

In [49]:
def preprocess(text):
    """Clean a crawled text with a fixed sequence of regex substitutions.

    The rules are applied one after another in the order listed, each on the
    output of the previous one, so their order matters.

    Returns the cleaned string.
    """
    rules = (
        (r"\nU\d+ ", r" "),  # removes error codes
        (r"G\d+ ", r""),  # removes text ids
        (r"\n\d+", r""),  # removes unknown digits
        (r"\n(\s*?)¶*(\d)+~*\]", r"\n"),  # removes line numbers
        # disabled rule: (r"\.[il]\.", r"ed ón") would change .i., .l. to 'ed ón'
        (r"(\[.*?\])|(Text\s.*?\n)|((\.|\s)[ivxUuírcfl]+?\.)|(\.|\s)[A-Z]\.|sic\."
         r"|(Incipit|Finit|FINIT|FINIS|Amen|anno\sdomini|dixit)[.,\n\s]*", r""),  # removes Latin & editor's comments
        # NOTE(review): ",-?" inside the class is a character range, so digits
        # and "/<=>" match as well as punctuation - confirm this is intended.
        (r"(\w|[.:;,-?!’])\s([.:;,-?!’])", r"\1\2"),  # removes whitespaces between words and punctuation
        (r"p\.((\s|\n)*)\d+(\s|\n)*?", r""),  # removes page numbers
        (r"\{(.+?)\}\s*?", r""),  # removes MS folios
        (r"(\.|[a-z])\d*(\n|\s)", r"\1\2"),  # removes footnote numbers
        # NOTE(review): only "^l" (an "l" at the very start) can match here;
        # "l^" never matches - confirm what was meant to be removed.
        (r"^l|l^", ""),  # removes these weird things
        (r"(\.|\.’|’|,)(‘|\w)", r"\1 \2"),  # adds necessary whitespaces
        (r"([.?!]’*)(\s|‘)", r"\1\n\2"),  # moves each sentence to a new line
        (r"(\w|,|&|-|\s|;)\n", r"\1 "),  # joins sentence parts that were on different lines
        (r"(\n|\s|-|‘)\s+(\w|&|[’'.:;,-?!]*|\n)", r"\1\2"),  # removes unnecessary whitespaces
        (r"~|§|\[|\]|\(|\)|\|", r""),  # removes unnecessary symbols
        (r"\s-(\w+)", r"-\1"),  # removes whitespaces before dashes
        (r"(\w+)-\s", r"\1-"),  # removes whitespaces after dashes
        (r"\t", " "),  # changes tabs to spaces
        (r" +", r" "),  # replaces multiple whitespaces with one
        (r" \. ", r" "),  # removes random stops
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def process_files(lst):
    """Append a cleaned version of each entry's last item to that entry.

    For an entry whose last item is not empty, everything up to and
    including the last "Corpus of Electronic Texts Edition: " is dropped and
    the rest is run through ``preprocess``. An empty last item yields an
    empty cleaned text.

    The entries are modified in place and the same list object is returned.
    """
    for record in lst:
        raw = record[-1]
        if raw == "":
            cleaned = ""
        else:
            body = raw.split("Corpus of Electronic Texts Edition: ")[-1]
            cleaned = preprocess(body)
        record.append(cleaned)
    return lst

In [52]:
# Clean every downloaded text. process_files returns the list it was given,
# so `data` and `information` refer to the same, modified entries.
%time data = process_files(information)

Wall time: 16.5 s


In [53]:
# Spot check: first 200 characters of the cleaned text of entry 25.
data[25][-1][:200]

'Egerton Annals: Mionannala: Author: Unknown.\nlá dana do Shuibne Mhenn ina ghilla óg ina thig féin ro búi acá rád re a mhnái: ‘Is ingnad lem’, ol sé, ‘a laighet ro gab Cinél n-Eogain tigernas for chách'

In [54]:
# Spot check: characters 300-500 of the cleaned text of entry 146.
data[146][-1][300:500]

'penta a righe do flaith nod-gebed beus: ‘Ind flaith si do-asselbtur duit a taspenad uait a n-ógi dom-biur duit cen gái cen ethech cen imarbae cen anfir flatha.\nGlinne aurut friss secht n-octhigerna & '

In [55]:
# Persist the crawled and cleaned entries so the crawl need not be repeated.
with open('crawled_celt.pkl', 'wb') as f:
    pkl.dump(data, f)

In [70]:
# Maps every path written by write_to_file to the link (entry[0]) of the entry
# stored there, so that a second text with the same title gets its own file.
_WRITTEN_FILES = {}


def write_to_file(entry):
    """Write the last item of ``entry`` to a text file in ./crawled_celt/.

    The file is named after the title in ``entry[1]`` (the part after
    "Text: ", with double quotes removed) plus ".txt", and the title is
    printed. The output directory is created when it does not exist.
    Characters that cannot be part of a file name are replaced by "_".

    When a file with the same name was already written for an entry with a
    different link (``entry[0]``), the last path component of that link,
    without its extension, is appended in parentheses, so texts that share a
    title no longer overwrite each other.
    """
    name = entry[1].split("Text: ")[-1].replace('"', '')
    print(name)
    outdir = './crawled_celt/'
    os.makedirs(outdir, exist_ok=True)
    # "/" is a path separator everywhere; Windows reserves further characters.
    reserved = '<>:"/\\|?*' if os.name == 'nt' else '/'
    stem = ''.join('_' if char in reserved else char for char in str(name))
    source = str(entry[0])
    source_id = source.rstrip('/').rsplit('/', 1)[-1].rsplit('.', 1)[0]
    source_id = ''.join('_' if char in reserved else char for char in source_id)
    path = outdir + stem + '.txt'
    attempt = 0
    # setdefault claims a free path for this source; a path already claimed by
    # another source is a title collision and needs a distinguishing suffix.
    while _WRITTEN_FILES.setdefault(path, source) != source:
        attempt += 1
        suffix = source_id if attempt == 1 else source_id + '-' + str(attempt)
        path = outdir + stem + ' (' + suffix + ').txt'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(entry[-1])

In [71]:
# Write the cleaned text of every entry to its own file; each title is printed.
for entry in data:
    write_to_file(entry)

The Annals of Ulster
Annala Uladh: Annals of Ulster otherwise Annala Senait, Annals of Senat
Annals of Tigernach
Annals of Inisfallen
Annals of Inisfallen, Pre-Patrician Section
Annals of the Four Masters
Annals of the Four Masters
Annals of the Four Masters
Annals of the Four Masters
Annals of the Four Masters
Annals of the Four Masters
Annals of Loch Cé A.D.1014-1590
Annála Connacht
Mac Carthaigh's Book
Mac Carthaigh's Book
Mac Carthaigh's Book
Chronicon Scotorum
Fragmentary Annals of Ireland
Ann[acute ]la Gearra as Proibhinse Ard Macha
Short Annals of Tirconaill
Short Annals of Fir Manach
Short Annals of Leinster
A Fragment of Irish Annals
Memoranda Gadelica
The circuit of Ireland by Muircheartach mac Néill
Egerton Annals: Mionannala
The Irish version of the Historia Britonum of Nennius
The Leabhar Oiris
Cathréim Cellacháin Caisil
Sex Aetates Mundi
Aimirgin Glúngel tuir tend
Tréide cétna labratar iarna genemain
Tréide cétna labratar iarna genemain
Orgguin trí mac Diarmata Mic Cerbai