In [1]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from tqdm import tqdm
from urllib3.exceptions import MaxRetryError, NewConnectionError

In [2]:
# Read one table-of-contents URL per line.  splitlines() copes with both
# "\n" and "\r\n" endings, and filtering on non-blank lines keeps the last
# link even when the file does not end with a newline (the previous
# split('\n')[:-1] silently dropped it in that case).
with open("csnag_contents_links.txt", "r", encoding='utf-8') as f:
    contents = [line.strip() for line in f.read().splitlines() if line.strip()]

In [3]:
# Sanity check: show the last link read from the contents file.
contents[-1]

'http://corpas.ria.ie/index.php?fsg_function=1&fsg_page=Z'

In [4]:
# Site root; the relative hrefs scraped from the result tables are appended to it.
base = "http://corpas.ria.ie/"

# Column-oriented accumulators: one independent list per metadata field,
# for books and for periodicals respectively.
metadata_columns = ("csg_id", "title", "author", "year", "publisher", "link")
books_dict = {column: [] for column in metadata_columns}
periodicals_dict = {column: [] for column in metadata_columns}

In [5]:
def download_page(link, timeout=30):
    """
    Downloads a webpage and parses it.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up.  Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The page parsed with "html.parser", or None when the request failed
        (a message is printed in that case).
    """
    try:
        response = requests.get(link, timeout=timeout)
    except (ConnectionError, MaxRetryError, NewConnectionError, TimeoutError,
            requests.exceptions.Timeout):
        # requests signals timeouts with its own Timeout class, which the
        # builtin TimeoutError does not cover.
        print("Connection Error")
        return None
    return BeautifulSoup(response.text, "html.parser")
    
def parse_contents(soup, coldict, table="results_table_books"):
    """
    Appends the metadata of every row of one results table to ``coldict``.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed contents page.  None (a failed download) is tolerated and
        leaves ``coldict`` untouched.
    coldict : dict of lists
        Accumulator with the keys "csg_id", "title", "author", "year",
        "publisher" and "link"; it is extended in place.
    table : str, optional
        ``id`` of the HTML table to read.

    Returns
    -------
    dict
        The same ``coldict`` object that was passed in.
    """
    if soup is None:
        return coldict
    results = soup.find(id=table)
    if results is None:
        # The page has no such table: nothing to add.
        return coldict

    # The first <tr> is skipped (presumably the header row).
    for tr in tqdm(results.find_all("tr")[1:]):
        first_cell = tr.td
        anchor = first_cell.a if first_cell is not None else None
        href = anchor.get("href") if anchor is not None else None
        cells = [td.text for td in tr.find_all("td")]
        if href is None or len(cells) != 4:
            # Rows without a link or without exactly four cells cannot be
            # turned into a record; report the first cell and move on.
            print(first_cell.text if first_cell is not None else "")
            continue

        # function=3 is rewritten to function=5 so the stored link addresses
        # the page later downloaded for the full text.
        l = base + href.replace("function=3", "function=5")
        t, a, y, p = cells
        # All six lists are extended together so they stay the same length.
        coldict["title"].append(t)
        coldict["author"].append(a)
        coldict["year"].append(y)
        coldict["publisher"].append(p)
        coldict["link"].append(l)
        coldict["csg_id"].append(l.split("=")[-1])
    return coldict

## Metadata from Text Archive tables of contents

In [6]:
# Contents pages that could not be downloaded; inspect or retry them afterwards.
failed_contents_links = []

# NOTE: parse_contents appends to the dicts, so re-running this cell without
# re-creating books_dict / periodicals_dict duplicates every record.
for link in contents:
    soup = download_page(link)
    if soup is None:
        # download_page already printed the error; remember the link instead
        # of handing None to the parser.
        failed_contents_links.append(link)
        continue
    books_dict = parse_contents(soup, books_dict)
    periodicals_dict = parse_contents(soup, periodicals_dict, table="results_table_periodicals")

100%|██████████| 72/72 [00:00<00:00, 19526.05it/s]
100%|██████████| 319/319 [00:00<00:00, 25884.25it/s]
100%|██████████| 51/51 [00:00<00:00, 18688.58it/s]
100%|██████████| 174/174 [00:00<00:00, 24638.23it/s]
100%|██████████| 107/107 [00:00<00:00, 23126.38it/s]
100%|██████████| 753/753 [00:00<00:00, 23610.19it/s]


A Chomharsain Éistigí
An Cleasaidhe


100%|██████████| 132/132 [00:00<00:00, 24576.00it/s]
100%|██████████| 192/192 [00:00<00:00, 25269.27it/s]


An Dochartach


100%|██████████| 45/45 [00:00<00:00, 18540.64it/s]
100%|██████████| 151/151 [00:00<00:00, 20154.01it/s]


An Eolaíocht : Páipéir Shamplacha Gnáthleibhéal agus Ardleibhéal An Teastas Sóisearach


100%|██████████| 51/51 [00:00<00:00, 23319.47it/s]
100%|██████████| 220/220 [00:00<00:00, 24655.24it/s]
100%|██████████| 26/26 [00:00<00:00, 18245.26it/s]
100%|██████████| 169/169 [00:00<00:00, 22207.38it/s]
100%|██████████| 3/3 [00:00<00:00, 5092.23it/s]
100%|██████████| 13/13 [00:00<00:00, 17634.53it/s]


An Ghaoth Aniar


100%|██████████| 35/35 [00:00<00:00, 22208.87it/s]
100%|██████████| 106/106 [00:00<00:00, 22286.64it/s]
100%|██████████| 1/1 [00:00<00:00, 3123.09it/s]
100%|██████████| 3/3 [00:00<00:00, 8283.68it/s]
100%|██████████| 1/1 [00:00<00:00, 3077.26it/s]
100%|██████████| 1/1 [00:00<00:00, 4293.04it/s]
100%|██████████| 39/39 [00:00<00:00, 21346.45it/s]
100%|██████████| 151/151 [00:00<00:00, 24102.44it/s]
100%|██████████| 62/62 [00:00<00:00, 13367.96it/s]
100%|██████████| 159/159 [00:00<00:00, 25149.69it/s]


A Mhuintir Dhú Chaocháin Labhraigí Feasta


100%|██████████| 14/14 [00:00<00:00, 20800.66it/s]
100%|██████████| 246/246 [00:00<00:00, 23962.44it/s]
100%|██████████| 19/19 [00:00<00:00, 18319.95it/s]
100%|██████████| 115/115 [00:00<00:00, 18790.95it/s]
100%|██████████| 41/41 [00:00<00:00, 22698.85it/s]
100%|██████████| 85/85 [00:00<00:00, 23630.66it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 3483.64it/s]
100%|██████████| 24/24 [00:00<00:00, 17604.63it/s]
100%|██████████| 67/67 [00:00<00:00, 19781.67it/s]
100%|██████████| 153/153 [00:00<00:00, 25159.90it/s]
100%|██████████| 495/495 [00:00<00:00, 23656.94it/s]
100%|██████████| 104/104 [00:00<00:00, 13193.23it/s]
100%|██████████| 400/400 [00:00<00:00, 13717.52it/s]


An tAmhrán Macarónach


100%|██████████| 9/9 [00:00<00:00, 13315.25it/s]
100%|██████████| 38/38 [00:00<00:00, 19330.93it/s]
100%|██████████| 1/1 [00:00<00:00, 5236.33it/s]
100%|██████████| 34/34 [00:00<00:00, 11992.80it/s]
100%|██████████| 7/7 [00:00<00:00, 12003.32it/s]
100%|██████████| 8/8 [00:00<00:00, 14382.53it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 2449.94it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 2626.36it/s]


In [7]:
# Build the books table from the scraped column lists and preview the first rows.
books_df = pd.DataFrame(books_dict)
books_df.head()

Unnamed: 0,csg_id,title,author,year,publisher,link
0,3663,Abhráin atá leagtha ar an Reachtabhrach,"De hÍde, Dubhghlas",1903,Gill agus a mhac,http://corpas.ria.ie/index.php?fsg_function=5&...
1,437,Abhráin Diadha Chúige Connacht I,In eagar ag Dubhglas De hÍde,1906,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...
2,438,Abhráin Diadha Chúige Connacht II,In eagar ag Dubhglas De hÍde,1906,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...
3,457,"Abhráin Ghaedhilge an Iarthair, an Chéad Chuid","Údair éagsúla, bailithe ag Mícheál Ó Tiománaidhe",1906,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...
4,439,Abhráin Grádh Chúige Connacht(Love Songs of Co...,In eagar ag Dubhglas De hÍde,1893,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...


In [8]:
# Number of book records collected.
len(books_df)

990

### Periodicals

A problem with periodicals is that there can be several authors in one issue, and these authors are only tagged within the text. Tags within texts are not parsed well with `bs4`, because they have values where normal tags have attributes, and there are no closing tags.

In [67]:
# Build the periodicals table from the scraped column lists and preview the last rows.
periodicals_df = pd.DataFrame(periodicals_dict)
periodicals_df.tail()

Unnamed: 0,csg_id,title,author,year,publisher,link
3897,4756,West Cork Proverbs.,Féach bailitheoir,1894,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...
3898,5316,Whereas go dtáinig san bhfoghmhar.,Ní fios,1899,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...
3899,2326,Words and music of Úir Chille Creagan,L.D. and J.Q.,1908,County Louth Archaeological and Historical Soc...,http://corpas.ria.ie/index.php?fsg_function=5&...
3900,4231,Yr Haul: Caerfyrddin. Adolygiad y Wasg.,"Gill, M. H.",1882,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...
3901,1765,Zabern agus na Vosges,An Buachaillín Buidhe,1914,An Claidheamh Soluis,http://corpas.ria.ie/index.php?fsg_function=5&...


In [68]:
# Number of periodical records collected.
len(periodicals_df)

3902

In [126]:
# Persist both metadata tables as tab-separated UTF-8 files, without the index column.
periodicals_df.to_csv("csg_periodicals_metadata.tsv", sep="\t", encoding="utf-8", index=False)
books_df.to_csv("csg_books_metadata.tsv", sep="\t", encoding="utf-8", index=False)

## Texts

In [13]:
def get_genre(soup, no_genre):
    """
    Returns the genre label stored as the first attribute name of the <c>
    tag inside the page's <textarea>.

    Parameters
    ----------
    soup : parsed page exposing ``textarea`` and ``find``.
    no_genre : list
        Receives the page title (text of the <h2> inside the element with
        id "fixed_title") whenever no genre can be read; an empty string is
        recorded if the title is missing as well.

    Returns
    -------
    str
        The genre label, or "" when the page carries none.
    """
    try:
        return list(soup.textarea.c.attrs.keys())[0]
    except (AttributeError, IndexError):
        # Look the title up defensively: a missing title element must not
        # raise a second error from inside this handler.
        title_box = soup.find(id="fixed_title")
        heading = getattr(title_box, "h2", None)
        no_genre.append(getattr(heading, "text", ""))
        return ""

def get_regex(html, regex, g=1):
    """
    Searches ``html`` with the compiled pattern ``regex`` and returns the
    requested capture group.

    ``g`` selects the group (1 by default).  When the pattern does not match,
    ``search`` yields None and the resulting AttributeError is turned into an
    empty string.
    """
    try:
        return regex.search(html).group(g)
    except AttributeError:
        return ""

def get_text(soup):
    """
    Returns the content of the page's <textarea> as one normalised line.

    The byte-order mark is removed, words hyphenated across a "\\r\\n" line
    break are re-joined, every run of whitespace becomes a single space and
    leading whitespace is dropped.
    """
    raw_text = soup.textarea.text
    # Literal substitutions need no regex.
    text = raw_text.replace("\ufeff", "")
    text = text.replace("-\r\n", "")
    # Raw strings: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"^\s+", "", text)
    return text


def download_page(link, timeout=30):
    """
    Downloads a text page, parses it and extracts its metadata fields.

    This definition rebinds the name ``download_page`` that an earlier cell
    defined with a single return value.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up.  Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    tuple or None
        ``(soup, penname, editor, genre)`` where ``soup`` is the page parsed
        with "html.parser" and the three strings are "" when the page lacks
        the field.  None is returned (and a message printed) when the
        request failed, so callers must check before unpacking.
    """
    try:
        response = requests.get(link, timeout=timeout)
    except (ConnectionError, MaxRetryError, NewConnectionError, TimeoutError,
            requests.exceptions.Timeout):
        # requests signals timeouts with its own Timeout class, which the
        # builtin TimeoutError does not cover.
        print("Connection Error")
        return None

    plain_text = response.text
    penname = get_regex(plain_text, re.compile("<dt>Pen Name</dt><dd>(.*?)</dd>"))
    editor = get_regex(plain_text, re.compile("<dt>Compiler/Editor</dt><dd>(.*?)</dd>"))
    # Genre is the text after "C " inside a <...> or {...} tag; group 2 is the label.
    genre = get_regex(plain_text, re.compile("(<|{)C (.+?)(>|})"), g=2)
    soup = BeautifulSoup(plain_text, "html.parser")
    return soup, penname, editor, genre

In [14]:
# Spot check of the Compiler/Editor pattern against a single live text page.
rtest = "http://corpas.ria.ie/index.php?fsg_function=5&fsg_id=3897"
source_code = requests.get(rtest)
plain_text = source_code.text
# get_regex returns "" when the pattern does not match.
e = get_regex(plain_text, re.compile("<dt>Compiler/Editor</dt><dd>(.*?)</dd>"))
print(e)

Ó Fiaich, Tomás


In [15]:
# test genre
test = requests.get("http://corpas.ria.ie/index.php?fsg_function=5&fsg_id=2370").text
e = get_regex(test, re.compile("(<|{)C (.+?)(>|})", re.DOTALL), g=2)
print(e)

Prós


In [17]:
# Links to the individual book text pages, taken from the books metadata table.
booklinks = books_df['link']
# The periodical links are not used below; left disabled.
# perlinks = periodicals_df['link']

In [18]:
# One independent accumulator per scraped field, plus the list of titles
# for which no genre could be found.
texts_books, genres_books, pennames_books, editors_books, no_genre = (
    [] for _ in range(5)
)

In [19]:
# Book pages that could not be downloaded; inspect or retry them afterwards.
failed_book_links = []

# NOTE: the result lists are appended to, so re-running this cell without
# re-creating them duplicates every entry.
for link in tqdm(booklinks):
    page = download_page(link)
    if page is None:
        # download_page returns None on a connection failure; unpacking it
        # would raise TypeError.  Record empty placeholders so the lists stay
        # aligned with the rows of books_df.
        failed_book_links.append(link)
        text, penname, editor, genre = "", "", "", ""
    else:
        soup, penname, editor, genre = page
        text = get_text(soup)
    # The genre comes from the regex in download_page; get_genre is not used here.
    texts_books.append(text)
    pennames_books.append(penname)
    editors_books.append(editor)
    genres_books.append(genre)

100%|██████████| 990/990 [04:58<00:00,  3.31it/s]


In [393]:
# Titles recorded by get_genre for pages without a genre tag.
no_genre

[]

In [20]:
# Copy the metadata table; the scraped columns are added to this copy.
full_books_df = books_df.copy()
full_books_df.head()

Unnamed: 0,csg_id,title,author,year,publisher,link
0,3663,Abhráin atá leagtha ar an Reachtabhrach,"De hÍde, Dubhghlas",1903,Gill agus a mhac,http://corpas.ria.ie/index.php?fsg_function=5&...
1,437,Abhráin Diadha Chúige Connacht I,In eagar ag Dubhglas De hÍde,1906,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...
2,438,Abhráin Diadha Chúige Connacht II,In eagar ag Dubhglas De hÍde,1906,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...
3,457,"Abhráin Ghaedhilge an Iarthair, an Chéad Chuid","Údair éagsúla, bailithe ag Mícheál Ó Tiománaidhe",1906,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...
4,439,Abhráin Grádh Chúige Connacht(Love Songs of Co...,In eagar ag Dubhglas De hÍde,1893,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...


In [21]:
# Add the scraped fields at fixed positions: pen_name and editor follow the
# author column, genre follows year, and text goes last.
# allow_duplicates is deliberately left at its default: the former positional
# True let a second run of this cell silently create duplicate columns,
# whereas now pandas raises instead.
full_books_df.insert(3, "pen_name", pennames_books)
full_books_df.insert(4, "editor", editors_books)
full_books_df.insert(6, "genre", genres_books)
full_books_df["text"] = texts_books

full_books_df.head()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,genre,publisher,link,text
0,3663,Abhráin atá leagtha ar an Reachtabhrach,"De hÍde, Dubhghlas","Craoibhín Aoibhinn, An",,1903,Prós,Gill agus a mhac,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN AN REACHTÚIRE. Nuair caithtear cloch i...
1,437,Abhráin Diadha Chúige Connacht I,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An (Bail. / Eag.)","De hÍde, Dubhglas",1906,Prós,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN DIADHA CHÚIGE CONNACHT. Is cráibhtheac...
2,438,Abhráin Diadha Chúige Connacht II,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An (Bail. / Eag.)","De hÍde, Dubhglas",1906,Prós,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,Ag so giota neamh-ghnáthach tá rud-beag cosmhú...
3,457,"Abhráin Ghaedhilge an Iarthair, an Chéad Chuid","Údair éagsúla, bailithe ag Mícheál Ó Tiománaidhe",,"Ó Tiománaidhe, Micheál",1906,Prós,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...,Reamhrádh. Ní raibh éan ríoghacht ar dhruim ua...
4,439,Abhráin Grádh Chúige Connacht(Love Songs of Co...,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An (Bail. / Eag.)","De hÍde, Dubhglas",1893,Filíocht,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,AN CEATRAMHADH CAIBIDIL ABHRÁIN GRÁDH. Tar éis...


In [24]:
# Keep only rows whose author value occurs more than 5 times, then count rows per author.
authors = full_books_df[full_books_df.groupby("author")["author"].transform('size') > 5]
authors.groupby("author").count()

Unnamed: 0_level_0,csg_id,title,pen_name,editor,year,genre,publisher,link,text
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,176,176,176,176,176,176,176,176,176
Anaithnid,171,171,171,171,171,171,171,171,171
"Céitinn, Seathrún",13,13,13,13,13,13,13,13,13
"Mac Cuarta, Séamas Dall",13,13,13,13,13,13,13,13,13
"Paodhar, Muiris",9,9,9,9,9,9,9,9,9
"Ua Laoghaire, Peadar, An tAthair",17,17,17,17,17,17,17,17,17
"Ó Conaire, Pádraic",6,6,6,6,6,6,6,6,6
"Ó Longáin, Micheál Óg",6,6,6,6,6,6,6,6,6
"Ó Neachtain, Seán",8,8,8,8,8,8,8,8,8
"Ó Neachtain, Tadhg",7,7,7,7,7,7,7,7,7


In [25]:
# Row counts per genre label.
full_books_df.groupby("genre").count()

Unnamed: 0_level_0,csg_id,title,author,pen_name,editor,year,publisher,link,text
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,20,20,20,20,20,20,20,20,20
Béal.,1,1,1,1,1,1,1,1,1
FIl.,1,1,1,1,1,1,1,1,1
Fil.,406,406,406,406,406,406,406,406,406
Filíocht,9,9,9,9,9,9,9,9,9
M,1,1,1,1,1,1,1,1,1
NChóir.,1,1,1,1,1,1,1,1,1
PRÓS,2,2,2,2,2,2,2,2,2
Prós,548,548,548,548,548,548,548,548,548
Prós.,1,1,1,1,1,1,1,1,1


In [26]:
# Keep only rows whose editor value occurs more than 3 times, then count rows per editor.
eds = full_books_df[full_books_df.groupby("editor")["editor"].transform('size') > 3]
eds.groupby("editor").count()

Unnamed: 0_level_0,csg_id,title,author,pen_name,year,genre,publisher,link,text
editor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,269,269,269,269,269,269,269,269,269
An Craoibhín,5,5,5,5,5,5,5,5,5
An Seabhac,4,4,4,4,4,4,4,4,4
"Borthwick, Norma",4,4,4,4,4,4,4,4,4
"Breathnach, Pól",5,5,5,5,5,5,5,5,5
"Breatnach, P.A.",17,17,17,17,17,17,17,17,17
"Buttimer, C. G.",5,5,5,5,5,5,5,5,5
"Carney, James",4,4,4,4,4,4,4,4,4
"De hÍde, Dubhglas",5,5,5,5,5,5,5,5,5
"Hyde, Douglas",4,4,4,4,4,4,4,4,4


In [27]:
# Keep only rows whose pen name occurs more than once, then count rows per pen name.
pens = full_books_df[full_books_df.groupby("pen_name")["pen_name"].transform('size') > 1]
pens.groupby("pen_name").count()

Unnamed: 0_level_0,csg_id,title,author,editor,year,genre,publisher,link,text
pen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,831,831,831,831,831,831,831,831,831
"Bhn Smmbhbc / Seabhac, An / Sigfrid",3,3,3,3,3,3,3,3,3
Caoilte Mac Ronáin,3,3,3,3,3,3,3,3,3
Caoilte na gCos / Cois Leasa,9,9,9,9,9,9,9,9,9
Cara na nUghdar / Kevin O'Kennedy,2,2,2,2,2,2,2,2,2
Carraig Áine,4,4,4,4,4,4,4,4,4
Cath Muige Mucrime,18,18,18,18,18,18,18,18,18
Cois na Teineadh / Gruagach an Tobair,3,3,3,3,3,3,3,3,3
Colm Ó Conaire / Cuimín Ó Cualáin / Laeg,4,4,4,4,4,4,4,4,4
Conn,2,2,2,2,2,2,2,2,2


In [28]:
# Row counts per genre label.
full_books_df.groupby("genre").count()

Unnamed: 0_level_0,csg_id,title,author,pen_name,editor,year,publisher,link,text
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,20,20,20,20,20,20,20,20,20
Béal.,1,1,1,1,1,1,1,1,1
FIl.,1,1,1,1,1,1,1,1,1
Fil.,406,406,406,406,406,406,406,406,406
Filíocht,9,9,9,9,9,9,9,9,9
M,1,1,1,1,1,1,1,1,1
NChóir.,1,1,1,1,1,1,1,1,1
PRÓS,2,2,2,2,2,2,2,2,2
Prós,548,548,548,548,548,548,548,548,548
Prós.,1,1,1,1,1,1,1,1,1


In [29]:
# Inspect the rows whose genre label is "M".
full_books_df[full_books_df["genre"] == "M"]

Unnamed: 0,csg_id,title,author,pen_name,editor,year,genre,publisher,link,text
214,3134,Cosa Buidhe Arda I,"Mac Coluim, Fionán",Droigheantóir / Fínghín na Leamhna / Gio,"Mac Coluim, Fionán",1914,M,Fallamhain / Oifig an Lóchrainn,http://corpas.ria.ie/index.php?fsg_function=5&...,"Cosa buidhe árda, árda, Cosa buidhe árda, dear..."


In [30]:
# Map every raw genre spelling to one of three English labels.  None of the
# targets is itself a key, so a single dict-based replace is equivalent to
# replacing the labels one after another.
genre_labels = {
    "M": "poetry",
    "Filíocht": "poetry",
    "Fil.": "poetry",
    "FIl.": "poetry",
    "NChóir.": "prose",
    "Prós.": "prose",
    "Prós": "prose",
    "PRÓS": "prose",
    "Béal.": "folklore",
}
# Results are assigned back instead of using inplace=True on a selected
# column: that chained form does not update the frame under pandas
# Copy-on-Write.
full_books_df["genre"] = full_books_df["genre"].replace(genre_labels)
full_books_df["editor"] = full_books_df["editor"].replace("A. Ó D", "A. Ó D.")

# Cells consisting solely of an "unknown" marker become empty, in every column.
full_books_df = full_books_df.replace(["Ní fios", "Anaithnid"], "")

# Drop the role suffixes from pen names, then trim the whitespace that the
# removal leaves behind so identical names group together.
for role_suffix in ("(Bail. / Eag.)", "(Aistr.)"):
    full_books_df["pen_name"] = full_books_df["pen_name"].str.replace(role_suffix, "", regex=False)
full_books_df["pen_name"] = full_books_df["pen_name"].str.strip()

In [31]:
# Row counts per genre label.
full_books_df.groupby("genre").count()

Unnamed: 0_level_0,csg_id,title,author,pen_name,editor,year,publisher,link,text
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,20,20,20,20,20,20,20,20,20
folklore,1,1,1,1,1,1,1,1,1
poetry,417,417,417,417,417,417,417,417,417
prose,552,552,552,552,552,552,552,552,552


In [41]:
# Preview the first rows.
full_books_df.head()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,genre,publisher,link,text,tokens,types
0,3663,Abhráin atá leagtha ar an Reachtabhrach,"De hÍde, Dubhghlas","Craoibhín Aoibhinn, An",,1903,prose,Gill agus a mhac,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN AN REACHTÚIRE. Nuair caithtear cloch i...,48703,7588
1,437,Abhráin Diadha Chúige Connacht I,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1906,prose,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN DIADHA CHÚIGE CONNACHT. Is cráibhtheac...,51331,8141
2,438,Abhráin Diadha Chúige Connacht II,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1906,prose,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,Ag so giota neamh-ghnáthach tá rud-beag cosmhú...,50595,8119
3,457,"Abhráin Ghaedhilge an Iarthair, an Chéad Chuid","Údair éagsúla, bailithe ag Mícheál Ó Tiománaidhe",,"Ó Tiománaidhe, Micheál",1906,prose,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...,Reamhrádh. Ní raibh éan ríoghacht ar dhruim ua...,22526,4778
4,439,Abhráin Grádh Chúige Connacht(Love Songs of Co...,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1893,poetry,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,AN CEATRAMHADH CAIBIDIL ABHRÁIN GRÁDH. Tar éis...,22833,4604


In [60]:
# Row counts per year value, limited to the first 60 groups.
full_books_df.groupby("year").count().head(60)

Unnamed: 0_level_0,csg_id,title,author,pen_name,editor,genre,publisher,link,text,tokens,types
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,2,2,2,2,2,2,2,2,2,2,2
1581.0,1,1,1,1,1,1,1,1,1,1,1
1583.0,1,1,1,1,1,1,1,1,1,1,1
1593.0,1,1,1,1,1,1,1,1,1,1,1
1594.0,1,1,1,1,1,1,1,1,1,1,1
1600.0,14,14,14,14,14,14,14,14,14,14,14
1601.0,3,3,3,3,3,3,3,3,3,3,3
1602.0,3,3,3,3,3,3,3,3,3,3,3
1603.0,2,2,2,2,2,2,2,2,2,2,2
1604.0,2,2,2,2,2,2,2,2,2,2,2


### Tokens

In [37]:
from nltk import word_tokenize

# For every text record the number of tokens and the number of distinct
# tokens (types).  Texts are tokenized one at a time so only the counts are kept.
tokens, types = [], []
for words in map(word_tokenize, full_books_df["text"]):
    tokens.append(len(words))
    types.append(len(set(words)))

In [38]:
# Attach the per-text token and type counts as new columns.
full_books_df["tokens"] = tokens
full_books_df["types"] = types

In [39]:
# Summary statistics of the token counts.
full_books_df["tokens"].describe()

count       990.000000
mean      15263.395960
std       32310.377518
min           0.000000
25%         684.250000
50%        5113.000000
75%       17939.500000
max      640630.000000
Name: tokens, dtype: float64

In [40]:
# Summary statistics of the type counts.
full_books_df["types"].describe()

count      990.000000
mean      2633.163636
std       3324.700236
min          0.000000
25%        364.250000
50%       1541.000000
75%       3721.500000
max      44689.000000
Name: types, dtype: float64

In [44]:
# Rows whose text yielded no tokens at all.
full_books_df[full_books_df["tokens"] == 0]

Unnamed: 0,csg_id,title,author,pen_name,editor,year,genre,publisher,link,text,tokens,types
108,2473,Bethada Náem nÉrenn I,,,"Plummer, Charles",1620.0,,"(Oxford: Clarendon Press, 1922)",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
130,3498,An Cnámharlach,,,,,,,http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
368,2254,Eachtra Cléirigh na gCroiceann,,,,1907.0,,"(B.Á.C.: Connradh Chuilm Naomhtha, 1907)",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
390,2330,Elegy on the Death of the Rev. Edmond Kavanagh...,"Ó Leathlobhair, Séamas",,"O'Donovan, John",1856.0,,(B.Á.C.: Kilkenny and South-East of Ireland Ar...,http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
497,2417,"Introduction to the Irish Language 1, An","Neilson, William",,,1808.0,,"Wogan, P",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
498,2418,"Introduction to the Irish Language 2, An","Neilson, William",,,1808.0,,"Wogan, P",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
499,2419,"Introduction to the Irish Language 3, An","Neilson, William",,,1808.0,,"Wogan, P",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
641,3413,Oidhe Chloinne Tuireann(Fate of the Children o...,Féach bail. / eag.,,"O'Duffy, Richard 1888",1889.0,,"Gill, M. H. and Son",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
714,3497,Róisín Fiain na Mara,"Mac Clúin, An t-Ath. Seoirse",,,1924.0,,Brún agus Ó Nóláin Teor.,http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0
737,10,Saltair Mhuire,"Céitinn, Seathrún",,"Ó Foghludha, Riseárd",1908.0,,"(B.Á.C.: Dominican Fathers, 1908)",http://corpas.ria.ie/index.php?fsg_function=5&...,,0,0


In [73]:
# Number of rows in the full books table.
len(full_books_df)

990

In [75]:
def _century_label(year):
    """
    Returns the century of ``year`` as a string ("16" for 1500-1599, ...,
    "20" for 1900-1999), or "–" when the value is missing, not numeric or
    not a positive year.
    """
    try:
        year_number = int(year)
    except (ValueError, TypeError):
        return "–"
    if year_number <= 0:
        return "–"
    return str(year_number // 100 + 1)


# Exactly one label per row: the former chain of independent range checks
# appended nothing for years outside 1500-1999, which left the list shorter
# than the table.
century = [_century_label(y) for y in full_books_df["year"]]

In [77]:
# Insert the century labels as the column at position 6 and preview the last rows.
full_books_df.insert(6, "century", century)
full_books_df.tail()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,century,genre,publisher,link,text,tokens,types
985,2144,A West Limerick Anthology No.3 (Donncha Woulfe),"Woulfe, Donncha",,"de Bháll, Tomás",1824,19,poetry,"(B.Á.C.: Sign of the Three Candles, 1937)",http://corpas.ria.ie/index.php?fsg_function=5&...,Donochadh Woulfe cct. aig Freagra an Aithir Ui...,685,436
986,2145,A West Limerick Anthology No.4 (Séamas Ó Caoin...,"Ó Caoindealbháin, Séamas",,"de Bháll, Tomás",1854,19,poetry,"(B.Á.C.: Sign of the Three Candles, 1937)",http://corpas.ria.ie/index.php?fsg_function=5&...,Madan aoibhin aorach go seasgair is mé am aona...,732,483
987,2138,"Walter Luin, cct.",,,"Breathnach, Pol",1687,17,poetry,"(B.Á.C.: Sign of the Three Candles, 1932)",http://corpas.ria.ie/index.php?fsg_function=5&...,Ní leanbaidh Banbha le haltrom a glún mar dhea...,135,104
988,2334,Welcome to the Primate MacMahon Archbishop of ...,,,"Morris, H.",1739,18,poetry,(Dún Dealgan: Co. Louth Archaeological Society...,http://corpas.ria.ie/index.php?fsg_function=5&...,Fáilte Do'n tighearna ro-dhersgnaidh .i. Brian...,773,434
989,2141,What we know if Cúchoigríche Ó Cléirigh,,,"Breathnach, Pól",1664,17,poetry,"(B.Á.C.: Sign of the Three Candles, 1935)",http://corpas.ria.ie/index.php?fsg_function=5&...,Beannacht chugaibh a Chalbhaigh ós tú as umhla...,281,202


## Saving data

In [78]:
# Save full_books_df as a tab-separated UTF-8 file, without the index column.
full_books_df.to_csv("csg_books.tsv", sep="\t", encoding="utf-8", index=False)

In [None]:
# Write every non-empty text dated 1900-1999 to its own file, named after the title.
output_dir = "./texts/books/1900"
# open() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

for n, y, t in zip(full_books_df["title"], full_books_df["year"], full_books_df["text"]):
    if len(t) <= 1:
        continue
    try:
        year_number = int(y)
    except (ValueError, TypeError):
        # Missing or non-numeric year: the text cannot be assigned to this folder.
        continue
    # 1900 itself is included, matching the range the century column labels "20".
    if 1900 <= year_number < 2000:
        # "/" would be read as a path separator, so it is replaced in the file name.
        # NOTE(review): books sharing a title overwrite each other here.
        path = os.path.join(output_dir, "%s.txt" % n.replace("/", "|"))
        with open(path, "w", encoding="utf-8") as f:
            f.write(t.replace("\\", ""))