In [87]:
from collections import Counter
import pandas as pd
import pickle
import json 
import re
import os

In [2]:
punc_full = "[‘’–:—;.,!?–&-$\"^«»<>/\|%=\[\]()*+‘]+"
punc = "[‘’–:—;.,!?–$\"^«»<>/\|%=\[\]()*+‘]+"

mutations = {
                'bh': 'b', 'mb': 'b', 'ch': 'c', 'gc': 'c', 'gh': 'g',
                'ng': 'g', 'dh': 'd', 'nd': 'd', 'fh': 'f', 'ḟ': 'f',
                'ḟh': 'f', 'bhf': 'f', 'mm': 'm', 'll': 'l', 'nn': 'n',
                'ph': 'p', 'bp': 'p', 'rr': 'r', 'sh': 's', 'ṡ': 's',
                'th': 't', 'dt': 't', 'he': 'e', 'hé': 'é', 'ha': 'a',
                'há': 'á', 'hi': 'i', 'hí': 'í', 'ho': 'o', 'hó': 'ó',
                'hu': 'u', 'hú': 'ú', 'n-': '', 'h-': '', 'ss': 's',
                'ts': 's', 'n': 'e', 'né': 'é', 'na': 'a', 'ná': 'á',
                'ni': 'i', 'ní': 'í', 'no': 'o', 'nó': 'ó', 'nu': 'u',
                'nú': 'ú', 'm-': '', 't-': '', 't\'': ' ', 'm\'': '',
                'd\'': '', 'l-': '', 'mh': 'm', 'r-': '', 's-': '',
                'cc': 'c', 'mh\'': '', 'g-': 'g'}

In [23]:
def preprocess(data):
    words = []
    sents = []
    for sent in data:
        sent = sent.strip('\n')
        if len(sent) > 1:
            sent = sent.replace('.i.', '_i_')
            sent = sent.replace('.l.', '_l_')
            sent = sent.replace(' & ', ' _&_ ')
            sent = ' '.join([w.strip(punc_full) for w in sent.split()]).lower()
            sent = sent.replace(punc, ' ')
            sents.append(sent)
            words += sent.split()
    return sents, words

def split_sents(text):
    try:
        sents = re.split(r'\?’|!’|\.’|!|\?|\.|…', text)
    except TypeError:
        print(text)
        sents = []
    return sents

In [14]:
data = pd.read_csv('csg_books.tsv', encoding='utf-8', sep='\t')
data.fillna("–", inplace=True)
data.head()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,century,genre,publisher,link,text,tokens,types
0,3663,Abhráin atá leagtha ar an Reachtabhrach,"De hÍde, Dubhghlas","Craoibhín Aoibhinn, An",–,1903,20,prose,Gill agus a mhac,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN AN REACHTÚIRE. Nuair caithtear cloch i...,48703,7588
1,437,Abhráin Diadha Chúige Connacht I,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1906,20,prose,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,ABHRÁIN DIADHA CHÚIGE CONNACHT. Is cráibhtheac...,51331,8141
2,438,Abhráin Diadha Chúige Connacht II,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1906,20,prose,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,Ag so giota neamh-ghnáthach tá rud-beag cosmhú...,50595,8119
3,457,"Abhráin Ghaedhilge an Iarthair, an Chéad Chuid","Údair éagsúla, bailithe ag Mícheál Ó Tiománaidhe",–,"Ó Tiománaidhe, Micheál",1906,20,prose,Connradh na Gaedhilge,http://corpas.ria.ie/index.php?fsg_function=5&...,Reamhrádh. Ní raibh éan ríoghacht ar dhruim ua...,22526,4778
4,439,Abhráin Grádh Chúige Connacht(Love Songs of Co...,In eagar ag Dubhglas De hÍde,"Craoibhín Aoibhinn, An","De hÍde, Dubhglas",1893,19,poetry,Gill / Unwin,http://corpas.ria.ie/index.php?fsg_function=5&...,AN CEATRAMHADH CAIBIDIL ABHRÁIN GRÁDH. Tar éis...,22833,4604


In [42]:
texts16 = list(data[data['century'] == '16']['text'])
texts17 = list(data[data['century'] == '17']['text'])
texts18 = list(data[data['century'] == '18']['text'])
texts19 = list(data[data['century'] == '19']['text'])
texts20 = list(data[data['century'] == '20']['text'])

In [43]:
print(len(texts16))
print(len(texts17))
print(len(texts18))
print(len(texts19))
print(len(texts20))

4
204
243
256
281


In [44]:
texts20[0][:200]

'ABHRÁIN AN REACHTÚIRE. Nuair caithtear cloch i n-uisge corruighthear an t-uisge. Tuitean an chloch go dti an tóin agus luidheann sí annsin, acht a bhfad tar éis a tuitime maireann gluasacht an uisge a'

In [45]:
all_texts = texts16 + texts17 + texts18 + texts19 + texts20
len(all_texts)

988

In [46]:
sents16 = []
sents17 = []
sents18 = []
sents19 = []
sents20 = []
all_sents = []

for text in texts16:
    sents16 += split_sents(text)

for text in texts17:
    sents17 += split_sents(text)

for text in texts18:
    sents18 += split_sents(text)

for text in texts19:
    sents19 += split_sents(text)
    
for text in texts20:
    sents20 += split_sents(text)

for text in all_texts:
    all_sents += split_sents(text)

In [47]:
sents16 = [sent for sent in sents16 if len(sent)>1]
sents17 = [sent for sent in sents17 if len(sent)>1]
sents18 = [sent for sent in sents18 if len(sent)>1]
sents19 = [sent for sent in sents19 if len(sent)>1]
sents20 = [sent for sent in sents20 if len(sent)>1]
all_sents = [sent for sent in all_sents if len(sent)>1]

In [49]:
with open('csnag16.txt', 'w', encoding='utf-8') as f1, \
          open('csnag17.txt', 'w', encoding='utf-8') as f2, \
          open('csnag18.txt', 'w', encoding='utf-8') as f3, \
          open('csnag19.txt', 'w', encoding='utf-8') as f4, \
          open('csnag20.txt', 'w', encoding='utf-8') as f5, \
          open('csnag_all.txt', 'w', encoding='utf-8') as f6:
    f1.write('\n'.join(sents16))
    f2.write('\n'.join(sents17))
    f3.write('\n'.join(sents18))
    f4.write('\n'.join(sents19))
    f5.write('\n'.join(sents20))
    f6.write('\n'.join(all_sents))

## Preprocessing after manual cleaning

In [52]:
# cleaning done manually
with open('./data/csnag16_clean.txt', 'r', encoding='utf-8') as f:
    csnag16 = f.read().split('\n')

# cleaning done manually
with open('./data/csnag17_clean.txt', 'r', encoding='utf-8') as f:
    csnag17 = f.read().split('\n')

# cleaning done manually
with open('./data/csnag18_clean.txt', 'r', encoding='utf-8') as f:
    csnag18 = f.read().split('\n')

# cleaning done manually
with open('./data/csnag19_clean.txt', 'r', encoding='utf-8') as f:
    csnag19 = f.read().split('\n')
    
# cleaning done manually
with open('./data/csnag20_clean.txt', 'r', encoding='utf-8') as f:
    csnag20 = f.read().split('\n')

# cleaning done manually
with open('./data/csnag_all_clean.txt', 'r', encoding='utf-8') as f:
    all_csnag = f.read().split('\n')

In [53]:
csnag16_sents, csnag16_words = preprocess(csnag16)
csnag17_sents, csnag17_words = preprocess(csnag17)
csnag18_sents, csnag18_words = preprocess(csnag18)
csnag19_sents, csnag19_words = preprocess(csnag19)
csnag20_sents, csnag20_words = preprocess(csnag20)
all_csnag_sents, all_csnag_words = preprocess(all_csnag)

### Statistics

In [54]:
print(len(csnag16_words))
print(len(csnag17_words))
print(len(csnag18_words))
print(len(csnag19_words))
print(len(csnag20_words))
print(len(all_csnag_words))

26746
3078133
2026287
2130151
5706230
12989041


In [59]:
print(len(set(csnag16_words)))
print(len(set(csnag17_words)))
print(len(set(csnag18_words)))
print(len(set(csnag19_words)))
print(len(set(csnag20_words)))
print(len(set(all_csnag_words)))

8112
252914
176518
165322
205939
596941


In [55]:
print(len(csnag16_sents))
print(len(csnag17_sents))
print(len(csnag18_sents))
print(len(csnag19_sents))
print(len(csnag20_sents))
print(len(all_csnag_sents))

1910
135814
80117
95119
381645
713422


### Saving preprocessed texts

In [56]:
with open('./data/csnag16_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(csnag16_sents))
        
with open('./data/csnag17_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(csnag17_sents))

with open('./data/csnag18_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(csnag18_sents))

with open('./data/csnag19_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(csnag19_sents))

with open('./data/csnag20_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(csnag20_sents))

with open('./data/all_csnag_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_csnag_sents))

## Word counts

In [60]:
csnag16_counts = Counter(csnag16_words)
csnag17_counts = Counter(csnag17_words)
csnag18_counts = Counter(csnag18_words)
csnag19_counts = Counter(csnag19_words)
csnag20_counts = Counter(csnag20_words)
all_csnag_counts = Counter(all_csnag_words)

In [61]:
print(csnag16_counts.most_common(5))
print(csnag17_counts.most_common(5))
print(csnag18_counts.most_common(5))
print(csnag19_counts.most_common(5))
print(csnag20_counts.most_common(5))
print(all_csnag_counts.most_common(5))

[('a', 1169), ('do', 860), ('ar', 644), ('an', 642), ('na', 571)]
[('agus', 141193), ('do', 114685), ('an', 107268), ('a', 99815), ('na', 63057)]
[('an', 74213), ('do', 67852), ('a', 65365), ('agus', 64208), ('na', 41169)]
[('a', 87205), ('an', 84770), ('agus', 70430), ('do', 56604), ('na', 40992)]
[('an', 265584), ('a', 218041), ('agus', 180481), ('ar', 136874), ('go', 104047)]
[('an', 532286), ('a', 471442), ('agus', 456621), ('do', 339084), ('ar', 266486)]


### Saving word counts

In [62]:
# saving counts in json

with open('./data/csnag16_counts.json', 'w', encoding='utf-8') as f:
    json.dump(csnag16_counts, f)
        
with open('./data/csnag17_counts.json', 'w', encoding='utf-8') as f:
    json.dump(csnag17_counts, f)

with open('./data/csnag18_counts.json', 'w', encoding='utf-8') as f:
    json.dump(csnag18_counts, f)

with open('./data/csnag19_counts.json', 'w', encoding='utf-8') as f:
    json.dump(csnag19_counts, f)

with open('./data/csnag20_counts.json', 'w', encoding='utf-8') as f:
    json.dump(csnag20_counts, f)

with open('./data/all_csnag_counts.json', 'w', encoding='utf-8') as f:
    json.dump(all_csnag_counts, f)

In [63]:
# saving counts in plaintext

with open('./data/csnag16_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in csnag16_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))
        
with open('./data/csnag17_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in csnag17_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))

with open('./data/csnag18_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in csnag18_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))

with open('./data/csnag19_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in csnag19_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))

with open('./data/csnag20_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in csnag20_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))

with open('./data/all_csnag_wordlist.txt', 'w', encoding='utf-8') as f:
    for pair in all_csnag_counts.most_common():
        f.write('%s %s\n' % (pair[0], pair[1]))

## Getting rid of undated texts and sorting by year

In [69]:
dated = data.drop(data[data['year'] == '–'].index)
dated.astype({'year': 'int64'}).dtypes

csg_id        int64
title        object
author       object
pen_name     object
editor       object
year          int64
century      object
genre        object
publisher    object
link         object
text         object
tokens        int64
types         int64
dtype: object

In [70]:
dated.sort_values('year', inplace=True)
dated.head()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,century,genre,publisher,link,text,tokens,types
112,2493,Book of O'Hara,–,–,"McKenna, Lambert",1581,16,poetry,"(B.Á.C.: I.A.B., 1951)",http://corpas.ria.ie/index.php?fsg_function=5&...,"VIII Táinig san chluiche ag Cormac, gég nach u...",6313,2692
716,498,Rudimenta Physionomae,–,–,"Mac Niocaill, G.",1583,16,prose,"(B.Á.C.: I.A.B., 1963)",http://corpas.ria.ie/index.php?fsg_function=5&...,Dá raibh gné ghlass nó bhuidhe ar neach is com...,1232,436
434,483,Flaithrí Ó Maolchonaire's Catechism of Christi...,"Ó Maolchonaire, Flaithrí",–,"Ó Cuív, Brian",1593,16,prose,"(B.Á.C.: I.A.B., 1950)",http://corpas.ria.ie/index.php?fsg_function=5&...,{N Celt01 161-206} {L 162} {B 1593 <17} {U 008...,9876,2249
531,2520,Leabhar Branach,–,–,"Mac Airt, Seán",1594,16,poetry,"(B.Á.C.: I.A.B., 1944)",http://corpas.ria.ie/index.php?fsg_function=5&...,AONGHUS DUBH Ó DÁLAIGH CC. Scél tásgmhar do rá...,14115,4866
349,2549,Duanaire Gaedhilge II,–,–,"Ní Ógáin, Róis",1600,17,poetry,"(B.Á.C.: C.O.É., 1924)",http://corpas.ria.ie/index.php?fsg_function=5&...,Ciamhair cráidhte an croidhese; An croidhese c...,287,170


In [71]:
dated.tail()

Unnamed: 0,csg_id,title,author,pen_name,editor,year,century,genre,publisher,link,text,tokens,types
777,3163,Seanfhocail na Muimhneach,"Ó Siochfhradha, Pádraig","Seabhac, An",An Seanbhao,1926,20,prose,Comhlucht Oideachais na hÉireann,http://corpas.ria.ie/index.php?fsg_function=5&...,"SEANFHOCAIL NA MUIMHNEACH ""AN SEABHAC"" Do bhai...",53987,10093
423,3055,Filí gan Iomrádh,"Mac Grianna, Seosamh",Iolann Fionn,–,1926,20,prose,"Ultach, An t-",http://corpas.ria.ie/index.php?fsg_function=5&...,I. Is fada an lá sgoláirí ag obair go dicheall...,10483,2620
883,2634,An Troid agus an t-Uaigneas,"Ó Beirn, Liam, An tAthair","Beirneach, An",–,1926,20,prose,"Mac Ghuill, M.H. agus a Mhac, Teó",http://corpas.ria.ie/index.php?fsg_function=5&...,AN TROID AGUS AN t-UAIGNEAS LEAN DO'N tSOLUS F...,15639,3533
670,2903,Papers on Irish Idiom,"O'Leary, Peter Canon",–,"O'Rahilly, Thomas F.",1926,20,prose,Browne and Nolan Limited,http://corpas.ria.ie/index.php?fsg_function=5&...,Is ainmhí bó. Is fear é. Fear isea é. Isé é. I...,22987,3359
626,2815,Notes on Irish Words and Usages,"Ó Laoghaire, Peadar, An t-Ath.",–,–,1926,20,prose,Browne and Nolan Limited,http://corpas.ria.ie/index.php?fsg_function=5&...,NOTES ON IRISH WORDS AND USAGES BY AN T-ATHAIR...,52986,8527


In [72]:
dated.to_csv("./data/csg_books_sorted_by_year.tsv", sep="\t", encoding="utf-8", index=False)

## Even splits

10 parts, ~99 texts each, continuous split regardless of time

In [75]:
def split_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [80]:
chunks = split_list(list(dated["text"]), 99)

for i, c in enumerate(chunks):
    sents = []
    for text in c:
        sents += split_sents(text)
    sents = [sent for sent in sents if len(sent)>1]
    with open("./data/corpora/csnag_%s.txt" % i, "w", encoding="utf-8") as f:
        f.write("\n".join(sents))

In [81]:
chunks_year = split_list(list(dated["year"]), 99)

for i, c in enumerate(chunks_year):
    print("Part %s" % i)
    print("Start: %s" % c[0])
    print("End: %s" % c[-1])

Part 0
Start: 1581.0
End: 1640.0
Part 1
Start: 1640.0
End: 1690.0
Part 2
Start: 1691.0
End: 1728.0
Part 3
Start: 1729.0
End: 1771.0
Part 4
Start: 1771.0
End: 1817.0
Part 5
Start: 1817.0
End: 1836.0
Part 6
Start: 1836.0
End: 1875.0
Part 7
Start: 1876.0
End: 1908.0
Part 8
Start: 1908.0
End: 1919.0
Part 9
Start: 1919.0
End: 1926.0


In [83]:
chunks_tokens = split_list(list(dated["tokens"]), 99)

for i, c in enumerate(chunks_tokens):
    print("Part %s" % i)
    print("Tokens: %s" % sum(c))

Part 0
Tokens: 1824106
Part 1
Tokens: 1701390
Part 2
Tokens: 853931
Part 3
Tokens: 976147
Part 4
Tokens: 751839
Part 5
Tokens: 1217419
Part 6
Tokens: 701187
Part 7
Tokens: 1777732
Part 8
Tokens: 2551903
Part 9
Tokens: 2754762


In [84]:
chunks_types = split_list(list(dated["types"]), 99)

for i, c in enumerate(chunks_types):
    print("Part %s" % i)
    print("Types: %s" % sum(c))

Part 0
Tokens: 327616
Part 1
Tokens: 286320
Part 2
Tokens: 190175
Part 3
Tokens: 214226
Part 4
Tokens: 190916
Part 5
Tokens: 235074
Part 6
Tokens: 126760
Part 7
Tokens: 290801
Part 8
Tokens: 350689
Part 9
Tokens: 394039


### Preprocessing after manual cleaning

In [89]:
for root, dirs, files in os.walk("./data/corpora/10/clean/"):
    for file in files:
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            csnag_part = f.read().split('\n')
        csnag_part_sents, csnag_part_words = preprocess(csnag_part)
        print("Tokens: %s" % len(csnag_part_words))
        print("Types: %s" % len(set(csnag_part_words)))
        print("Sentencess: %s" % len(csnag_part_sents))
        
        ### Saving preprocessed texts
        with open(file.replace("clean.txt", "preprocessed.txt"), 'w', encoding='utf-8') as f:
            f.write('\n'.join(csnag_part_sents))
            
        ## Word counts
        csnag_part_counts = Counter(csnag_part_words)
        print("Most common words: %s" % csnag_part_counts.most_common(10), end="\n\n")
        
        # saving counts in json
        with open(file.replace("clean.txt", "counts.json"), 'w', encoding='utf-8') as f:
            json.dump(csnag_part_counts, f)
            
        # saving counts in plaintext
        with open(file.replace("clean.txt", "wordlist.txt"), 'w', encoding='utf-8') as f:
            for pair in csnag_part_counts.most_common():
                f.write('%s %s\n' % (pair[0], pair[1]))

Tokens: 1599808
Types: 155956
Sentencess: 69774
Most common words: [('do', 61289), ('agus', 59355), ('an', 52479), ('a', 46232), ('na', 30852), ('ar', 30513), ('go', 18029), ('mac', 14446), ('i', 13443), ('is', 13438)]

Tokens: 610031
Types: 60876
Sentencess: 24661
Most common words: [('a', 36829), ('agus', 23902), ('an', 22456), ('do', 13507), ('na', 12032), ('go', 12015), ('ar', 8929), ('is', 7329), ('air', 6743), ('le', 5614)]

Tokens: 1461687
Types: 140078
Sentencess: 62657
Most common words: [('agus', 80101), ('a', 53613), ('an', 53394), ('do', 52826), ('na', 31528), ('ar', 26368), ('go', 21613), ('is', 11733), ('sin', 11185), ('ní', 10280)]

Tokens: 1473547
Types: 86711
Sentencess: 89029
Most common words: [('an', 69880), ('a', 48967), ('agus', 42825), ('ar', 32313), ('do', 27971), ('go', 27555), ('na', 24547), ('sé', 22652), ('ag', 19781), ('i', 18573)]

Tokens: 748394
Types: 83053
Sentencess: 27018
Most common words: [('agus', 28940), ('do', 27726), ('an', 26711), ('a', 23392),