In [1]:
import re
import json
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

In [14]:
def download_page(link, timeout=30):
    """Download the page at ``link`` and return it parsed with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait on blocking network operations before giving up.
        Without a timeout a single stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    urllib.error.URLError
        If the request fails (a timeout may also surface as a timeout error).
    """
    # urlopen accepts the URL string directly; no explicit Request is needed.
    with urllib.request.urlopen(link, timeout=timeout) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')

In [15]:
# Root of the English-language site; relative hrefs scraped later are appended to it.
base = "http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/"
# Parsed index page whose table links to every manuscript.
mss_list = download_page(base + "manuscripts.php")

In [16]:
# One record per link in the index table: title, manuscript home page,
# and the matching TEI-header page.
data = []
for anchor in mss_list.table.find_all('a'):
    home_url = base + anchor.get('href')
    data.append({
        'ms_title': anchor.text,
        'ms_link': home_url,
        # The header page URL differs from the home page URL only in the script name.
        'header_link': home_url.replace("ms-home", "tei-header"),
    })

In [17]:
dates = ["1275–1325", "1300–1350", "1350", "1350–1400", "1375–1425", "1400–1450"]

# Slice bounds into `data`, paired positionally with the first five entries of `dates`.
# NOTE(review): the bounds are hard-coded positions in the scraped list — confirm
# they still match the order of the index page before re-running.
date_spans = [(0, 6), (6, 24), (24, 36), (36, 39), (39, -1)]

for (start, stop), span_date in zip(date_spans, dates):
    for ms in data[start:stop]:
        ms['date'] = span_date

# The final manuscript alone receives the last date range.
data[-1]['date'] = dates[5]

In [18]:
# Spot-check the last three records to see which date range each received.
data[-3:]

[{'ms_title': 'NLW MS. Peniarth 19',
  'ms_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/ms-home.php?ms=Pen19',
  'header_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/tei-header.php?ms=Pen19',
  'date': '1375–1425'},
 {'ms_title': 'NLW MS. Llanstephan 4',
  'ms_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/ms-home.php?ms=Llst4',
  'header_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/tei-header.php?ms=Llst4',
  'date': '1375–1425'},
 {'ms_title': 'NLW MS. Peniarth 33',
  'ms_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/ms-home.php?ms=Pen33',
  'header_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/tei-header.php?ms=Pen33',
  'date': '1400–1450'}]

In [19]:
# Keys of each contents entry, in the order they are stored.
CONTENT_FIELDS = ('folio', 'columns', 'text', 'scribe')

for ms in data:
    # Reset first so re-running the cell does not duplicate entries.
    contents = ms['contents'] = []
    header = download_page(ms['header_link'])
    # The first <tr> of the page's first table is skipped
    # (presumably the heading row — TODO confirm).
    rows = header.table.find_all('tr')[1:]
    for row in rows:
        cells = row.find_all('td')
        values = [cell.text for cell in cells]
        if len(values) == 3:
            # No columns cell in this layout: insert the placeholder dash.
            values.insert(1, '—')
        elif len(values) == 5:
            # Five-cell rows carry an extra leading cell, which is dropped.
            values = values[1:]
        elif len(values) != 4:
            # Rows with any other number of cells are ignored.
            continue
        contents.append(dict(zip(CONTENT_FIELDS, values)))

In [20]:
# Inspect the first manuscript record, now including its scraped 'contents' list.
data[0]

{'ms_title': 'NLW MS. Peniarth 16 part i',
 'ms_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/ms-home.php?ms=Pen16i',
 'header_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/tei-header.php?ms=Pen16i',
 'date': '1275–1325',
 'contents': [{'folio': '1r-2v',
   'columns': '—',
   'text': 'table of contents, not transcribed',
   'scribe': 'later hand'},
  {'folio': '3r-4v', 'columns': '—', 'text': 'blank', 'scribe': '-'},
  {'folio': '5r-6r',
   'columns': '—',
   'text': 'Pwyll y Pader o ddull Awstin Sant',
   'scribe': 'Peniarth 16 hand A'},
  {'folio': '6r-9r',
   'columns': '—',
   'text': 'Pwyll y Pader o ddull Hu Sant',
   'scribe': 'Peniarth 16 hand A'},
  {'folio': '9r-11r',
   'columns': '—',
   'text': 'Deuddeg Pwnc y Gredo',
   'scribe': 'Peniarth 16 hand B'}]}

In [21]:
# Save the nested per-manuscript records; ensure_ascii=False keeps non-ASCII
# characters (e.g. the en dashes in the date ranges) as literal UTF-8.
serialized = json.dumps(data, ensure_ascii=False)
with open('rhyddiaith.json', 'w', encoding='utf-8') as fh:
    fh.write(serialized)

In [22]:
# Flatten the nested structure: one record per contents entry, carrying the
# date and title of the manuscript it belongs to.
texts = []

for m in data:
    ms_date, ms_name = m["date"], m["ms_title"]
    texts.extend(
        {
            "text": entry["text"],
            "date": ms_date,
            "manuscript": ms_name,
            "folio": entry["folio"],
            "columns": entry["columns"],
            "scribe": entry["scribe"],
        }
        for entry in m["contents"]
    )

In [23]:
# Spot-check a single flattened record (the 85th from the end).
texts[-85]

{'text': 'Diarhebion',
 'date': '1375–1425',
 'manuscript': 'Oxford Jesus College MS. 111 (The Red Book of Hergest)',
 'folio': '264r-270v',
 'columns': '1057-1083',
 'scribe': 'Hywel Fychan'}

In [24]:
# Total number of flattened text records.
len(texts)

843

In [25]:
# Save the flattened per-text records; ensure_ascii=False keeps non-ASCII
# characters as literal UTF-8 instead of \u escapes.
serialized_texts = json.dumps(texts, ensure_ascii=False)
with open('rhyddiaith_texts.json', 'w', encoding='utf-8') as fh:
    fh.write(serialized_texts)

In [26]:
# Column names of the table, which are also the keys of every record in `texts`.
record_keys = ("text", "date", "manuscript", "folio", "columns", "scribe")

# Pull each field out of the records into its own list, in `record_keys` order.
text_names, dates, ms_names, folios, columns, scribes = (
    [t[key] for t in texts] for key in record_keys
)

df_dict = dict(zip(record_keys, (text_names, dates, ms_names, folios, columns, scribes)))

df = pd.DataFrame.from_dict(df_dict)
df.head()

Unnamed: 0,text,date,manuscript,folio,columns,scribe
0,"table of contents, not transcribed",1275–1325,NLW MS. Peniarth 16 part i,1r-2v,—,later hand
1,blank,1275–1325,NLW MS. Peniarth 16 part i,3r-4v,—,-
2,Pwyll y Pader o ddull Awstin Sant,1275–1325,NLW MS. Peniarth 16 part i,5r-6r,—,Peniarth 16 hand A
3,Pwyll y Pader o ddull Hu Sant,1275–1325,NLW MS. Peniarth 16 part i,6r-9r,—,Peniarth 16 hand A
4,Deuddeg Pwnc y Gredo,1275–1325,NLW MS. Peniarth 16 part i,9r-11r,—,Peniarth 16 hand B


In [27]:
# Show the last rows of the table.
df.tail()

Unnamed: 0,text,date,manuscript,folio,columns,scribe
838,Notes on the locations of a number of tracts i...,1400–1450,NLW MS. Peniarth 33,iv,—,>
839,Signature of William Morris dated 1662,1400–1450,NLW MS. Peniarth 33,v,—,William Maurice
840,Mixed notes on the contents of the manuscript,1400–1450,NLW MS. Peniarth 33,vi,—,unknown
841,Note in the hand of J. Gwenogvryn Evans?,1400–1450,NLW MS. Peniarth 33,vii,—,?Evans
842,Llyfr Blegywryd,1400–1450,NLW MS. Peniarth 33,1-186,—,Peniarth 33 hand A


In [28]:
# Export the table as tab-separated UTF-8 text, without the DataFrame index column.
df.to_csv("rhyddiaith_texts.tsv", sep="\t", encoding="utf-8", index=False)

In [29]:
# List every manuscript title, one per line, in scraped order.
for title in (entry['ms_title'] for entry in data):
    print(title)

NLW MS. Peniarth 16 part i
NLW MS. Peniarth 8 part i
NLW MS. Peniarth 8 part ii
NLW MS. Peniarth 7
NLW MS. Peniarth 21
NLW MS. Peniarth 3 part ii
Cambridge Trinity College MS. O.7.1
NLW MS. Peniarth 36A
NLW MS. Peniarth 36B
Bodorgan MS.
NLW MS. 3036 (Mostyn 117)
BL Cotton Cleopatra MS. A XIV
NLW MS. Peniarth 6 part iv
BL Harley MS. 4353
NLW MS. Peniarth 31
NLW MS. Peniarth 35
NLW MS. Peniarth 37
NLW MS. Peniarth 45
NLW MS. Peniarth 9
NLW MS. Peniarth 20
BL Cotton Cleopatra MS. B V part i
BL Cotton Cleopatra MS. B V part iii
Cardiff MS. 1.363 (Hafod 2)
NLW MS. Peniarth 14, pp.101-90
Cardiff MS. 1.362 (Hafod 1)
BL Cotton Cleopatra MS. B V part ii
NLW MS. Peniarth 10
Oxford Jesus College MS. 119 (The Book of the Anchorite of Llanddewi Brefi)
NLW MS. Peniarth 18
NLW MS. Peniarth 46
NLW MS. Peniarth 47 part i
NLW MS. Peniarth 5 (The White Book of Rhydderch, part 1)
NLW MS. Peniarth 4 (The White Book of Rhydderch, part 2)
BL Cotton Titus MS. D IX
BL Harley MS. 958
NLW MS. 24029 (Boston 5)
NL

In [30]:
# Inspect the sixth-from-last manuscript record in full, including its contents entries.
data[-6]

{'ms_title': 'Oxford Jesus College MS. 111 (The Red Book of Hergest)',
 'ms_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/ms-home.php?ms=Jesus111',
 'header_link': 'http://www.rhyddiaithganoloesol.caerdydd.ac.uk/en/tei-header.php?ms=Jesus111',
 'date': '1375–1425',
 'contents': [{'folio': '1r-8r',
   'columns': '1-29.39',
   'text': 'Ystoria Dared',
   'scribe': 'Red Book hand A'},
  {'folio': '8r',
   'columns': '29.39-30',
   'text': 'Ystoria Dared',
   'scribe': 'Hywel Fychan'},
  {'folio': '8v-26v',
   'columns': '31-104',
   'text': 'Brut y Brenhinoedd',
   'scribe': 'Red Book hand A'},
  {'folio': '', 'columns': '', 'text': 'folios missing', 'scribe': ''},
  {'folio': '27r-58r',
   'columns': '105-230.11',
   'text': 'Brut y Brenhinoedd',
   'scribe': 'Red Book hand AI'},
  {'folio': '58r-89v',
   'columns': '230.20-319, 340-376.8',
   'text': 'Brut y Tywysogion',
   'scribe': 'Red Book hand A'},
  {'folio': '89v-90r',
   'columns': '376.9-377.18',
   'text': 'Gildas 