In [92]:
import json
import re
from os import path
from glob import glob
from natsort import natsorted
from collections import defaultdict
from bs4 import BeautifulSoup

In [93]:
BILARA_DATA_DIR = './bilara-data/'
SC_DATA_DIR = './sc-data/'

In [94]:
# load acronyms and author data
with open(path.join(SC_DATA_DIR, 'misc/uid_expansion.json')) as f:
    acronyms = { a['uid']: a for a in json.loads(f.read()) }

with open(path.join(SC_DATA_DIR, 'additional-info/author_edition.json')) as f:
    authors = { a['long_name']: a for a in json.loads(f.read()) }

In [95]:
# source: https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s24.html
def int_to_roman(inp):
    """ Convert an integer to a Roman numeral. """

    if not isinstance(inp, type(1)):
        raise TypeError("expected integer, got %s" % type(inp))
    if not 0 < inp < 4000:
        raise ValueError("Argument must be between 1 and 3999")
    ints = (1000, 900,  500, 400, 100,  90, 50,  40, 10,  9,   5,  4,   1)
    nums = ('M',  'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I')
    result = []
    for i in range(len(ints)):
        count = int(inp / ints[i])
        result.append(nums[i] * count)
        inp -= ints[i] * count
    
    return ''.join(result).lower()

In [96]:
def extract_vol_pos(pts_ref: str):
    """ Extract volume and position from reference id. """
    
    match_position = re.search(r'([0-9]+\.)?([0-9]+)$', pts_ref)
    assert match_position != None and match_position.group(2) != None 

    volume = match_position.group(1)
    volume = int_to_roman(int(volume.replace('.', ''))) if volume else None
    position = int(match_position.group(2))
    
    return '%s %i' % (volume, position) if volume else position

In [97]:
def extract_book(uid: str):
    """ Extract book acronym from uid. """

    book_id = re.match(r'^([^0-9]+)', uid).group(1)
    book_id = book_id.replace('pli-tv-vb-', '').replace('pli-tv-bi-', '').replace('pli-tv-bu', '').replace('pli-tv', '')
    
    assert book_id in acronyms
    assert acronyms[book_id]['acro']
    
    return acronyms[book_id]['acro']

In [98]:
def extract_edition(pts_ref: str):
    """ Extract edition number if exists. """
    
    edition = ''

    m = re.search(r'pts-vp-pli(1ed|2ed)', pts_ref)
    if m and m.group(1):
        edition = m.group(1)
    
    return edition

In [99]:
def format_pts(uid: str, pts_ref: str):
    """ Use SC segment reference and uid to convert PTS reference """
    
    edition = extract_edition(pts_ref)
    book = extract_book(uid)
    vol_pos = extract_vol_pos(pts_ref)
    
    return '%s (%s) %s' % (book, edition, vol_pos) if edition else '%s %s' % (book, vol_pos)

In [100]:
pts_to_refs = defaultdict(list)

# get all possible cross-refs for every pts ref
for ref_file in natsorted(glob(path.join(BILARA_DATA_DIR, 'reference/pli/ms/sutta/**/*.json'), recursive=True)):   
    with open(ref_file) as f:
        ref_data = json.loads(f.read())
    
    for uid, reference_list in ref_data.items():
        all_refs = [uid] + [x.strip() for x in reference_list.split(',')]
        pts_refs = []

        for ref in all_refs:
            if ref.startswith('pts-vp-pli'):
                pts_refs.append(ref)


        for ref in pts_refs:
            pts_to_refs[format_pts(uid, ref)].append({
                'file': path.basename(ref_file).replace('.json', '').replace('_reference', ''),
                'refs': all_refs
            })

In [105]:
%%time
language_lookup = {}

for pts_formatted, items in pts_to_refs.items():
    for item in items:
        html_files = glob(path.join(SC_DATA_DIR, 'html_text', 'de', 'pli', '**', item['file'] + '.html'), recursive=True)
        
        for html_file in html_files:
            language = html_file.split('html_text')[1][1:3]
            
            if not language in language_lookup:
                language_lookup[language] = defaultdict(list)

            try:
                with open(html_file) as f:
                    soup = BeautifulSoup(f.read(), 'html.parser')

                article = soup.find('article').attrs
                article_id = article['id']
                assert article_id                

                author = soup.find('meta', { 'name': 'author' })
                author_uid = authors[author['content']]['uid']
                assert author_uid
                
                # find first marker for link
                for ref in item['refs']:
                    marker = soup.find('a', { 'id': ref })

                    if marker:
                        link = 'https://suttacentral.net/%s/%s/%s#%s' % (article_id, language, author_uid, ref)
                        language_lookup[language][pts_formatted].append([article_id, link])
                        break
                print(language_lookup) 
            except Exception as e:
                raise e
                print('error', html_file, e)

{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}
{'de': defaultdict(<class 'list'>, {})}


{'de': defaultdict(<class 'list'>, {'AN i 98': [['an2.230-279', 'https://suttacentral.net/an2.230-279/de/nyanatiloka#sc1']], 'AN i 101': [['an3.1', 'https://suttacentral.net/an3.1/de/nyanatiloka#sc1']], 'AN i 102': [['an3.2', 'https://suttacentral.net/an3.2/de/nyanatiloka#sc1']], 'AN i 103': [['an3.3', 'https://suttacentral.net/an3.3/de/nyanatiloka#sc2']], 'AN i 104': [['an3.6', 'https://suttacentral.net/an3.6/de/nyanatiloka#sc1']], 'AN i 105': [['an3.9', 'https://suttacentral.net/an3.9/de/nyanatiloka#sc1']], 'AN i 106': [['an3.11', 'https://suttacentral.net/an3.11/de/nyanatiloka#sc1']], 'AN i 108': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc2']], 'AN i 109': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc5']], 'AN i 110': [['an3.14', 'https://suttacentral.net/an3.14/de/nyanatiloka#sc2']]})}
{'de': defaultdict(<class 'list'>, {'AN i 98': [['an2.230-279', 'https://suttacentral.net/an2.230-279/de/nyanatiloka#sc1']], 'AN i 101': [['an3.1', 'https://suttac

{'de': defaultdict(<class 'list'>, {'AN i 98': [['an2.230-279', 'https://suttacentral.net/an2.230-279/de/nyanatiloka#sc1']], 'AN i 101': [['an3.1', 'https://suttacentral.net/an3.1/de/nyanatiloka#sc1']], 'AN i 102': [['an3.2', 'https://suttacentral.net/an3.2/de/nyanatiloka#sc1']], 'AN i 103': [['an3.3', 'https://suttacentral.net/an3.3/de/nyanatiloka#sc2']], 'AN i 104': [['an3.6', 'https://suttacentral.net/an3.6/de/nyanatiloka#sc1']], 'AN i 105': [['an3.9', 'https://suttacentral.net/an3.9/de/nyanatiloka#sc1']], 'AN i 106': [['an3.11', 'https://suttacentral.net/an3.11/de/nyanatiloka#sc1']], 'AN i 108': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc2']], 'AN i 109': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc5']], 'AN i 110': [['an3.14', 'https://suttacentral.net/an3.14/de/nyanatiloka#sc2']], 'AN i 111': [['an3.15', 'https://suttacentral.net/an3.15/de/nyanatiloka#sc1']], 'AN i 112': [['an3.15', 'https://suttacentral.net/an3.15/de/nyanatiloka#sc3']], 'AN i

{'de': defaultdict(<class 'list'>, {'AN i 98': [['an2.230-279', 'https://suttacentral.net/an2.230-279/de/nyanatiloka#sc1']], 'AN i 101': [['an3.1', 'https://suttacentral.net/an3.1/de/nyanatiloka#sc1']], 'AN i 102': [['an3.2', 'https://suttacentral.net/an3.2/de/nyanatiloka#sc1']], 'AN i 103': [['an3.3', 'https://suttacentral.net/an3.3/de/nyanatiloka#sc2']], 'AN i 104': [['an3.6', 'https://suttacentral.net/an3.6/de/nyanatiloka#sc1']], 'AN i 105': [['an3.9', 'https://suttacentral.net/an3.9/de/nyanatiloka#sc1']], 'AN i 106': [['an3.11', 'https://suttacentral.net/an3.11/de/nyanatiloka#sc1']], 'AN i 108': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc2']], 'AN i 109': [['an3.13', 'https://suttacentral.net/an3.13/de/nyanatiloka#sc5']], 'AN i 110': [['an3.14', 'https://suttacentral.net/an3.14/de/nyanatiloka#sc2']], 'AN i 111': [['an3.15', 'https://suttacentral.net/an3.15/de/nyanatiloka#sc1']], 'AN i 112': [['an3.15', 'https://suttacentral.net/an3.15/de/nyanatiloka#sc3']], 'AN i

KeyboardInterrupt: 