In [4]:
import lxml

In [21]:
# Presumably SQLite database files (going by the .db suffix); neither name is
# read by the code visible in this notebook -- TODO confirm they are still needed.
biblio = "/home/larj/urn-biblio-mods.db"
filelist = "/home/larj/notebooks/filelist.db"
# NOTE(review): looks like the root of the tar archive tree; the main loop below
# spells out '/disk4' literally instead of using this name.
nbdigital = "/disk4/"
# Scratch directory: default extraction target of alto_extract()/fetch_words()
# and the directory emptied by clean_up_ramdisk().
ramdisk = '/ram_work' 
import json
import os
import tarfile
import shutil
from multiprocessing import Pool
import ipyparallel as ipp
import sqlite3
from time import time

In [10]:
def query(db, query, param=""):
    """Run one SQL statement against the SQLite file ``db`` and return its rows.

    A fresh connection is opened for every call, the statement is committed,
    and the connection is closed again before returning.

    Parameters
    ----------
    db : str
        Path of the SQLite database file.
    query : str
        SQL statement, optionally containing ``?`` placeholders.
    param : sequence, optional
        Values bound to the placeholders. The empty default binds nothing.

    Returns
    -------
    list
        The result of ``fetchall()``. Statements that produce no rows return
        ``[]``, and so does any statement that SQLite rejects.

    Notes
    -----
    SQLite errors (table already exists, locked database, bad SQL, ...) are
    deliberately swallowed so that long batch runs keep going; callers cannot
    tell a failed statement from an empty result. Errors that are not SQLite
    errors (for example passing a non-string statement) are raised.
    """
    con = sqlite3.connect(db)
    try:
        # `with con` only commits / rolls back the transaction; it does not
        # close the connection, hence the explicit close() in `finally`.
        with con:
            try:
                res = con.execute(query, param).fetchall()
            except sqlite3.Error:
                res = []
    finally:
        con.close()
    return res

In [45]:
def alto_extract(altofile, to_path = ramdisk):
    """
    Unpack a tar file into ``to_path`` and return the directory holding the files.

    The target directory is named after the part of the tar file's base name
    that precedes ``.tar`` and is created directly under ``to_path``.

    Parameters
    ----------
    altofile : str
        Path of the tar archive to unpack.
    to_path : str, optional
        Parent directory for the extraction directory (defaults to ``ramdisk``).

    Returns
    -------
    str
        Path of the newly created directory containing the extracted members.

    Raises
    ------
    FileExistsError
        If the target directory already exists (``os.mkdir`` is used on purpose
        so that two archives with the same base name do not get mixed up).
    tarfile.TarError, OSError
        If the archive cannot be opened or extracted.
    """
    # Directory name = whatever precedes ".tar" in the file name
    dname = os.path.basename(altofile).split('.tar')[0]
    ndir = os.path.join(to_path, dname)

    # The context manager closes the archive even when mkdir/extractall fails.
    with tarfile.open(altofile, 'r') as tf:
        os.mkdir(ndir)
        try:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; if archives may come from untrusted sources, consider the
            # `filter` argument offered by newer Python versions.
            tf.extractall(ndir)
        except Exception:
            # Do not leave a half-extracted directory behind on the ramdisk;
            # the caller never learns its name when we raise.
            shutil.rmtree(ndir, ignore_errors=True)
            raise
    return ndir


def extract_image_alto(ndir):
    """Collect all illustration blocks from the ALTO page files in ``ndir``.

    Only files directly inside ``ndir`` are inspected. A file counts as a page
    when the third ``_``-separated field of its name (with ``.xml`` stripped)
    parses as an integer; that integer is the page number. Every element whose
    ``TYPE`` attribute is ``Illustration`` or ``GraphicalElement`` yields one
    record.

    Parameters
    ----------
    ndir : str
        Directory containing the extracted XML files.

    Returns
    -------
    list of dict
        One dict per block with the keys ``page`` (file name up to the first
        dot), ``pagenum``, ``resolution``, ``hpos``, ``vpos``, ``width``,
        ``height`` and ``type``. Records are ordered by page number; within a
        page all ``Illustration`` blocks come before ``GraphicalElement``
        blocks. Position and size values are the raw attribute strings.
        ``resolution`` is the text of the first non-empty
        ``mix:XphysScanResolution`` element of the ``*mets.xml`` file, or the
        integer ``100`` when no such file or element exists.

    Raises
    ------
    StopIteration
        If ``ndir`` cannot be listed.
    KeyError
        If a matching block lacks one of HPOS/VPOS/WIDTH/HEIGHT.
    xml.etree.ElementTree.ParseError
        If a METS or page file is not well-formed XML.
    """
    # Kept local (the notebook does not import ElementTree at the top).
    import xml.etree.ElementTree as ET

    ns = {'mix': "http://www.loc.gov/mix/"}
    block_types = ('Illustration', 'GraphicalElement')
    default_resolution = 100

    # Only the top level of ndir is of interest; sort so that the choice of
    # METS file and the order of equal page numbers are deterministic.
    r, _, f = next(os.walk(ndir))
    f = sorted(f)

    # Scan resolution from the METS file, falling back to the default when the
    # file or the element is missing (previously an IndexError).
    resolution = default_resolution
    metsfiles = [name for name in f if name.endswith('mets.xml')]
    if metsfiles:
        mets_root = ET.parse(os.path.join(r, metsfiles[0])).getroot()
        for element in mets_root.findall(".//mix:XphysScanResolution", ns):
            if element.text:
                resolution = element.text
                break

    # Page files carry their page number as the third "_"-separated field,
    # e.g. 0014; anything that does not fit that pattern (metadata files etc.)
    # is skipped.
    pages = []
    for page in f:
        pag = page.split('.xml')[0].split('_')
        try:
            pages.append((page, int(pag[2])))
        except (IndexError, ValueError):
            continue

    illustrations = []
    # Walk the pages in page-number order
    for page_name, page_num in sorted(pages, key=lambda x: x[1]):
        root = ET.parse(os.path.join(r, page_name)).getroot()
        file_identifier = page_name.split('.')[0]  # drop ".xml"
        for block_type in block_types:
            for block in root.findall(".//*[@TYPE='%s']" % block_type):
                illustrations.append(
                    {'page': file_identifier,
                     'pagenum': page_num,
                     'resolution': resolution,
                     'hpos': block.attrib["HPOS"],
                     'vpos': block.attrib["VPOS"],
                     'width': block.attrib["WIDTH"],
                     'height': block.attrib["HEIGHT"],
                     'type': block_type
                     }
                )

    return illustrations

def clean_up_ramdisk():
    """Remove every directory located directly under ``ramdisk``.

    Plain files at the top level of ``ramdisk`` are left untouched.
    """
    import shutil
    _, subdirs, _ = next(os.walk(ramdisk))
    for name in subdirs:
        leftover = os.path.join(ramdisk, name)
        shutil.rmtree(leftover)
    

    
    
def fetch_words(altofile, to_path=ramdisk):
    """Run the processing pipeline for a single ALTO tar file.

    The archive is unpacked with ``alto_extract``, the extraction directory is
    printed and handed to ``process_alto``, and the directory is removed again
    whether or not processing succeeds.

    Parameters
    ----------
    altofile : str
        Path of the tar archive.
    to_path : str, optional
        Parent directory for the temporary extraction (defaults to ``ramdisk``).

    Returns
    -------
    Whatever ``process_alto`` returns for the extracted directory.
    """
    import shutil

    ndir = alto_extract(altofile, to_path=to_path)
    print(ndir)
    try:
        # NOTE(review): process_alto is not defined in the visible part of this
        # notebook -- confirm it is defined before this function is called.
        text = process_alto(ndir)
    finally:
        # Always free the ramdisk, also when processing raises.
        shutil.rmtree(ndir)
    return text

def process_tar(tarfile):
    """Extract one tar archive and store its illustration blocks in SQLite.

    The target database is read from the module-level name
    ``GLOBAL_illustration_database``, which must be set before calling this
    function. One row is inserted into the ``illustrations`` table per block
    found by ``extract_image_alto``.

    Parameters
    ----------
    tarfile : str
        Path of the tar archive (the name shadows the ``tarfile`` module inside
        this function only; it is kept for backward compatibility).
    """
    alto_dir = alto_extract(tarfile)
    try:
        illustrations = extract_image_alto(alto_dir)
    finally:
        # The parsed records live in memory, so the extracted files can go
        # right away -- and must go even when parsing fails.
        shutil.rmtree(alto_dir)

    for i in illustrations:
        query(GLOBAL_illustration_database,
              "insert into illustrations values (?,?,?,?,?,?,?,?)",
              (i['page'],
               i['pagenum'],
               i['resolution'],
               i['hpos'],
               i['vpos'],
               i['width'],
               i['height'],
               i['type']))
    
def process_tar_directory(tarfiles_dir):
    """Process every file directly inside ``tarfiles_dir`` into its own database.

    The database file name is ``illustrations_<safe>.db`` in the current
    working directory, where ``<safe>`` is ``tarfiles_dir`` reduced to its
    letters, digits and spaces (``/disk4/3`` -> ``illustrations_disk43.db``).
    The ``illustrations`` table is created first; if it already exists the
    error is swallowed by ``query`` and new rows are appended.

    Each file is unpacked with ``alto_extract``, parsed with
    ``extract_image_alto`` and its blocks are inserted one row at a time.
    Progress is printed for every 10000th file.

    Parameters
    ----------
    tarfiles_dir : str
        Directory whose top-level files are all treated as tar archives.

    Returns
    -------
    bool
        Always ``True`` (any extraction or parsing error propagates instead).
    """
    safe = "".join([c for c in tarfiles_dir if c.isalpha() or c.isdigit() or c==' ']).rstrip()
    illustration_database = "illustrations_" + safe + ".db"
    query(illustration_database, """create table illustrations (
        page varchar, 
         pagenum int,
         resolution int,
         hpos int,
         vpos int,
         width int,
         height int,
         type varchar)""")
    print(tarfiles_dir, illustration_database)
    r, d, f = next(os.walk(tarfiles_dir))
    for count, file in enumerate(f, start=1):
        # Named tar_path so the `tarfile` module is not shadowed.
        tar_path = os.path.join(r, file)
        alto_dir = alto_extract(tar_path)
        try:
            illustrations = extract_image_alto(alto_dir)
        finally:
            # Free the ramdisk even when parsing fails.
            shutil.rmtree(alto_dir)
        for i in illustrations:
            query(illustration_database,
                  "insert into illustrations values (?,?,?,?,?,?,?,?)",
                  (i['page'],
                   i['pagenum'],
                   i['resolution'],
                   i['hpos'],
                   i['vpos'],
                   i['width'],
                   i['height'],
                   i['type']))
        if count % 10000 == 0:
            # Report the file counter (the original printed the last
            # illustration record, or failed when none had been seen yet).
            print(count, tar_path)
    return True
    



# Create sqlite3 database for illustrations

In [24]:
# Database file for the single-archive pipeline; assigned to
# GLOBAL_illustration_database in the next cell.
illustrationdb = 'illustrations_bookshelf_spb.db'
# One row per illustration block. query() returns [] both on success and when
# SQLite rejects the statement (e.g. the table already exists), so the output
# of this cell does not show whether the table was actually created.
query(illustrationdb, """create table illustrations (
        page varchar, 
         pagenum int,
         resolution int,
         hpos int,
         vpos int,
         width int,
         height int,
         type varchar)""")


[]

In [25]:
# process_tar() reads this module-level name to decide which database receives the inserts.
GLOBAL_illustration_database = illustrationdb

In [290]:
#query(GLOBAL_illustration_database, "drop table illustrations")

[]

# Fill up the database with files, in parallel

## Test speed

In [16]:
# Top-level listing of /disk4/1 only: r is the directory path, d its
# subdirectory names and f its file names.
r,d,f = next(os.walk('/disk4/1'))

In [41]:
# Remove every directory currently sitting under the ramdisk (e.g. leftovers of
# an interrupted run, which would make alto_extract's os.mkdir fail).
clean_up_ramdisk()

# Serial process

In [19]:
def process_tar_t(tf):
    """Run ``process_tar`` on ``tf`` and print the path with the elapsed seconds."""
    started = time()
    process_tar(tf)
    elapsed = time() - started
    print(tf, elapsed)


# Parallel (serial processing takes approx. 2.1 s per file)

In [46]:
# NOTE(review): in the visible notebook `worker` is only assigned in the
# main-loop cell further down, so this cell fails on a fresh top-to-bottom run.
worker.close()

In [47]:
# NOTE(review): depends on `worker` from the main-loop cell further down
# (hidden state); stops the pool's worker processes immediately.
worker.terminate()


In [48]:
worker

<multiprocessing.pool.Pool at 0x7f059c6dea20>

## Kjør hovedloop for å fylle opp database

In [None]:

if __name__ == '__main__':
    # One job per top-level archive directory /disk4/1 ... /disk4/6; every job
    # writes to its own database file, so the workers never share a SQLite file.
    directories = [os.path.join('/disk4', str(i)) for i in range(1, 7)]
    # The context manager releases the worker processes once map() has
    # returned (or raised), instead of relying on separate close/terminate cells.
    with Pool(7) as worker:
        worker.map(process_tar_directory, directories)

/disk4/3 illustrations_disk43.db
/disk4/2 illustrations_disk42.db
/disk4/1 illustrations_disk41.db
/disk4/6 illustrations_disk46.db
/disk4/4 illustrations_disk44.db
/disk4/5 illustrations_disk45.db
