# Illustrations

Access illustrations using the text surrounding them.

In [340]:
import dhlab.module_update as mu
#mu.update("nbpictures", silent = True)
from nbpictures import iiif_manifest, display_finds, get_urls_from_illustration_data, get_illustration_data_from_book, large_scale, small_scale
from IPython.display import HTML, Markdown, display
import streamlit as st
from PIL import Image

import sqlite3
import pandas as pd
import os
import re

# Scale factors applied to illustration coordinates when building image URLs.
# These assignments rebind the names of the same spelling imported from
# nbpictures above.
small_scale = 0.59
large_scale = 1.58


In [248]:
def get_urls_from_illustration_data(illus, part = True, scale = None, cuts = True, delta = 0):
    """Build an image-resolver URL for an illustration on a page of a book or newspaper.

    Parameters
    ----------
    illus : dict
        Illustration record with entries and values like this:
        {'height': 270, 'hpos': 251, 'page': 'digibok_2017081626006_0018',
         'resolution': 400, 'vpos': 791, 'width': 373}
    part : True or number
        If True, the URL addresses the cut-out of the illustration. Any other
        value yields a URL for the whole page, with ``part`` inserted as the
        second component of the size segment (``0,{part}``).
    scale : number or None
        Factor applied to the coordinates. When None it is chosen from
        ``illus['resolution']``: ``large_scale`` for resolutions >= 300 or
        < 100, otherwise ``small_scale``.
    cuts : bool
        Unless False, the scaled width and height are capped at 1024 -
        restricted images must not go over 1024 x 1024 pixels.
    delta : int
        Margin added on every side of the illustration box.

    Returns
    -------
    str
        The URL of the cut-out or of the whole page.
    """
    if scale is None:
        if illus['resolution'] >= 300 or illus['resolution'] < 100:
            scale = large_scale
        else:
            scale = small_scale

    height = int(illus['height']) + 2*delta
    width = int(illus['width']) + 2*delta
    # An illustration closer than `delta` to the top/left edge would otherwise
    # produce negative region coordinates in the URL; clamp to the page origin.
    vpos = max(0, int(illus['vpos']) - delta)
    hpos = max(0, int(illus['hpos']) - delta)

    # Equality (not identity/truthiness) is kept so that every value treated
    # as "crop" or "cut-out" before is still treated the same way.
    if cuts != False:  # noqa: E712
        if width * scale > 1024:
            width = int(1024/scale)
        if height * scale > 1024:
            height = int(1024/scale)

    urn = f"URN:NBN:no-nb_{illus['page']}"
    if part == True:  # noqa: E712
        # return cut out
        url = f"https://www.nb.no/services/image/resolver/{urn}/{int(hpos*scale)},{int(vpos*scale)},{int(width*scale)},{int(height*scale)}/full/0/native.jpg"
    else:
        # return whole page
        url = f"https://www.nb.no/services/image/resolver/{urn}/full/0,{part}/0/native.jpg"
    return url

In [10]:
def query(db, sql, params=()):
    """Run `sql` with `params` against the SQLite database `db` and return all rows.

    db is the path handed to sqlite3.connect, sql a statement with `?`
    placeholders, and params the values bound to them. The result is the list
    of row tuples from fetchall().
    """
    con = sqlite3.connect(db)
    try:
        # `with con` only commits or rolls back the transaction; it does not
        # close the connection, hence the explicit close in `finally`.
        with con:
            res = con.execute(sql, params).fetchall()
    finally:
        con.close()
    return res

def pdquery(db, sql, params=()):
    """Run `sql` with `params` against the SQLite database `db` and return a DataFrame.

    db is the path handed to sqlite3.connect, sql a statement with `?`
    placeholders, and params the values bound to them. The result is whatever
    pandas.read_sql_query builds from the query.
    """
    con = sqlite3.connect(db)
    try:
        # `with con` only commits or rolls back the transaction; it does not
        # close the connection, hence the explicit close in `finally`.
        with con:
            res = pd.read_sql_query(sql, con, params=params)
    finally:
        con.close()
    return res


In [401]:
# Database whose `illustrations` table is looked up by page in the cells below.
illustrations_db = "/mnt/disk1/illustrations_bookshelf.db"
# Directory that is walked below to collect the per-range text databases.
illustration_text = "/mnt/disk1/illus_db"
# Single database holding a `pictures_small` table, queried by get_pictures.
bigdatabase = "/mnt/disk1/illustrations_all.db"

In [8]:
# Take only the first os.walk entry (the top directory itself): r is its path,
# d its sub-directories, f its files.
r, d, f = next(os.walk(illustration_text))
# Full paths of every file directly inside the directory.
illus = [os.path.join(r, x) for x in f]
illus[:5]

['/mnt/disk1/illus_db/illustrations_10000000_11000000_text.db',
 '/mnt/disk1/illus_db/illustrations_13000000_14000000_text.db',
 '/mnt/disk1/illus_db/illustrations_7000000_8000000_text.db',
 '/mnt/disk1/illus_db/illustrations_24000000_25000000_text.db',
 '/mnt/disk1/illus_db/illustrations_22000000_23000000_text.db']

In [292]:
# Run the same full-text query against every database file in `illus`
# (at most 20 rows from each) and stack the results into one frame.
pics = pd.concat([ pdquery(ill, "select page, rank from pictures_small where text match 'NEAR(nannestad vinter*, 3)' order by rank limit 20") for ill in illus])

In [293]:
# Order the combined hits by rank, drop identical (page, rank) rows, keep 20.
pictures = pics.sort_values(by="rank").drop_duplicates().head(20)

In [301]:
# Print the page identifier of the first five rows of `pictures`.
# iterrows() yields (index, row) pairs, so x[1] is the row.
for x in pictures.head(5).iterrows():
    print(x[1]['page'])

digibok_2012071308021_0012
digibok_2007081404024_0159
digibok_2014120108043_0105
digibok_2012071308021_0084
digibok_2014010808089_0020


In [288]:
# Fetch the illustration rows for the page of the first row of `t`.
# `t` is assigned in the cell numbered In [287], which appears further down.
res = pdquery(illustrations_db, "select * from illustrations where page = ?", params = (t.iloc[0].page,))

In [302]:
# For the first five rows of `pictures`, look up the illustration rows of the
# page and append each one as a dict to `illustrations`.
# NOTE(review): `illustrations` is not initialised in this cell; it must
# already exist when the cell runs - confirm where it is created.
for x in pictures.head(5).iterrows():
    res = pdquery(illustrations_db, "select * from illustrations where page = ?", params = (x[1].page,))
    illustrations += [dict(i[1]) for i in res.iterrows()]

In [287]:
# Look up the illustration rows stored for one specific page and show them.
t = pdquery(illustrations_db, "select * from illustrations where page = ?", params = ("digibok_2007081404024_0158",))
t

Unnamed: 0,page,pagenum,resolution,hpos,vpos,width,height,type
0,digibok_2007081404024_0158,158,150,0,1759,3588,5102,Illustration


In [303]:
# Build one URL per collected illustration record. Because part is a number
# (500) rather than True, each URL addresses the whole page.
[get_urls_from_illustration_data(i, delta=50, cuts=True, part=500) for i in illustrations]

['https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2007081404024_0158/full/0,500/0/native.jpg',
 'https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2012071308021_0012/full/0,500/0/native.jpg',
 'https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2007081404024_0159/full/0,500/0/native.jpg',
 'https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2014120108043_0105/full/0,500/0/native.jpg',
 'https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2012071308021_0084/full/0,500/0/native.jpg',
 'https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2014010808089_0020/full/0,500/0/native.jpg']

In [448]:
def display_finds(r, width = 500):
    """Display a list of image URLs as an HTML table of linked images.

    r is an iterable of image URLs, each containing a URN of the form
    `<prefix>_<doctype>_<id>_<page>` as one path segment. width is written to
    the width attribute of every <img>. Each image links to the item at
    https://www.nb.no/items/ with the page parameter set to the URN's page
    number plus one.

    Returns an IPython HTML object. Raises IndexError if a URL contains no
    URN, and ValueError if the URN does not split into exactly four
    underscore-separated parts or its last part is not an integer.
    """
    base = "https://www.nb.no/items/"
    rows = []
    for row in r:
        urnstring = re.findall("URN[^/]*", row)[0]
        prefix, doctyp, urn, page = urnstring.split('_')
        # The width attribute is fully quoted here; the earlier markup emitted
        # an unbalanced closing quote after the value.
        rows.append(
            f"<tr><td><a href='{base}{prefix}_{doctyp}_{urn}?page={int(page) + 1}' target='_'><img src='{row}'  width='{width}'></a></td></tr>"
        )
    return HTML("""<html><head></head>
     <body>
     <table>
     {rows}
     </table>
     </body>
     </html>
     """.format(rows=' '.join(rows)))

def get_pictures0(text="sommeridyll", part = 300, cuts = False, hits = 10):
    """Search every text database in `illus` and return image URLs for the best hits.

    text is the full-text match expression run against `pictures_small`
    (up to 200 rows are taken from each database). The combined hits are
    ordered by rank, and the first `hits` pages are looked up in the
    `illustrations` table of `illustrations_db`. part and cuts are passed on
    to get_urls_from_illustration_data, which is called with delta=50.

    Returns a list of URLs, one per illustration row found.
    """
    # Bind the search expression as a parameter instead of pasting it into the
    # SQL text, so quotes in `text` cannot break or alter the statement.
    sql = "select page, rank from pictures_small where text match ? order by rank limit 200"
    pics = pd.concat([pdquery(ill, sql, params=(text,)) for ill in illus])
    # No fixed cap here: `hits` alone decides how many pages are used.
    pictures = pics.sort_values(by="rank").drop_duplicates()
    illustrations = []
    for _, hit in pictures.head(hits).iterrows():
        res = pdquery(illustrations_db, "select * from illustrations where page = ?", params = (hit.page,))
        illustrations += [dict(record) for _, record in res.iterrows()]
    images = [get_urls_from_illustration_data(i, delta=50, cuts=cuts, part=part) for i in illustrations]
    return images

def get_pictures(text="sommeridyll", part = 300, cuts = False, hits = 10):
    """Search `bigdatabase` and return image URLs for the best hits.

    text is the full-text match expression run against `pictures_small`
    (up to 200 rows). The hits are ordered by rank and the first `hits` pages
    are looked up in the `illustrations` table of `illustrations_db`. part
    and cuts are passed on to get_urls_from_illustration_data, which is
    called with delta=50.

    When part is True (cut-outs) every illustration row of a page is used;
    otherwise (whole-page URLs) the rows are grouped so each page
    contributes a single URL.

    Returns a list of URLs.
    """
    # Bind the search expression as a parameter instead of pasting it into the
    # SQL text, so quotes in `text` cannot break or alter the statement.
    pics = pdquery(
        bigdatabase,
        "select page, rank from pictures_small where text match ? order by rank limit 200",
        params=(text,),
    )
    picture_pages = pics.sort_values(by="rank")

    # Same test get_urls_from_illustration_data uses to pick cut-out mode.
    # isinstance(part, int) cannot be used for this: bool is a subclass of
    # int, so True would be mistaken for a whole-page size.
    cut_out = (part == True)  # noqa: E712
    if cut_out:
        lookup = "select * from illustrations where page = ?"
    else:
        lookup = "select * from illustrations where page = ? group by page"

    illustrations = []
    for _, hit in picture_pages.head(hits).iterrows():
        res = pdquery(illustrations_db, lookup, params = (hit.page,))
        illustrations += [dict(record) for _, record in res.iterrows()]

    images = [get_urls_from_illustration_data(i, delta=50, cuts=cuts, part=part) for i in illustrations]
    return images

In [451]:
# Show cut-outs of illustrations whose surrounding text has these names near
# each other. The search string spells "østgaard" with a proper "ø"; the
# mis-decoded two-character form could not match the name.
display_finds(get_pictures(text= "NEAR(harald østgaard lund siv berg, 10)", cuts=True, part=True))