# Søk i aviser

In [1]:
import json
import sqlite3
import requests
import pandas as pd
import dhlab.module_update as mu
mu.update('mods_meta')
import mods_meta as mm
import dhlab.nbtext as nb
from dhlab.nbtokenizer import tokenize
import re
from IPython.display import Markdown
from collections import Counter

Updated file `C:\Users\larsj\Documents\GitHub\aviser_trender\mods_meta.py`

### Fjern ellipser i pandas-rammer

In [2]:
# Show full cell contents in DataFrames: a max column width of None disables
# the "..." truncation of long strings (the concordance contexts are long).
pd.set_option('display.max_colwidth', None)

### Hjelpefunksjoner

In [6]:
def nb_search(
    term='',
    creator=None,
    number=50,
    page=0,
    title=None,
    mediatype=None,
    lang=None,
    period=None,
    random=False
):
    """Query the catalogue item search at api.nb.no and return the decoded JSON.

    Args:
        term: Search phrase. It is wrapped in double quotes and sent as ``q``.
            Pass ``None`` to send no ``q`` parameter at all.
        creator: If given, adds a ``creator:<value>`` filter.
        number: Requested hits per page; values above 50 are reduced to 50.
        page: Page number sent to the API.
        title: If given, adds a ``title:<value>`` filter.
        mediatype: If given, adds a ``mediatype:<value>`` filter.
        lang: If given, sent as ``languages:<value>`` in the ``aq`` parameter.
        period: Optional pair ``(date_from, date_to)`` turned into a
            ``date:[from TO to]`` filter.
        random: Sent unchanged as the ``random`` query parameter.

    Returns:
        Whatever ``Response.json()`` yields for the request.
    """
    # At most 50 hits are requested per page.
    # NOTE(review): the original author remarks that the page number is also
    # capped at 50 by the service; that limit is not checked here.
    params = {
        'page': page,
        'size': min(number, 50),
        'random': random,
    }

    # 'filter' entries, in the same order as before: title, creator, mediatype, date
    filters = []
    if title is not None:
        filters.append(f'title:{title}')
    if creator is not None:
        filters.append(f'creator:{creator}')
    if mediatype is not None:
        filters.append(f'mediatype:{mediatype}')
    if period is not None:
        filters.append(f'date:[{period[0]} TO {period[1]}]')
    if filters:
        params['filter'] = filters

    # the language restriction travels in 'aq', not in 'filter'
    if lang is not None:
        params['aq'] = [f'languages:{lang}']

    # the term is always sent as a quoted phrase
    if term is not None:
        params['q'] = f'"{term}"'

    r = requests.get("https://api.nb.no:443/catalog/v1/items", params=params)
    return r.json()

def find_urns_sesam(term=None, creator=None, number=50, page=0, lang=None, title=None, mediatype=None,
                    period=None, random=False):
    """Return the ``id`` of every item on one page of search results.

    All arguments are forwarded to ``nb_search``. The ids are read from
    ``reply['_embedded']['items']``.

    Returns:
        A list of ids; an empty list when the reply has no
        ``_embedded``/``items`` section or an item lacks ``id``.
    """
    reply = nb_search(
        term=term, creator=creator, title=title,
        number=number, page=page, mediatype=mediatype, period=period,
        lang=lang,
        random=random
    )
    try:
        return [item['id'] for item in reply['_embedded']['items']]
    except (KeyError, TypeError):
        # Reply without the expected structure (presumably: no hits) -> no ids.
        # Only lookup failures are absorbed; anything else propagates.
        return []


def find_items(term=None,
               creator=None,
               number=100,
               lang=None,
               title=None,
               mediatype=None,
               period=None
               ):
    """Collect up to ``number`` item ids matching a search specification.

    Pages of 50 hits are requested from ``nb_search`` starting at page 0 until
    enough ids are gathered or a page comes back short or empty. Every request
    uses the same page size so that consecutive pages line up, and the final
    list is cut to ``number``.

    Args:
        term, creator, lang, title, mediatype, period: Forwarded to ``nb_search``.
        number: Maximum number of ids to return. Zero or negative gives ``[]``
            without contacting the service.

    Returns:
        A list of at most ``number`` ids.
    """
    page_size = 50
    found = []
    page = 0
    while len(found) < number:
        reply = nb_search(
            term=term, creator=creator, title=title,
            number=page_size, page=page, mediatype=mediatype, period=period,
            lang=lang
        )
        try:
            items = reply['_embedded']['items']
        except (KeyError, TypeError):
            # no result section in the reply -> nothing more to collect
            break
        found.extend(item['id'] for item in items)
        if len(items) < page_size:
            # a short page means the result set is exhausted
            break
        page += 1

    return found[:number]

# Marks "no fallback supplied", so that each failing call gets its own fresh list.
_NO_DEFAULT = object()


def fetch_keys(m, path, delimiter="/", res=_NO_DEFAULT, start_list='#'):
    """Follow a delimited path through nested dicts and lists.

    Args:
        m: The nested structure to read from.
        path: Path elements joined by ``delimiter``. An element starting with
            ``start_list`` is a list index (``'#0'`` is index 0); any other
            element is used as a key.
        delimiter: Separator between path elements.
        res: Value returned when the path cannot be followed. When omitted, a
            new empty list is returned.
        start_list: Prefix that marks a path element as a list index.

    Returns:
        The value at the end of the path, or the fallback when a key is
        missing, an index is out of range, or an intermediate value cannot be
        indexed that way.

    Raises:
        ValueError: If an index element does not contain a valid integer.
    """
    current = m
    try:
        for step in path.split(delimiter):
            if step.startswith(start_list):
                # index selector: the text after the marker is the position
                current = current[int(step.split(start_list)[-1])]
            else:
                current = current[step]
    except (KeyError, IndexError, TypeError):
        return [] if res is _NO_DEFAULT else res
    return current
        
def find_item(data, item):
    """Collect the values stored under key ``item`` in nested dictionaries.

    Lists are searched element by element. For a dict, each value that is
    itself a dict containing ``item`` contributes ``value[item]`` and is not
    searched further; every other value is searched recursively. Keys of the
    outermost dict are therefore not matched, only keys of dicts nested
    inside it.

    Args:
        data: Nested structure of dicts, lists and scalar values.
        item: The key to look for.

    Returns:
        A list of the values found, in traversal order. Scalars yield ``[]``.
    """
    found = []
    if isinstance(data, list):
        for element in data:
            found += find_item(element, item)
    elif isinstance(data, dict):
        for value in data.values():
            # Only dicts are tested for the key: `item in value` on a string is
            # a substring test and on a number is an error.
            if isinstance(value, dict) and item in value:
                found.append(value[item])
            else:
                found += find_item(value, item)
    return found

def metadata(id):
    """Fetch one catalogue item and return selected metadata fields as a dict.

    The item is requested from the items endpoint at api.nb.no using ``id``,
    and each field is read from the JSON reply with ``fetch_keys``. A field
    that is absent gets whatever fallback ``fetch_keys`` returns.

    Returns:
        A dict with keys ``title``, ``contr`` (a list of ``(name, first role
        name)`` tuples, one per entry in ``metadata/people``), ``pages``,
        ``urn``, ``year``, ``topics``, ``genres`` and ``target_group``.
    """
    response = requests.get("https://api.nb.no:443/catalog/v1/items/" + str(id))
    record = response.json()

    fields = {}
    fields['title'] = fetch_keys(record, 'metadata/title')

    # one (name, role) pair per listed person
    contributors = []
    for person in fetch_keys(record, 'metadata/people'):
        contributors.append((fetch_keys(person, 'name'), fetch_keys(person, 'roles/#0/name')))
    fields['contr'] = contributors

    fields['pages'] = fetch_keys(record, 'metadata/physicalDescription/extent')
    fields['urn'] = fetch_keys(record, 'metadata/identifiers/urn')
    fields['year'] = fetch_keys(record, 'metadata/originInfo/issued')
    fields['topics'] = fetch_keys(record, 'metadata/subject/topics')
    fields['genres'] = fetch_keys(record, 'metadata/genres')
    fields['target_group'] = fetch_keys(record, 'metadata/targetAudienceNotes')
    return fields

def get_konks(urn, phrase, window=500, n=1000):
    """Fetch concordance lines for ``phrase`` within one catalogue item.

    The phrase is sent as a quoted query to the item's ``contentfragments``
    endpoint. Each returned fragment text is split around its first
    ``<em>...</em>`` span.

    Args:
        urn: Item identifier inserted into the request URL.
        phrase: Phrase to look up; wrapped in double quotes for the query.
        window: Sent as ``fragSize``.
        n: Sent as ``fragments``.

    Returns:
        A list of dicts with keys ``urn``, ``before``, ``word`` and ``after``.
        Fragments without a text or without a complete ``<em>...</em>`` span
        are skipped individually, so one bad fragment no longer discards the
        fragments that follow it.
    """
    query = {
        'q': '"' + phrase + '"',
        'fragments': n,
        'fragSize': window
    }
    r = requests.get("https://api.nb.no/catalog/v1/items/{urn}/contentfragments".format(urn=urn), params=query)
    reply = r.json()

    # a reply without fragments yields an empty result
    fragments = (reply.get('contentFragments') or []) if isinstance(reply, dict) else []

    results = []
    for fragment in fragments:
        try:
            text = fragment['text']
        except (KeyError, TypeError):
            continue
        if not isinstance(text, str):
            continue
        # before | <em> word </em> | after
        pieces = text.split('<em>')
        if len(pieces) < 2:
            continue
        marked = pieces[1].split('</em>')
        if len(marked) < 2:
            continue
        results.append({'urn': urn, 'before': pieces[0], 'word': marked[0], 'after': marked[1]})
    return results

def get_konkordanser(word='', urns=None, window=500, n=1000):
    """Gather concordance lines for ``word`` across several items.

    Args:
        word: Phrase passed to ``get_konks`` for every item.
        urns: Iterable of item identifiers. ``None`` (the default) is treated
            as "no items" and gives an empty result.
        window: Forwarded to ``get_konks``.
        n: Forwarded to ``get_konks``.

    Returns:
        One list holding the concordance dicts from all items, in the order
        the items were iterated.
    """
    if urns is None:
        return []
    konks = []
    for u in urns:
        konks.extend(get_konks(u, word, window=window, n=n))
    return konks

In [4]:
def count_from_conc(concordance):
    """Count the words in the contexts of a concordance.

    Args:
        concordance: DataFrame with columns ``word``, ``before`` and ``after``.

    Returns:
        The result of ``nb.frame_sort`` applied to a frame built from the token
        counts; the ``word`` of the first row is passed to ``nb.frame`` as the
        name.
    """
    word = concordance['word'].iloc[0]
    # Keep each context as its own piece. Adding the two columns element-wise
    # would glue the last word of 'after' onto the first word of 'before'.
    pieces = []
    for after_text, before_text in zip(concordance['after'].values, concordance['before'].values):
        pieces.append(after_text)
        pieces.append(before_text)
    return nb.frame_sort(nb.frame(Counter(tokenize(' '.join(pieces))), word))

def count_from_conc_window(concordance, before=5, after=5):
    """Count the words nearest the hit in each concordance line.

    Args:
        concordance: DataFrame with columns ``word``, ``before`` and ``after``.
        before: Number of tokens kept from the end of each ``before`` context.
            Zero or less keeps none.
        after: Number of tokens kept from the start of each ``after`` context.

    Returns:
        The result of ``nb.frame_sort`` applied to a frame of the token counts,
        with the ``word`` of the first row as second argument.
    """
    word = concordance['word'].iloc[0]

    after_tokens = [
        token
        for context in concordance['after'].values
        for token in tokenize(context)[:after]
    ]

    before_tokens = []
    if before > 0:
        # guard needed: a slice of [-0:] would return every token, not none
        before_tokens = [
            token
            for context in concordance['before'].values
            for token in tokenize(context)[-before:]
        ]

    # NOTE(review): `word` goes to nb.frame_sort here, whereas count_from_conc
    # passes it to nb.frame. Kept as is; confirm which one is intended.
    return nb.frame_sort(
        nb.frame(Counter(after_tokens + before_tokens)),
        word
    )

# Søk etter aviser

In [10]:
# Build a pool of newspaper item ids: 20 searches of up to 50 ids each, with
# random=True passed to the search. The set removes ids returned more than once.
rand = set()
for i in range(20):
    rand.update(
        find_urns_sesam(
            'skandinaviske* ide*',
            number=50,
            random=True,
            mediatype='aviser',
            title=None,
            period=(17900101, 19201231),
        )
    )

In [11]:
len(rand)

179

In [12]:
# Concordance for the search phrase over every id in `rand`, one row per hit
# with the columns urn, before, word and after.
konk = pd.DataFrame(get_konkordanser('skandinaviske* ide*', urns = rand, window = 500))

In [13]:
konk

Unnamed: 0,urn,before,word,after
0,9abc1040e2c5fca1ee2c8d1ce63c59a9,"den norske Ministerpræsident, den fhv. Antiskandinav Ha-anden til fortsatte nordiske Tilnærmelser, forende den",skandinaviske Ide,"fremad? Sandelig de gamle varmblodige, om end ofte «praktiske og doktrinære, Storslan» dinaver maa"
1,ee8e14c8d159f7c321272b3e7063c67e,samt praktiske Reformer til Samhold og Styrke mellem Broderfolkene indad som udad. Maaske var den,skandinaviske Ide,"dengang ikke moden til Gjennemftrclse, men ben egentlige Aarsag til, at Selskabets Virksomhed fik"
2,d90169b96a319c75e778aaf0511299a7,"endnu formaste sig til i deres Spalter at behandle Skandinavismen for et at sige ""den","skandinaviske Ide"",",faalebe« fom den fra danst Side nu kan behandles med en ikke faa ringe Tilsætning af
3,7bdc2a353d5a9789a091e9f69af55a6c,"paa hjer te, hvad liden kra \et ar ilein ; han belyste Btudenternes stilling til tien",skandinaviske ide,"og op fordrei alle i sandhetens, fwdrolandets < Btudentenerens navn til *- Baafreml ikke høiere"
4,0a25d64d17fbf7754554bf52c8322e64,"her tillands endnu ikke er kommen til nogen ""fuldkommen klar Bevidsthed"" med Hensyn til den","skandinaviske Ide,","kunde imidlertid vistnok mest passende vcrret borte, iscrr da Festen gjaldt Folk, der dog ikke"
...,...,...,...,...
236,db1be6e688ec918ad40b10c7886e1d88,"har indbudt Grundtvig, efter bet Sljenbffab ben übaatbiblelig maa have til bane Opfatning af den","skandinaviske Ide,","bet er bet fom glæber os saameget, fordi bi deri fe et Veviis paa, at den studerende"
237,0062c689b1f2aef2058d528db5e223bc,fornuftig Aarsag eller Anledning for den kongl. danske Regjering til at udtale sig om den,skandinaviske Ide,"og den af samme vakte Bevægelse; hvis der paahvilede den nogen Skygge af Mistanke om, at"
238,0062c689b1f2aef2058d528db5e223bc,"at rose „begge nordiske Suvercencr"" for den Klogskab, de bave vist, ved ligeover for den","skandinaviske Ide,","„at iagttage den passive Optræden, for hvilken de have bestemt sig."" Denne Ros, viser det sig"
239,0062c689b1f2aef2058d528db5e223bc,vi tro ved en anden Lejlighed er blevet sagt: at i Sommer have Nordens Konger lyst den,skandinaviske Ide,"i Kuld og Kjon. Naar Hr. v. Scheel altsaa nu roser Monarkerne for en Klogskab, som de"


In [14]:
def konks(x):
    """Return a concordance DataFrame for phrase ``x`` over the ids in ``rand``."""
    return pd.DataFrame(get_konkordanser(x, urns=rand, window=500))

### For kollokasjoner, lag en referanse


In [None]:
# Reference frequencies in a column named 'tot'; used below as the denominator
# when weighting collocations.
# NOTE(review): nb.totals(500) presumably returns the 500 most frequent words - confirm.
ref = nb.frame(nb.totals(500), 'tot')

### Hent en kollokasjon 

Kollokasjonene tar utgangspunkt i en konkordans. Parametrene `before` og `after` angir hvor mange ord før og etter treffet som telles med.

In [None]:
# Word counts from the 5 tokens before and the 5 tokens after each hit in `konk`.
coll = count_from_conc_window(konk, before = 5, after = 5)

In [None]:
def make_coll(word, urns, before = 5, after = 5, ref = ref):
    """Build a weighted collocation frame for ``word`` over the given items.

    Args:
        word: Phrase to build the concordance for.
        urns: Item identifiers to search in.
        before: Number of tokens counted before each hit.
        after: Number of tokens counted after each hit.
        ref: Frame with reference frequencies in a ``tot`` column. The default
            is the module-level ``ref`` as it was when this function was defined.

    Returns:
        The collocation frame with an added ``pmi`` column (count column ``0``
        divided by ``ref.tot``), passed through ``nb.frame_sort`` on ``'pmi'``.
    """
    konk = pd.DataFrame(get_konkordanser(word, urns = urns, window = 500))
    # use the caller's window sizes (they used to be fixed at 5 and 5)
    coll = count_from_conc_window(konk, before = before, after = after)
    coll['pmi'] = coll[0]/ref.tot
    return nb.frame_sort(coll, 'pmi')

### Lag en vektingskolonne

Kolonnen kalles `pmi` (etter *pointwise mutual information*) og er forholdet mellom kollokasjonsfrekvens og referansefrekvens:

$\textrm{pmi} = \frac{\textrm{collocation frequencies}}{\textrm{reference frequencies}}$

In [None]:
# Weight column: the collocation count (column 0) divided by the reference
# frequency in ref.tot.
coll['pmi'] = coll[0]/ref.tot

In [None]:
# Show the first 50 rows after nb.frame_sort on the 'pmi' column.
nb.frame_sort(coll, 'pmi').head(50)

In [None]:
# Column totals of the collocation frame.
coll.sum()