# Søk i tidsskrift

In [1]:
import json
import sqlite3
import requests
import pandas as pd
import dhlab.module_update as mu
mu.update('mods_meta')
import mods_meta as mm
import dhlab.nbtext as nb
from dhlab.nbtokenizer import tokenize
import re
from IPython.display import Markdown
from collections import Counter

Updated file `C:\Users\yoons\Documents\GitHub\bokhylla_api_medietyper\mods_meta.py`

### fjern ellipser i pandarammer

In [2]:
# Lift the column-width limit so long text cells are displayed in full
# instead of being cut off with an ellipsis.
pd.set_option('display.max_colwidth', None)

### Hjelpefunksjoner

In [11]:
def nb_search(
    term = '',
    creator = None,
    number = 50,
    page = 0,
    title = None,
    mediatype = None,
    lang = None,
    period = None,
    random = False
):
    """Search the api.nb.no catalog for items and return the decoded JSON.

    Parameters
    ----------
    term : free-text query, sent as ``q`` unless it is None.
    creator, title, mediatype : optional values, each sent as a
        ``<field>:<value>`` entry of the ``filter`` parameter.
    number : requested hits per page; values above 50 are lowered to 50.
    page : page index, passed through unchanged (the original note says
        the page index also has a maximum of 50, which is not checked).
    lang : optional language, sent as ``languages:<lang>`` in ``aq``.
    period : optional pair ``(date_from, date_to)``, sent as a
        ``date:[from TO to]`` filter.
    random : passed through as the ``random`` parameter.

    Returns
    -------
    The response body parsed with ``.json()``.
    """
    # Never ask for more than 50 hits per page.
    page_size = min(number, 50)

    query = {
        'page': page,
        'size': page_size,
        'random': random,
    }

    # Plain "field:value" filters, kept in the order title, creator, mediatype.
    simple_fields = (
        ('title', title),
        ('creator', creator),
        ('mediatype', mediatype),
    )
    filter_clauses = [
        f'{field}:{value}'
        for field, value in simple_fields
        if value is not None
    ]
    if period is not None:
        filter_clauses.append(f'date:[{period[0]} TO {period[1]}]')

    # The language restriction travels in the separate 'aq' parameter.
    aq_clauses = [f'languages:{lang}'] if lang is not None else []

    if filter_clauses:
        query['filter'] = filter_clauses
    if aq_clauses:
        query['aq'] = aq_clauses
    if term is not None:
        query['q'] = term

    response = requests.get("https://api.nb.no:443/catalog/v1/items", params = query)
    return response.json()

def find_urns_sesam(term = None, creator = None, number=50, page=0, lang = None, title = None, mediatype = None, 
                    period = None, random = False):
    """Return the ``id`` of every item on one page of a catalog search.

    All arguments are forwarded unchanged to ``nb_search``.

    Returns
    -------
    list
        The ``id`` values found under ``_embedded`` / ``items`` in the
        response, or an empty list when the response does not have that
        structure (presumably the case when the search has no hits).
    """
    result = nb_search(
        term = term, creator = creator, title = title,
        number = number, page = page, mediatype = mediatype, period = period,
        lang = lang,
        random = random
    )
    try:
        return [hit['id'] for hit in result['_embedded']['items']]
    except (KeyError, TypeError):
        # Only a missing or oddly shaped result section is treated as
        # "no hits"; anything else (e.g. KeyboardInterrupt) propagates.
        return []


def find_items(term = None, 
               creator = None, 
               number = 100, 
               lang = None, 
               title = None, 
               mediatype = None, 
               period = None
              ):
    """Collect up to ``number`` item ids for a search specification.

    The search is run page by page through ``nb_search`` with a fixed page
    size of 50, so that page ``k`` always starts at hit ``50 * k``. The
    combined result is cut to ``number`` ids at the end.

    Parameters
    ----------
    term, creator, lang, title, mediatype, period :
        forwarded unchanged to ``nb_search``.
    number : maximum number of ids to return; values <= 0 give ``[]``.

    Returns
    -------
    list
        Item ids in the order the API returned them. The list is shorter
        than ``number`` when the search runs out of hits.
    """
    page_size = 50
    if number <= 0:
        return []

    # Number of pages needed to cover ``number`` hits (ceiling division).
    n_pages = -(-number // page_size)

    ids = []
    for page in range(n_pages):
        result = nb_search(
            term = term, creator = creator, title = title,
            number = page_size, page = page, mediatype = mediatype, period = period,
            lang = lang
        )
        try:
            items = result['_embedded']['items']
        except (KeyError, TypeError):
            # No result section on this page: nothing more to fetch.
            break
        ids.extend(hit['id'] for hit in items)
        if len(items) < page_size:
            # A short page means the hits are exhausted.
            break

    return ids[:number]

# Private marker for "no fallback supplied", so that each failing call can
# hand out its own fresh empty list instead of one shared default object.
_FETCH_KEYS_DEFAULT = object()


def fetch_keys(m, path, delimiter = "/", res = _FETCH_KEYS_DEFAULT, start_list = '#'):
    """Follow a delimited path through nested dicts/lists and return the value.

    Parameters
    ----------
    m : the nested structure to look into.
    path : path elements joined by ``delimiter``. An element that starts
        with ``start_list`` is an array selector: the text after the marker
        is converted to ``int`` and used as an index (``'#0'`` is index 0).
        Every other element is used as a key.
    delimiter : separator between path elements.
    res : value returned when the path cannot be followed. When omitted,
        a new empty list is returned.
    start_list : prefix that marks an array selector.

    Returns
    -------
    The value at the end of the path, or the fallback when a key is
    missing, an index is out of range, an intermediate value cannot be
    indexed that way, or a selector is not an integer.
    """
    # get the sequence of path elements
    steps = path.split(delimiter)

    current = m
    try:
        for step in steps:
            if step.startswith(start_list):
                # then the element is an array selector
                current = current[int(step.split(start_list)[-1])]
            else:
                current = current[step]
    except (KeyError, IndexError, TypeError, ValueError):
        return [] if res is _FETCH_KEYS_DEFAULT else res
    return current
        
def find_item(data, item):
    """Collect the values stored under key ``item`` in nested dicts.

    Lists are searched element by element. For a dict, every value is
    inspected: when the value is itself a dict that has ``item`` as a key,
    ``value[item]`` is collected and that dict is not searched any deeper;
    otherwise the value is searched recursively. Strings, numbers and other
    non-container values contribute nothing.

    NOTE(review): as in the original, a key ``item`` directly on the
    outermost dict is not collected, only on dicts nested inside it.

    Returns
    -------
    list
        The collected values in traversal order.
    """
    found = []
    if isinstance(data, list):
        for element in data:
            found.extend(find_item(element, item))
    elif isinstance(data, dict):
        for value in data.values():
            # Require a dict: ``in`` on a str or list would be a substring or
            # element test, and on a number it raises TypeError.
            if isinstance(value, dict) and item in value:
                found.append(value[item])
            else:
                found.extend(find_item(value, item))
    return found

def metadata(id):
    """Fetch one catalog item and return a summary dict of its metadata.

    The item is requested from ``https://api.nb.no:443/catalog/v1/items/<id>``
    and the fields below are read from the JSON with ``fetch_keys``, so a
    field missing from the record gets that function's fallback value.

    Returns
    -------
    dict with the keys ``title``, ``contr`` (a list of
    ``(name, name of first role)`` pairs, one per entry in
    ``metadata/people``), ``pages``, ``urn``, ``year``, ``topics``,
    ``genres`` and ``target_group``.
    """
    base_url = "https://api.nb.no:443/catalog/v1/items/"
    record = requests.get(base_url + str(id)).json()

    # One (name, first role) pair per listed person.
    contributors = []
    for person in fetch_keys(record, 'metadata/people'):
        pair = (fetch_keys(person, 'name'), fetch_keys(person, 'roles/#0/name'))
        contributors.append(pair)

    summary = {}
    summary['title'] = fetch_keys(record, 'metadata/title')
    summary['contr'] = contributors
    summary['pages'] = fetch_keys(record, 'metadata/physicalDescription/extent')
    summary['urn'] = fetch_keys(record, 'metadata/identifiers/urn')
    summary['year'] = fetch_keys(record, 'metadata/originInfo/issued')
    summary['topics'] = fetch_keys(record, 'metadata/subject/topics')
    summary['genres'] = fetch_keys(record, 'metadata/genres')
    summary['target_group'] = fetch_keys(record, 'metadata/targetAudienceNotes')
    return summary

def _split_fragment(text):
    """Split a highlighted fragment into (before, word, after), or None if it has no <em>...</em> pair."""
    pieces = text.split('<em>')
    if len(pieces) < 2:
        return None
    marked = pieces[1].split('</em>')
    if len(marked) < 2:
        return None
    return pieces[0], marked[0], marked[1]


def get_konks(urn, phrase, window = 500, n = 1000):
    """Fetch concordance lines for an exact phrase in one item.

    Requests the ``contentfragments`` endpoint for the item and splits each
    returned fragment text around its first ``<em>...</em>`` highlight.

    Parameters
    ----------
    urn : item identifier inserted into the request URL.
    phrase : phrase to search for; it is wrapped in double quotes.
    window : sent as ``fragSize``.
    n : sent as ``fragments``.

    Returns
    -------
    list of dict
        One ``{'urn', 'before', 'word', 'after'}`` dict per usable fragment.
        Fragments without a text or without a highlight are skipped one by
        one, so a single malformed fragment no longer discards the ones
        that follow it. A response without ``contentFragments`` gives ``[]``.
    """
    query = {
        'q': '"' + phrase + '"',
        'fragments': n,
        'fragSize': window
    }
    r = requests.get("https://api.nb.no/catalog/v1/items/{urn}/contentfragments".format(urn=urn), params = query)
    payload = r.json()

    fragments = payload.get('contentFragments') if isinstance(payload, dict) else None

    results = []
    for fragment in fragments or []:
        text = fragment.get('text') if isinstance(fragment, dict) else None
        if not isinstance(text, str):
            continue
        parts = _split_fragment(text)
        if parts is None:
            continue
        before, word, after = parts
        results.append({'urn': urn, 'before': before, 'word': word, 'after': after})
    return results

def get_konkordanser(word = '', urns = None, window = 500, n = 1000):
    """Collect concordance lines for ``word`` across several items.

    Parameters
    ----------
    word : phrase forwarded to ``get_konks``.
    urns : iterable of item identifiers; ``None`` (the default) is treated
        as "no items" and yields an empty result instead of a TypeError.
    window, n : forwarded to ``get_konks``.

    Returns
    -------
    list
        The results of ``get_konks`` for every identifier, concatenated in
        iteration order.
    """
    konks = []
    for u in (urns if urns is not None else ()):
        konks.extend(get_konks(u, word, window = window, n = n))
    return konks

In [4]:
def count_from_conc(concordance):
    """Count the tokens in the contexts of a concordance.

    ``concordance`` is a DataFrame with the columns 'word', 'before' and
    'after'. All context strings are joined with single spaces and
    tokenized as one text; the token counts are passed to ``nb.frame``
    together with the first row's 'word', and the result goes through
    ``nb.frame_sort``.

    The contexts are joined with a separator on purpose: adding the
    'after' and 'before' columns element-wise glued the last token of one
    context to the first token of the other.
    """
    # Positional access, so a frame whose index does not start at 0 works too.
    word = concordance['word'].iloc[0]

    # Keep the per-row order after, before.
    contexts = []
    for after_text, before_text in zip(concordance['after'].values, concordance['before'].values):
        contexts.append(after_text)
        contexts.append(before_text)

    counts = Counter(tokenize(' '.join(contexts)))
    return nb.frame_sort(nb.frame(counts, word))

def count_from_conc_window(concordance, before = 5, after = 5):
    """Count the tokens within a window around the hits of a concordance.

    ``concordance`` is a DataFrame with the columns 'word', 'before' and
    'after'. For every row, the first ``after`` tokens of the 'after'
    context and the last ``before`` tokens of the 'before' context are
    counted.

    Parameters
    ----------
    before : number of tokens taken from the end of each 'before' context;
        0 or less takes none (the plain slice ``[-0:]`` would have taken
        every token).
    after : number of tokens taken from the start of each 'after' context;
        0 or less takes none.

    Returns
    -------
    The result of ``nb.frame_sort(nb.frame(counts), word)``, where ``word``
    is the first row's 'word' value.
    """
    # Positional access, so a frame whose index does not start at 0 works too.
    word = concordance['word'].iloc[0]

    after_tokens = []
    if after > 0:
        for context in concordance['after'].values:
            after_tokens.extend(tokenize(context)[:after])

    before_tokens = []
    if before > 0:
        for context in concordance['before'].values:
            before_tokens.extend(tokenize(context)[-before:])

    return nb.frame_sort(
            nb.frame(
                Counter(after_tokens + before_tokens)
            ),
            word
        )

# Søk i tidsskrift

In [14]:
# Search the catalog for 'allehånde' with mediatype 'tidsskrift';
# x holds the decoded JSON response.
x = nb_search(term='allehånde', mediatype='tidsskrift')

In [20]:
# NOTE(review): the recorded output is [], i.e. the fallback value. This path
# is the one metadata() reads from a single item record, while search hits
# are read from '_embedded'/'items' in find_urns_sesam.
fetch_keys(x, 'metadata/identifiers/urn')

[]

In [8]:
# Collect item ids for 'vi' in 'tidsskrift' items with title 'blikk'.
# random = True is passed on to the API (presumably asking for a random
# selection of hits - TODO confirm). The loop runs once; with a larger range
# the set union would merge the ids of several draws without duplicates.
rand = set()
for i in range(1):
    rand |= set(find_urns_sesam('vi',  number = 50, random = True, mediatype = 'tidsskrift', title = 'blikk'))

In [9]:
len(rand)

50

In [10]:
# Metadata summary for every sampled id, keyed by id
# (metadata() issues one HTTP request per item).
meta =  {x:metadata(x) for x in rand}
meta

{'7464a037c599594464e3269a70020bc3': {'title': 'Blikk (trykt utg.). 1996 Nr. 1',
  'contr': [],
  'pages': [],
  'urn': 'URN:NBN:no-nb_digitidsskrift_2019091181060_001',
  'year': '1996',
  'topics': ['homoseksualitet', 'homofili', 'tidsskrifter'],
  'genres': ['periodical'],
  'target_group': []},
 '0bf8a20978e74e3181d536c81d16b80e': {'title': 'Blikk (trykt utg.). 2007 Nr. 3',
  'contr': [],
  'pages': [],
  'urn': 'URN:NBN:no-nb_digitidsskrift_2019100481106_001',
  'year': '2007',
  'topics': ['homoseksualitet', 'homofili', 'tidsskrifter'],
  'genres': ['periodical'],
  'target_group': []},
 '65f6996e57694e86056739ca4bd2d152': {'title': 'Blikk (trykt utg.). 1992 Nr. 2',
  'contr': [],
  'pages': [],
  'urn': 'URN:NBN:no-nb_digitidsskrift_2019091181035_001',
  'year': '1992',
  'topics': ['homoseksualitet', 'homofili', 'tidsskrifter'],
  'genres': ['periodical'],
  'target_group': []},
 '76c9603671eb7406db2c3cdfa7d8a27c': {'title': 'Blikk (trykt utg.). 2007 Nr. 10',
  'contr': [],
  '

In [7]:
# Concordance for 'vi' across all sampled items: one row per hit with the
# columns urn, before, word and after.
konk = pd.DataFrame(get_konkordanser('vi', urns = rand, window = 500))

In [36]:
def konks(x):
    """Return a DataFrame with the concordance for phrase ``x`` across the items in ``rand``."""
    return pd.DataFrame(get_konkordanser(x, urns = rand, window = 500))

In [8]:
konk

Unnamed: 0,urn,before,word,after
0,274c194c0670b84625233d15dbc2a204,ungdom er ei ekstra sårbar gruppe: «[...] de lurer på mange hjerteskjærende ting. For eksempel fikk,vi,spørsmålet ‘hvordan kan jeg stoppe å være homofil?’ Mange skulle ønske at de var noe annet enn
1,274c194c0670b84625233d15dbc2a204,som tenker helhetlig.,Vi,trenger trojanske hester som løfter lhbt-temaet inn på arenaer
2,274c194c0670b84625233d15dbc2a204,"Skeiv verden og Skeiv ungdom skal utradere skammen og mobbingen alene, er uansvarlig av regjeringen.",Vi,forventer at de rødgrønne nå lager en konkret og helhetlig lhbt-plan der tiltakene i skolen blir
3,274c194c0670b84625233d15dbc2a204,år siden homoforbudet i straffelo- flere radiohits bak seg. ven ble opphevet. Med oss på laget har,vi,"fått Mer sang blir det når Norges lekreste homseden største homoforkjemperen i Norge, Kim Fri- kor,"
4,274c194c0670b84625233d15dbc2a204,"Morten Skogmus og Klatremus er som kjent venner. - Forholdet deres er ikke helt definert, men åpent.",Vi,"får ikke vite om deres sivile status, og det kan man bruke til sin egen fordel. Klatremus"
...,...,...,...,...
6228,15978476302695a4b857b76456d7945b,"sier han. gg "" Hamlet, trommer B .< Morsomt seg til en hybrid av funk, jazz-ele- Så far",vi,"håpe Sugarpops-budskapet menter og jungle, alt innpakket i nådde fram til homsene. I hvertfall 90-tallslyd,"
6229,15978476302695a4b857b76456d7945b,Sugarpops at det var mortydelig satte pris på. - Det var et somt å spille på Castro. -,Vi,"kjengodt publikum, sier Malika, grup- ner mange i miljøet så det er vår .yl|| .|B pas eneste"
6230,15978476302695a4b857b76456d7945b,en i dykker- re fra Amsterdam og Berlin ker til at våre deiligste og Store under- drakt og,vi,så også en kjekk - helt til du spaserer ut i skitneste fantasier kan leves holdninasiiODO- sikkerhetsvakt
6231,15978476302695a4b857b76456d7945b,nnom uten synlig kles- Oslo som for tida definitivt og pusset opp huset med bar utsida møtte,vi,"en fyr som ode, er det alltid mye gir en følelse av å være iet nede, skikkelig pisserenne,"


### For kollokasjoner, lag en referanse


In [33]:
# Reference frequencies as a frame with the column 'tot' (read below as ref.tot).
# presumably nb.totals(50000) gives the 50000 most frequent words - TODO confirm in dhlab
ref = nb.frame(nb.totals(50000), 'tot')

### Hent en kollokasjon 

Kollokasjonene tar utgangspunkt i en konkordans. Parametre er før og etter

In [32]:
# Collocation counts for the 'vi' concordance: 5 tokens on each side of every hit.
coll = count_from_conc_window(konk, before = 5, after = 5)

In [41]:
def make_coll(word, urns, before = 5, after = 5, ref = ref):
    """Build a weighted collocation table for ``word`` over the given items.

    Parameters
    ----------
    word : phrase to build the concordance for.
    urns : iterable of item identifiers passed to ``get_konkordanser``.
    before, after : window sizes in tokens, forwarded to
        ``count_from_conc_window`` (previously ignored in favour of a
        hard-coded 5/5).
    ref : reference frame with a ``tot`` column; the default is the global
        ``ref`` as it was when this function was defined.

    Returns
    -------
    The collocation frame with an added 'pmi' column (count in column 0
    divided by ``ref.tot``), passed through ``nb.frame_sort(coll, 'pmi')``.
    """
    konk = pd.DataFrame(get_konkordanser(word, urns = urns, window = 500))
    coll = count_from_conc_window(konk, before = before, after = after)
    coll['pmi'] = coll[0]/ref.tot
    return nb.frame_sort(coll, 'pmi')

### lager en vektingskolonne

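Kolonnen kalles `pmi` (etter «pointwise mutual information»), men beregnes her som et enkelt forholdstall: kollokasjonsfrekvens delt på referansefrekvens.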

$\textrm{pmi} = \frac{\textrm{collocation frequencies}}{\textrm{reference frequencies}}$

In [34]:
# Weight column: each collocate's count (column 0) divided by the 'tot'
# value of ref.
coll['pmi'] = coll[0]/ref.tot

In [35]:
nb.frame_sort(coll, 'pmi').head(50)

Unnamed: 0,0,pmi
lesbiske,43,0.001595
VW,33,0.001386
Fjeldstad,29,0.001279
S2,30,0.00106
EE,38,0.000952
gratulerer,23,0.000941
homofile,54,0.000901
homofil,21,0.0009
Pf,29,0.000896
samboende,15,0.000632


In [10]:
coll.sum()

0    57164
dtype: int64

In [97]:
make_coll('homse', rand).head(20)

ConnectionError: ('Connection aborted.', OSError(10051, 'Det ble forsøkt en kontaktoperasjon til et nettverk som ikke kunne nås', None, 10051, None))

In [45]:
make_coll('dem', rand).head(20)

Unnamed: 0,0,pmi
eda,16,0.000432
send,48,0.00035
nålene,8,0.000349
lesbiske,9,0.000334
Folke-,26,0.000329
Send,19,0.000219
homofile,13,0.000217
ågi,14,0.00018
stjeler,8,0.000142
lovet,59,0.000111


In [46]:
make_coll('lyst', rand).head(20)

Unnamed: 0,0,pmi
Mimi,14,0.000496
kvinnelig,21,0.000115
lesbiske,3,0.000111
kloden,4,5.3e-05
morsomste,1,4.3e-05
homofil,1,4.3e-05
diskriminerende,1,4.3e-05
dent,1,4.3e-05
bankkonto,1,4.2e-05
glitter,1,4.2e-05


In [48]:
make_coll('penger', rand).head(20)

Unnamed: 0,0,pmi
Fo,28,0.000698
18.00,5,0.000159
dj,6,0.000126
hiv,3,9.1e-05
2.500,1,4.5e-05
investerte,1,4.4e-05
oppfordringen,1,4.4e-05
Records,1,4.1e-05
finansierer,1,4.1e-05
Workshop,1,3.8e-05


In [49]:
make_coll('aids', rand).head(20)

Unnamed: 0,0,pmi
lesbiske,112,0.004154
hiv,127,0.003847
informerer,82,0.003003
STATENS,81,0.002896
homofile,113,0.001885
Landsforeningen,22,0.000865
HELSE,20,0.000848
Opera,28,0.0005
Aires,17,0.000433
forsvinn,11,0.000426


In [50]:
make_coll('hiv', rand).head(20)

Unnamed: 0,0,pmi
aids,129,0.003722
adopsjon,121,0.002617
fei,121,0.001494
HELSE,31,0.001315
sammen-,22,0.000949
Gratis,16,0.000712
smitter,20,0.000686
ds,121,0.000662
Pluss,18,0.000647
Rica,17,0.000594
