In [1]:
import dhlab as dh
import pandas as pd
import dhlab.api.dhlab_api as api
import requests
# Root URL of the API, taken from dhlab's own API module; the hand-written
# `requests` call further down builds its endpoint URL from this value.
BASE_URL = api.BASE_URL

In [2]:
# Corpus definition: doctype "digibok", language code "nob", published
# 1950-2022, at most 500000 documents.
# NOTE(review): ddk="8*" presumably selects Dewey class 8xx - confirm.
corpus = dh.Corpus(
    doctype="digibok",
    ddk="8*",
    limit=500000,
    lang="nob",
    from_year=1950,
    to_year=2022,
)

In [3]:
# Sanity check: size of the corpus that was actually fetched
# (the request above allowed up to 500000 documents).
corpus.size

101367

In [4]:
def get_document_frequencies(urns=None, cutoff=0, words=None):
    """Fetch frequency counts of ``words`` in documents (``urns``).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted
    :param list words: a list of words to be counted - if left None, whole document is returned.
    :return: a ``pandas.DataFrame``.

        * ``words`` given: long format, one row per record returned by the
          API, with the columns ``urn``, ``word``, ``count`` and ``urncount``.
          Pivoting is left to the caller.
        * ``words`` is None: wide format, one column per document and one row
          per word, sorted descending on the first document's counts, with
          missing values filled with 0.

        If the API returns no records the frame is empty (in the long format
        it still carries the four column names).
    :raises requests.HTTPError: if the API answers with an error status code.
    """
    # Spell the payload out rather than using locals(): a local variable added
    # above this line can then never leak into the request body.
    params = {"urns": urns, "cutoff": cutoff, "words": words}
    r = requests.post(f"{BASE_URL}/frequencies", json=params)
    # Fail here, with the HTTP status, instead of with an unrelated parsing or
    # shape error further down.
    r.raise_for_status()
    result = r.json()

    # check if words are passed - return differs a bit
    if words is None:
        structure = {}
        for u in result:
            try:
                # Each record is read as (urn, word, count); the urn of the
                # first record names the document's column.
                structure[u[0][0]] = {x[1]: x[2] for x in u}
            except IndexError:
                # Deliberate best effort: a document that came back without
                # (complete) records is left out of the table.
                continue
        df = pd.DataFrame(structure)
        # With no usable document there is no first column to sort on.
        if not df.empty:
            df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
    else:
        columns = ["urn", "word", "count", "urncount"]
        if result:
            df = pd.DataFrame(result)
            df.columns = columns
        else:
            # A zero-column frame cannot be given four names; build the empty
            # long-format table directly.
            df = pd.DataFrame(columns=columns)
    return df

In [5]:
def countwords(urns=None, words=None):
    """Relative frequency, in percent, of each of ``words`` in each document.

    The long-format counts are fetched with ``get_document_frequencies``.
    For every document each word's ``count`` is multiplied by 100 and divided
    by the largest ``urncount`` reported for that document.

    :param urns: iterable of document identifiers (required).
    :param list words: the words to count (required).
    :return: a ``pandas.DataFrame`` indexed by ``urn`` with one column per
        word. A word that was not reported for a document gets 0.
    :raises ValueError: if ``urns`` or ``words`` is None.
    """
    # Both defaults are None, but neither can work: list(None) fails, and
    # without words the fetched table has no "count" column to pivot.
    if urns is None or words is None:
        raise ValueError("countwords() needs both `urns` and `words`")

    hifreq = get_document_frequencies(urns=list(urns), words=words)
    if hifreq.empty:
        # Nothing was counted; there is no table to pivot.
        return pd.DataFrame()

    # urn x word table of raw counts.
    wordcounts = pd.pivot_table(
        hifreq, values="count", index="urn", columns="word"
    ).fillna(0)
    # One denominator per document, read straight from the long table.
    # NOTE(review): assumes `urncount` does not differ between duplicate
    # (urn, word) rows - confirm against the API.
    urncounts = hifreq.groupby("urn")["urncount"].max()

    # Single aligned division instead of growing a frame column by column.
    shares = wordcounts.mul(100).div(urncounts, axis=0)
    # Keep the column axis unnamed, as before.
    shares.columns.name = None
    return shares

In [6]:
# Per-document percentages (see countwords) for third-person pronouns,
# first/second-person pronouns and the adverbs "der"/"dit".
wc = countwords(
    urns=list(corpus.corpus.urn),
    words=[
        "han", "hun", "henne", "ham", "hennes", "hans",
        "jeg", "meg", "du", "deg", "vi", "oss",
        "der", "dit",
    ],
)

In [7]:
# Same computation restricted to the six third-person pronoun forms.
wc3 = countwords(
    urns=list(corpus.corpus.urn),
    words=["han", "hun", "henne", "ham", "hennes", "hans"],
)

In [8]:
# Add up the six percentages per document, then summarise how that total is
# distributed over the documents.
wc3.sum(axis="columns").describe()

count    99082.000000
mean         2.517593
std          1.306758
min          0.000459
25%          1.572796
50%          2.448588
75%          3.386231
max         10.432570
dtype: float64

In [9]:
# Same summary for the full 14-word table: per-document total, then its
# distribution over the documents.
wc.sum(axis="columns").describe()

count    100471.000000
mean          5.033751
std           1.779675
min           0.000578
25%           3.923619
50%           5.114185
75%           6.262884
max          20.361991
dtype: float64

In [10]:
# Reduced word set: subject pronouns plus "der"/"dit".
# NOTE: this rebinds `wc`; the 14-word table computed earlier is no longer
# available under that name once this cell has run.
wc = countwords(
    urns=list(corpus.corpus.urn),
    words=["han", "hun", "jeg", "du", "vi", "der", "dit"],
)

In [11]:
# Show the table: one row per urn, one column per word, values are the
# percentages computed by countwords.
wc

Unnamed: 0_level_0,der,dit,du,han,hun,jeg,vi
urn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100000000,1.010054,0.000000,0.181530,0.709831,0.409607,0.004655,0.004655
100000127,0.077551,0.018150,0.499959,0.301955,0.252454,1.831532,0.189753
100000256,0.125015,0.010565,0.170796,0.727202,0.242988,0.107408,0.044020
100000259,0.152889,0.013651,0.136508,0.556951,0.226603,0.010921,0.065524
100000274,0.108368,0.009602,0.218107,0.559671,0.039781,0.238683,0.186557
...,...,...,...,...,...,...,...
100613281,0.116362,0.007338,0.648901,1.455049,1.786313,0.561892,0.128942
100613282,0.508614,0.011455,0.077896,1.972599,0.152355,0.121426,0.297837
100613284,0.212160,0.013324,0.631355,0.924483,1.981182,0.879387,0.195761
100613286,0.245033,0.059603,0.331126,0.483444,0.920530,0.496689,0.715232


In [15]:
# Keep only the rows whose `langs` value is exactly "nob".
cf = corpus.frame.loc[corpus.frame["langs"] == "nob"]

In [27]:
# Identifiers of every document whose `authors` string contains "Hoel".
# na=False counts a missing author as "no match"; without it a single missing
# value puts NaN into the mask and the boolean indexing raises.
hoel = list(cf[cf.authors.str.contains('Hoel', na=False)].urn)

In [22]:
# List the model names reported by dhlab; one of them ("nb_core_news_lg") is
# passed as `model=` to the NER and POS calls below.
dh.Models().models

['nb_core_news_lg',
 'da_core_news_lg',
 'nb_core_news_sm',
 'en_core_web_lg',
 'en_core_web_md',
 'da_core_news_trf']

In [26]:
# Peek at one of the selected identifiers (second entry of `hoel`).
hoel[1]

100041949

In [32]:
# One NER result per identifier in `hoel`, in the same order, each limited to
# the page window given by start_page=2 / to_page=500.
# NOTE(review): presumably one remote call per document - may be slow; confirm.
hoel_ner = [
    dh.NER(urn=urn, model="nb_core_news_lg", start_page=2, to_page=500).ner
    for urn in hoel
]


In [34]:
# One POS result per identifier in `hoel`, in the same order, using the same
# model and page window (start_page=2 / to_page=500) as the NER run.
# NOTE(review): presumably one remote call per document - may be slow; confirm.
hoel_pos = [
    dh.POS(urn=urn, model="nb_core_news_lg", start_page=2, to_page=500).pos
    for urn in hoel
]


In [36]:
# Inspect the POS result for the document at index 10 of `hoel`.
hoel_pos[10]

Unnamed: 0,token,lemma,pos,dep,frekv
476,du,du,PRON,nsubj,90
963,og,og,CCONJ,cc,81
711,i,i,ADP,case,50
488,e,e,AUX,cop,40
1130,som,som,PRON,nsubj,35
...,...,...,...,...,...
524,ende,ende,NOUN,amod,1
520,elden,eld,NOUN,nsubj,1
519,einaste,einesge,ADJ,amod,1
517,egne,egen,DET,det,1


In [33]:
# Inspect the NER result for the first document in `hoel`.
hoel_ner[0]

Unnamed: 0,token,ner,frekv
1310,Håvard,PER,697
2344,ban,PER,195
2169,a,PER,179
1538,Kjersti,PER,133
2959,na,ORG,89
...,...,...,...
1603,Lavar,PER,1
1604,Ler,GPE_LOC,1
1605,Ler,PER,1
1606,Lerg,PROD,1
