In [2]:
import dhlab as dh
import pandas as pd
import dhlab.api.dhlab_api as api
import requests
BASE_URL = api.BASE_URL

In [3]:
corpus = dh.Corpus(doctype="digibok", ddk="8*", limit = 500000, lang='nob', from_year = 1950, to_year=2022)

In [4]:
corpus.size

101367

In [25]:
pronouns = "han hun henne ham hennes hans jeg meg du deg vi oss der dit".split()

In [5]:
def get_document_frequencies(urns = None, cutoff= 0, words = None):
    """Fetch frequency counts of ``words`` in documents (``urns``).
    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/frequencies`.
    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted
    :param list words: a list of words to be counted - if left None, whole document is returned.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/frequencies", json=params)
    result = r.json()
    # check if words are passed - return differs a bit
    if words is None:
        structure = dict()
        for u in result:
            try:
                structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
            except IndexError:
                pass
        df = pd.DataFrame(structure)
        df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
    else:
        df = pd.DataFrame(result)
        df.columns = ["urn", "word", "count", "urncount"]
        #df = pd.pivot_table(df, values="count", index="word", columns="urn").fillna(0)
    return df

In [6]:
def countwords(urns=None, words=None):
    hifreq = get_document_frequencies(urns=list(urns), words= words)
    wordcounts = pd.pivot_table(hifreq, values="count", index="word", columns="urn").fillna(0).transpose()
    uc = pd.pivot_table(hifreq, values="urncount", index="word", columns="urn").fillna(0)
    urncounts = uc.max()
    c = pd.DataFrame()
    for x in wordcounts:
        c[x] = wordcounts[x]*100/urncounts
    return c

In [7]:
cf = corpus.frame[corpus.frame.langs=='nob']

In [8]:
hoel = list(cf[cf.authors.str.contains('Hoel')].urn)

In [9]:
dh.Models().models

['nb_core_news_lg',
 'da_core_news_lg',
 'nb_core_news_sm',
 'en_core_web_lg',
 'en_core_web_md',
 'da_core_news_trf']

In [102]:
class Character_analysis():
    
    
    def __init__(self, urn):
        pronouns = [x.strip() for x in "han hun henne ham hennes hans jeg meg min mitt du deg din ditt vi oss vårt der dit hit".split()]
   
        self.pronouns = pronouns
        self.pronouns.extend([x.capitalize() for x in pronouns])
        self.ner = dh.NER(urn=urn, model = 'nb_core_news_lg', start_page=1, to_page=500).ner
        self.pos = dh.POS(urn=urn, model = 'nb_core_news_lg', start_page=1, to_page=500).pos
        
        counts = self.pos[self.pos.pos=='PRON']
        self.pron_count = counts.loc[[x for x in pronouns if x in counts.index]].sum()['lemma']
        
        self.names = [x for x in list(self.ner[self.ner.ner=='PER']['token']) if x.capitalize() == x]
        name_count = self.ner.groupby('ner').count()
        self.name_count = name_count.loc[[x for x in self.names]].sum()['token']
 

In [83]:
hoels = {h:Character_analysis(h) for h in hoel[:10]}

In [86]:
pd.DataFrame({u:[hoels[u].pron_count, hoels[u].name_count] for u in hoels}).transpose()

Unnamed: 0,0,1
URN:NBN:no-nb_digibok_2009032700058,132,127
URN:NBN:no-nb_digibok_2009021004062,154,321
URN:NBN:no-nb_digibok_2007092500058,100,280
URN:NBN:no-nb_digibok_2010061223002,132,179
URN:NBN:no-nb_digibok_2017072407077,33,42
URN:NBN:no-nb_digibok_2010061223004,167,209
URN:NBN:no-nb_digibok_2013010906193,26,28
URN:NBN:no-nb_digibok_2013012805137,21,14
URN:NBN:no-nb_digibok_2013042405018,32,16
URN:NBN:no-nb_digibok_2015050408017,162,377


In [88]:
hamsun = dh.Corpus(author='knut hamsun', limit= 10).frame
hamsun

Unnamed: 0,dhlabid,urn,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
0,100136821,URN:NBN:no-nb_digibok_2013022509503,"""En ganske almindelig flue"" : en monolog om kj...","Høegh , Ketil / Hamsun , Knut",oai:nb.bibsys.no:990227939094702202,2af95017bb8b8e21853460e6cf98bb79,,[Tromsø],20020101,2002,Hålogaland teater,mul / nob / nno,,839.822,drama,Skjønnlitteratur,digibok,nb,20060101
1,100139760,URN:NBN:no-nb_digibok_2013031906016,Pan,"Hamsun , Knut",oai:nb.bibsys.no:990927991404702202,2405e249517e7441c2f1a41762f4f462,,[Oslo],20090101,2009,Gyldendal,nob,,839.823,novel,Skjønnlitteratur,digibok,nb,20060101
2,100561468,URN:NBN:no-nb_digibok_2009071300099,August Weltumsegler : Roman,"Hamsun , Knut",oai:nb.bibsys.no:999309869624702202,c093149a7286f0c79dfad08784a5a672,3423113200.0,München,19900101,1990,Deutscher Taschenbuch Verl.,ger,,,fiction,Skjønnlitteratur,digibok,nb,20060101
3,100310507,URN:NBN:no-nb_digibok_2016092848035,Noveller,"Hamsun , Knut / Hoel , Sigurd / Lie , Nils",oai:nb.bibsys.no:999404999244702202,d85ba4f579a7233e26e511869eb48506,8205110719.0,Oslo,19780101,1978,Gyldendal,nob,,839.93,fiction,Skjønnlitteratur,digibok,nb,20060101
4,100056761,URN:NBN:no-nb_digibok_2011090206001,Hamsuns beste,"Hamsun , Knut",oai:nb.bibsys.no:990508206994702202,87ef08583742d6f19edf02d51949869a,8205340781.0,[Oslo],20050101,2005,Gyldendal,nob,,839.823,novel,Skjønnlitteratur,digibok,nb,20060101
5,100362221,URN:NBN:no-nb_digibok_2018013048072,Samlede verker. B. 17 : Siesta ; Krattskog ; S...,"Hamsun , Knut / Larsen , Lars Frode",oai:nb.bibsys.no:990715760374702202,bedf1678383c7922b671a7291b1141de,,[Oslo],20070101,2007,Gyldendal,nob,norsk / litteratur / samlede / verker / skjønn...,839.823,fiction,Skjønnlitteratur,digibok,nb,20060101
6,100611608,URN:NBN:no-nb_digibok_2021101848073,Sitater : et utvalg språklige lykketreff,"Hamsun , Knut / Dvergsdal , Alvhild",oai:nb.bibsys.no:999919932603802202,d052d8168ab97048d365524045087711,,Hamarøy,2018,2018,Hamsunsenteret,nob,,839.8286,,Faglitteratur,digibok,nb,20060101
7,100344962,URN:NBN:no-nb_digibok_2017082148027,Markens grøde,"Hamsun , Knut",oai:nb.bibsys.no:999729319024702202,630f2f87a1ba583d6b298b142545598c,,Oslo,19630101,1963,,und,,,,Uklassifisert,digibok,nb,20060101
8,100207576,URN:NBN:no-nb_digibok_2014052308161,Samlede verker. B. 18 : Den gåtefulle ; Bjørge...,"Hamsun , Knut / Larsen , Lars Frode",oai:nb.bibsys.no:990715760614702202,ee8f0b96da65b9b7256b15803a7e85cb,,[Oslo],20070101,2007,Gyldendal,nob,norsk / litteratur / samlede / verker / skjønn...,839.823,fiction,Skjønnlitteratur,digibok,nb,20060101
9,100136402,URN:NBN:no-nb_digibok_2013012108009,Landstrykere,"Hamsun , Knut",oai:nb.bibsys.no:999302866424702202,03627e0c34fa8052f8aeaf6fc3a08a93,8252500676.0,,19660101,1966,Den norske bokklubben,nob,,839.823 / 839.93,fiction,Skjønnlitteratur,digibok,nb,20060101


In [92]:
hamsun.iloc[9]

dhlabid                                    100136402
urn              URN:NBN:no-nb_digibok_2013012108009
title                                   Landstrykere
authors                                Hamsun , Knut
oaiid            oai:nb.bibsys.no:999302866424702202
sesamid             03627e0c34fa8052f8aeaf6fc3a08a93
isbn10                                    8252500676
city                                                
timestamp                                   19660101
year                                            1966
publisher                      Den norske bokklubben
langs                                            nob
subjects                                            
ddc                                839.823 /  839.93
genres                                       fiction
literaryform                        Skjønnlitteratur
doctype                                      digibok
ocr_creator                                       nb
ocr_timestamp                               20

In [103]:
landstrykere = Character_analysis(hamsun.iloc[9].urn)

In [104]:
print(landstrykere.pron_count,
landstrykere.name_count)

140 0.0


In [101]:
landstrykere.names

['Edevart',
 'August',
 'Joakim',
 'Knoff',
 'Karolus',
 'Ezra',
 'Teodor',
 'Ragna',
 'Pauline',
 'Gud',
 'Håbjørg',
 'Gabrielsen',
 'Håkon',
 'Hosea',
 'Nils',
 'Norem',
 'Papst',
 'Magnus',
 'Josefine',
 'Ellingsen',
 'Karel',
 'Lorensen',
 'Augusts',
 'Martinus',
 'Guds',
 'Julie',
 'Romeo',
 'Tykje',
 'Beret',
 'Olga',
 'Skåros',
 'Hermine',
 'Poldens',
 'Vilde',
 'Magrete',
 'Napoleon',
 'Joakims',
 'Tilslut',
 'Joakim vilde',
 'Gabrielsens',
 'Edevarts',
 'Neinei',
 'Hansen',
 'Vorherre',
 'Fossen',
 'Otte',
 'Andresen',
 'Moses',
 'Seiv',
 'August skrytte',
 'Johnsen',
 'August seiv',
 'Magnus krambodgut',
 'Joakim blev',
 'Karolus seiv',
 'Nils blev',
 'Johnny',
 'Martinus halskars',
 'Martinus halskar',
 'Hindø',
 'Pauline vilde',
 'Hile',
 'Skibet',
 'Blev',
 'Ezra blev',
 'August vilde bryte',
 'Teodor blev',
 'Edevart biet',
 'Dokker vilde',
 'Dokker',
 'Teodors',
 'Ander',
 'Geviss',
 'Skåros ur',
 'Pokkers',
 'Amerikabrev',
 'Herrens',
 'Teodor tre',
 'Teodor vilde',
 'S