In [4]:
import dhlab.nbtext as nb
import json

In [2]:
def interesting_urns(df, limit):
    """Return the row labels of ``df`` with at least ``limit`` positive cells.

    Args:
        df: DataFrame whose index holds the URNs and whose cells are counts.
        limit: minimum number of cells in a row that must be greater than zero.

    Returns:
        List of row labels, in the order they appear in ``df``.
    """
    # Count, per row, how many columns hold a value above zero.
    positive_per_row = dict((df > 0).astype(int).sum(axis=1))
    return [label for label, hits in positive_per_row.items() if hits >= limit]

def get_para(para_list):
    """Fetch the paragraphs referenced by a list of stringified (urn, paragraph) pairs.

    Args:
        para_list: iterable of strings, each a Python literal that evaluates to
            a sequence whose first item is a URN and whose second item is a
            paragraph identifier.

    Returns:
        Dict mapping each URN (in order of first appearance) to whatever
        ``get_paragraphs`` returns for that URN and its paragraph identifiers.

    Raises:
        ValueError, SyntaxError: if an entry is not a valid Python literal.
    """
    import ast

    # Group paragraph identifiers per URN in a single pass; setdefault keeps
    # first-appearance order of the URNs and input order of the identifiers.
    grouped = dict()
    for p in para_list:
        item = ast.literal_eval(p)
        grouped.setdefault(item[0], []).append(item[1])

    # One service call per distinct URN.
    return {u: get_paragraphs(u, grouped[u]) for u in grouped}

def interesting_paragraphs(urns, wordbag, limit):
    """Collect the paragraph hits that reach a minimum size.

    Calls ``nb.wordbag_eval_para`` once per URN and keeps every entry of the
    result whose value has a length of at least ``limit``.

    Args:
        urns: iterable of URNs, evaluated one at a time.
        wordbag: word-bag definition passed unchanged to ``nb.wordbag_eval_para``.
        limit: minimum ``len()`` of an entry's value for it to be kept.

    Returns:
        Dict of the kept entries, keyed as returned by ``nb.wordbag_eval_para``.
    """
    selected = dict()
    for urn in urns:
        hits = nb.wordbag_eval_para(wordbag, [urn])
        selected.update(
            (key, hits[key]) for key in hits if len(hits[key]) >= limit
        )
    return selected

import requests
def get_paragraphs(urn, paras, timeout=60):
    """Return paragraphs for urn from the NB ngram paragraph service.

    Sends a GET request to ``https://api.nb.no/ngram/paragraphs`` with a JSON
    body holding the ``paragraphs`` and ``urn`` fields.

    Args:
        urn: identifier of the document.
        paras: paragraph identifiers to fetch for that document.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            whole notebook on a single stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    param = {'paragraphs': paras, 'urn': urn}
    r = requests.get("https://api.nb.no/ngram/paragraphs", json=param, timeout=timeout)
    return r.json()

In [3]:
def major_urns(df):
    """Return the row labels of ``df`` that lie above the mean in every column.

    Args:
        df: DataFrame indexed by URN, one column per theme word (or average).

    Returns:
        List of index labels, in ``df`` order, for which each column value is
        strictly greater than that column's mean over all rows.
    """
    column_means = df.mean()
    return [
        urn
        for urn in df.index
        if all(df[col][urn] > column_means[col] for col in column_means.index)
    ]

In [6]:
# The themes were saved from another notebook - Heidis definisjoner_0_november
# Read as UTF-8 explicitly so the result does not depend on the platform default encoding.
with open('heidis_temadefinisjoner_november.json', encoding='utf-8') as f:
    first = json.load(f)

import json

# Write a copy under a second filename; the context manager guarantees the
# file is flushed and closed before later cells run.
with open('heidis_temadefinisjoner.json', 'w', encoding='utf-8') as f:
    json.dump(first, f)

In [7]:
# Theme titles in the order they appear in the loaded definitions.
list_of_themes = [title for title in first.keys()]
list_of_themes

['“Kvindens Sandhed” (Minst 3 av 4).',
 '“Kvinde, lesning og religion” (Minst 4 av 5)',
 '“Forfatterinder og religion”  (Minst 3 av 4)',
 '“Kvindens posisjon” (Minst 4 av 6)',
 '“Kvinden og offentlig hygiene” (Minst 4 av 6)']

Her er URN-ene det skal søkes i.

In [8]:
# Fetch the book URNs to search in. The arguments go straight to nb.book_urn
# (presumably publication years 1830-1880 and at most 12000 hits - TODO confirm).
urns = nb.book_urn(period=(1830, 1880), limit=12000)

# Number of URNs actually returned.
len(urns)

6803

In [9]:

# Worked example of the threshold formula used in the main loop
# (round(90*len(wordbag)/100)): for a word bag with 7 entries the limit is 6.
round(90*7/100)

6

## Evaluering av bøker


Her er datarammen for klassifikasjon av bøker, med frekvens av hvert tema.

In [10]:
# Insert a theme name, or just use an index into the theme list as here - 0 is the first element, 1 the second, and so on.

theme_eval = nb.wordbag_eval(first[list_of_themes[0]], urns)

In [11]:
# Preview the first 10 rows of the evaluation frame.
theme_eval.head(10)

Unnamed: 0,Frihed_,Kvinde_,Legeme_,Sandhed_,snitt
2014062048059,699.0,676.0,597.0,483.0,613.75
2009091403021,342.0,1225.0,365.0,424.0,589.0
2008082512001,376.0,1085.0,484.0,400.0,586.25
2008100603020,419.0,433.0,451.0,954.0,564.25
2008111303008,298.0,95.0,785.0,704.0,470.5
2013042608062,472.0,391.0,536.0,440.0,459.75
2016102148034,430.0,443.0,484.0,392.0,437.25
2015010648113,445.0,389.0,512.0,384.0,432.5
2016101129001,526.0,92.0,416.0,694.0,432.0
2016051048081,519.0,88.0,413.0,696.0,429.0


### Hva er gjennomsnittet for temaene?

Gjennomsnittet tas over alle bøkene.

In [12]:
# Column-wise mean over all rows (books) of the evaluation frame.
theme_eval.mean()

Frihed_     23.937969
Kvinde_     45.470735
Legeme_     22.421878
Sandhed_    25.786893
snitt       29.404369
dtype: float64

# Se på effekten av `major_urns`  

Den plukker ut de URN-ene som har best sjanse for treff.

In [13]:
# Keep only the rows that major_urns selects (above the mean in every column).
# Despite the name, no sorting happens here: row order follows theme_eval.
sorted_urns = theme_eval.loc[major_urns(theme_eval)]


In [14]:
# Show the first 20 selected rows, shaded with a background colour gradient.
sorted_urns.head(20).style.background_gradient()

Unnamed: 0,Frihed_,Kvinde_,Legeme_,Sandhed_,snitt
2014062048059,699,676,597,483,613.75
2009091403021,342,1225,365,424,589.0
2008082512001,376,1085,484,400,586.25
2008100603020,419,433,451,954,564.25
2008111303008,298,95,785,704,470.5
2013042608062,472,391,536,440,459.75
2016102148034,430,443,484,392,437.25
2015010648113,445,389,512,384,432.5
2016101129001,526,92,416,694,432.0
2016051048081,519,88,413,696,429.0


In [15]:
# Column means of the selected subset, for comparison with the means over all books.
sorted_urns.mean()

Frihed_     106.886207
Kvinde_     198.120690
Legeme_      96.144828
Sandhed_    130.508621
snitt       132.915086
dtype: float64

# Hovedloop - generer avsnitt

In [None]:
# For every theme: evaluate all books, keep the books above the mean in every
# column, collect their qualifying paragraphs and write them to one JSON file
# per theme.
for theme in first:
    theme_eval = nb.wordbag_eval(first[theme], urns)
    sorted_urns = theme_eval.loc[major_urns(theme_eval)]
    print(theme,'antall urner', len(sorted_urns))
    # File name = theme title reduced to letters, digits and spaces.
    filename = "".join([c for c in theme if c.isalpha() or c.isdigit() or c==' ']).rstrip() + '.json'
    # Threshold handed to interesting_paragraphs: 90 % of the number of entries
    # in the theme's word bag, rounded. Constant per theme, so computed once.
    para_limit = round(90*len(first[theme])/100)
    theme_dict = dict()
    for urn in nb.pure_urn(list(sorted_urns.index)):
        try:
            paras = interesting_paragraphs([urn], first[theme], para_limit)
            if len(paras) > 0:
                print(', '.join([str(x) for x in nb.metadata(urn)[0]]), len(paras))
            theme_dict.update(paras)
        except Exception as err:
            # Best effort: skip this book and continue. Only Exception is caught
            # so that interrupting the kernel still stops the run. The bare URN
            # is reported because the metadata lookup may itself be what failed.
            print('problember med:', urn, repr(err))
    print('\n')
    # Context manager so each result file is flushed and closed immediately.
    with open(filename, 'w', encoding='utf-8') as outfile:
        json.dump(theme_dict, outfile)