# Notebook for meaning and variation

Turku/Åbo 2017

This notebook is for investigating texts from the collection of digitized texts at the Norwegian National Library: http://www.nb.no. Access is via an API at http://api.nb.no/ngram, to which the code below provides an interface. Some of the texts (books) can be freely downloaded from https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-34&lang=en. 

There is code for fetching references to books via URNs, plotting trendlines for words, and getting concordances.

One way of comparing texts is to look at the frequencies of a selected set of words. For this purpose heatmaps are used, which give a kind of visual relativization of the data. 



In [1]:
# Boilerplate imports

import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from IPython.display import HTML
from scipy.spatial.distance import cosine

sns.set_style('white')
%matplotlib inline

 ## Program code (activate and scroll past it ...)
 
 The functions defined below provide an interface, and serve as an example of how to use the API.

In [2]:

def get_ngrams(words, params={}, delta='og'):
    """Fetch n-gram data for several words, optionally relative to a reference word.

    One GET request is sent to ``https://api.nb.no/ngram/ngram`` per word.

    Parameters
    ----------
    words : sequence of str
        Words to look up.
    params : dict, optional
        Extra query parameters sent with every request (the notebook passes
        e.g. ``{'corpus': 'bok'}``). The dict is copied, never modified.
    delta : str, optional
        Reference word; every word column is divided by this word's column.
        Pass ``""`` to get the undivided data; no reference word is requested
        in that case.

    Returns
    -------
    pandas.DataFrame
        One column per word whose request returned status 200. If no request
        succeeded, the last response object is returned instead so the caller
        can inspect the failure (``None`` when there was nothing to request).

    Raises
    ------
    KeyError
        If ``delta`` is non-empty but its own request did not succeed.
    """
    import requests
    import pandas as pd

    # Work on a copy: writing 'word' into the caller's dict (or into the shared
    # default dict) would leak state from one call into the next.
    query = dict(params)
    wanted = list(words)
    targets = wanted + ([delta] if delta != "" else [])

    result = dict()
    r = None
    for word in targets:
        query['word'] = word
        r = requests.get("https://api.nb.no/ngram/ngram", query)
        if r.status_code == 200:
            result[word] = r.json()

    if not result:
        return r

    temp = pd.DataFrame(result)
    if delta != "":
        # Only divide the words that actually came back; a single failed word
        # should not discard the rest of the result.
        found = [word for word in wanted if word in result]
        res = temp[found].divide(temp[delta], axis=0)
    else:
        res = temp
    return res

def get_ngram(word, params=dict()):
    """Fetch the n-gram data for a single word as a DataFrame.

    Parameters
    ----------
    word : str
        The word to look up.
    params : dict, optional
        Extra query parameters (the notebook passes e.g. ``{'corpus': 'bok'}``).
        The dict is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The JSON response turned into a frame with the response keys as a
        sorted index.
    """
    import requests
    import pandas as pd

    # Copy before adding 'word': aliasing the argument would write into the
    # caller's dict and into the shared default, so keys from one call would
    # silently carry over to later calls.
    query = dict(params)
    query['word'] = word
    response = requests.get('https://api.nb.no/ngram/ngram', params=query)
    # The column label is deliberately left at its default: plotrelative divides
    # two of these frames, and pandas aligns that division on column labels.
    return pd.DataFrame.from_dict(response.json(), orient='index').sort_index()


def heatmap(df, color='red'):
    """Return *df* styled as a heatmap.

    Missing values are replaced by 0, then the cells get a background
    gradient taken from a seaborn light palette built from *color*.
    """
    styler = df.fillna(0).style
    palette = sns.light_palette(color, as_cmap=True)
    return styler.background_gradient(cmap=palette)

def get_freq(words, params=dict()):
    """Request frequencies for a list of words.

    Parameters
    ----------
    words : list of str
        Words to look up; sent under the key ``'words'``.
    params : dict, optional
        Further fields for the JSON request body (the notebook passes
        ``{'urn': ...}``). The dict is copied, never modified.

    Returns
    -------
    The decoded JSON response of ``https://api.nb.no/ngram/freq``.
    """
    import requests

    # Copy first so neither the caller's dict nor the shared default picks up
    # the 'words' key from this call.
    payload = dict(params)
    payload['words'] = words
    response = requests.post('https://api.nb.no/ngram/freq', json=payload)
    return response.json()

def get_urns(params=dict()):
    """Query ``https://api.nb.no/ngram/urn`` and return the decoded JSON.

    *params* is sent unchanged as the JSON body of a GET request; the
    notebook uses keys such as ``'title'``, ``'author'``, ``'year'`` and
    ``'next'``.
    """
    import requests

    response = requests.get('https://api.nb.no/ngram/urn', json=params)
    return response.json()

def get_urnfreq(urn, top=10, cutoff=10):
    """Return the decoded JSON from ``https://api.nb.no/ngram/urnfreq`` for one URN.

    *urn*, *top* and *cutoff* are sent as the JSON body of a GET request.
    The notebook describes the result as the top N words of the book.
    """
    payload = {'urn': urn, 'top': top, 'cutoff': cutoff}
    response = requests.get('https://api.nb.no/ngram/urnfreq', json=payload)
    return response.json()
    

def get_konk(word, url='konk',  params=dict(), html=True):
    """Fetch a concordance for *word*.

    Parameters
    ----------
    word : str
        The key word.
    url : str, optional
        Last path segment of the endpoint below ``https://api.nb.no/ngram/``.
    params : dict, optional
        Extra query parameters (the notebook passes e.g. ``'corpus'``,
        ``'before'``, ``'after'``, ``'size'``). The dict is copied, never
        modified.
    html : bool, optional
        If true, return an HTML ``<table>`` string with one row per hit
        (link, left context, key word, right context). Otherwise return a
        DataFrame with the columns urn, author, title, year, before, word,
        after.

    Returns
    -------
    str or pandas.DataFrame
        With ``html=False`` an empty DataFrame is returned when the response
        cannot be turned into the expected table.
    """
    import requests
    import pandas as pd
    # Imported by name because the parameter `html` shadows the module name.
    from html import escape

    # Copy so the 'word' key never ends up in the caller's dict or in the
    # shared default dict.
    query = dict(params)
    query['word'] = word
    r = requests.get('https://api.nb.no/ngram/{url}'.format(url=url), params=query)
    if html:
        # Every <td> is closed (the link cell used to be left open), and all
        # text coming from the API is escaped so characters such as '<', '&'
        # or quotes in titles and contexts cannot break the table markup.
        row_template = """<tr>
                <td>
                    <a href='{urn}' target='_'>{urnredux}</a>
                </td>
                <td>{b}</td>
                <td>{w}</td>
                <td style='text-align:left'>{a}</td>
            </tr>\n"""
        rows = "".join(
            row_template.format(
                urn=escape(str(x['urn'])),
                urnredux=escape(','.join([str(x['author']), str(x['title']), str(x['year'])])),
                b=escape(str(x['before'])),
                w=escape(str(x['word'])),
                a=escape(str(x['after']))
            )
            for x in r.json()
        )
        res = "<table>{rows}</table>".format(rows=rows)
    else:
        try:
            res = pd.DataFrame(r.json())
            res = res[['urn','author','title','year','before','word','after']]
        except Exception:
            # Best effort: an undecodable or unexpectedly shaped response gives
            # an empty frame. Unlike a bare `except:`, KeyboardInterrupt and
            # SystemExit still propagate.
            res = pd.DataFrame()
    return res
    
def plotrelative(ng1, ng2, period=(1850, 2005), window=2, figsize=(15,8), legend=False, title='Relative trend', ymin = 0):
    """Plot the ratio ``ng1 / ng2`` as a smoothed trend line.

    The ratio is restricted to the index labels between ``period[0]`` and
    ``period[1]`` (both converted to strings), smoothed with a triangular
    rolling mean of width *window*, and plotted. The lower y-limit is then
    set to *ymin* while the upper limit is left as chosen by the plot.
    Returns None.
    """
    import matplotlib.pyplot as plt

    ratio = ng1 / ng2
    first, last = str(period[0]), str(period[1])
    smoothed = ratio.loc[first:last].rolling(window, win_type='triang').mean()
    smoothed.plot(legend=legend, title=title, figsize=figsize)

    # Keep the automatically chosen upper limit, replace only the lower one.
    _, upper = plt.ylim()
    plt.ylim(ymin, upper)
    return


def get_urnkonk(word, params=dict(), html=True):
    """Fetch a concordance for *word* from ``https://api.nb.no/ngram/urnkonk``.

    Parameters
    ----------
    word : str
        The key word; sent under the key ``'word'``.
    params : dict, optional
        Further fields for the JSON request body. The dict is copied, never
        modified.
    html : bool, optional
        If true, return an HTML ``<table>`` string with one row per hit
        (link labelled "title, author, year", left context, key word, right
        context). Otherwise return a DataFrame with the columns urn, before,
        word, after.

    Returns
    -------
    str or pandas.DataFrame
    """
    import requests
    import pandas as pd
    # Imported by name because the parameter `html` shadows the module name.
    from html import escape

    # Copy so the 'word' key never ends up in the caller's dict or in the
    # shared default dict.
    payload = dict(params)
    payload['word'] = word
    r = requests.post('https://api.nb.no/ngram/urnkonk', json=payload)
    if html:
        row_template = """<tr>
                <td>
                    <a href='{urn}' target='_blank' style='text-decoration:none'>{urnredux}</a>
                </td>
                <td>{b}</td>
                <td>{w}</td>
                <td style='text-align:left'>{a}</td>
            </tr>\n"""
        # All text coming from the API is escaped so characters such as '<',
        # '&' or quotes cannot break the table markup.
        rows = "".join(
            row_template.format(
                urn=escape(str(x['urn'])),
                urnredux=escape("{t}, {f}, {y}".format(t=x['title'], f=x['author'], y=x['year'])),
                b=escape(str(x['before'])),
                w=escape(str(x['word'])),
                a=escape(str(x['after']))
            )
            for x in r.json()
        )
        res = """<table>{rows}</table>""".format(rows=rows)
    else:
        res = pd.DataFrame(r.json())
        res = res[['urn','before','word','after']]
    return res

def trend_plot(df,yf='1880',yt='2000', window=3, legend=False, title='Trend'):
    """Plot a rolling mean of *df* between the index labels *yf* and *yt*.

    The slice is smoothed with a rolling mean of width *window* and drawn
    as a 15x8 figure. Returns None.
    """
    selected = df.loc[yf:yt]
    smoothed = selected.rolling(window=window).mean()
    smoothed.plot(figsize=(15,8), legend=legend, title=title)
    return
    
def periods(p_start, p_end, step):
    """Return consecutive ``(start, end)`` year pairs, *step* years apart.

    The boundaries are ``range(p_start, p_end, step)``, so *p_end* itself is
    never a boundary: ``periods(1900, 1950, 10)`` ends with ``(1930, 1940)``.
    """
    boundaries = list(range(p_start, p_end, step))
    return list(zip(boundaries, boundaries[1:]))
    
def barplot(df):
    """Draw *df* as a 15x8 bar chart, with missing values shown as 0.

    The bar colours come from a seaborn HLS palette with ``len(df)`` entries.
    Returns None.
    """
    colours = sns.hls_palette(len(df), l=.4, s=.9)
    filled = df.fillna(0)
    filled.plot(kind='bar', figsize=(15,8), color=colours)
    return

def dewey_dist(word):
    """Sum the n-gram data for *word* separately for ``ddk`` values 0 to 9.

    Each lookup uses the 'bok' corpus. The result maps '0', '100', ..., '900'
    to the summed data for that ``ddk`` value.
    NOTE(review): ``ddk`` presumably selects a Dewey main class; confirm
    against the API.
    """
    return {
        str(ddk * 100): get_ngram(word, params={'corpus':'bok','ddk':ddk}).sum()
        for ddk in range(10)
    }

def js_delta(js1, js2):
    """Divide two JSON-style dicts element-wise and return a DataFrame.

    Both arguments become frames with their keys as the index; the division
    is aligned on those keys.
    """
    numerator = pd.DataFrame.from_dict(js1, orient='index')
    denominator = pd.DataFrame.from_dict(js2, orient='index')
    return numerator / denominator

def get_words_from_urnset(words, urnset):
    """Look up *words* in every URN of *urnset*.

    Returns a dict that maps each URN to a dict built from the result of
    ``get_freq(words, {'urn': urn})``.
    """
    return {urn: dict(get_freq(words, {'urn':urn})) for urn in urnset}


## Concordances

Investigate a word with the function *get_konk*. It returns a concordance of the key word along with a link to the metadata and the full text. If you live outside Norway, go to https://www.nb.no/samlingen/bokhylla-utlandet and apply for permission to read texts published up to the year 2000.

In [3]:
# Show the concordance for 'arbeid' as an HTML table: get_konk returns the table
# markup and HTML() renders it.
# NOTE(review): 'before'/'after' presumably set the context width and
# 'size'/'offset' the paging; confirm against the API.
HTML(
get_konk('arbeid',params={'corpus':'bok', 'title':'%','author': '%%', 'before':18,'after':18, 'size':2, 'offset':0})
)

0,1,2,3
"Nilsson, Lars Petter,XRF feltmålinger (TiO₂ og Fetot.) og magnetiske susceptibilitetsmålinger på mineraliseringer av jern-titan oksyder i den sydlige delen av Bjerkreim-Sokndal intrusjonen, Sokndal kommune, Rogaland,1996",. Ved intens bruk gikk imidlertid batteriet tomt etter ca. 7 timers,arbeid,", noe som hendte oss et par ganger ."
"Lingjærde, Christine,Introduksjon til fransk politikk og samfunnsliv,1994","var det fire millioner av dem , hvorav 1,75 millioner var i",arbeid,", omtrent én av 12 av den totale arbeidsstyrken."
"Lingjærde, Christine,Introduksjon til fransk politikk og samfunnsliv,1994",travail indépendant ( m ) : selvstendig,arbeid,
"Lingjærde, Christine,Introduksjon til fransk politikk og samfunnsliv,1994",emploi de proximité ( m ) :,arbeid,hos bekjente
"Lingjærde, Christine,Introduksjon til fransk politikk og samfunnsliv,1994",réinsertion ( f ) : tilbakeføring i,arbeid,
"Bjørkum, Åshild,Taeko og den gode nyheten,1977",mor og far holder på å stå opp . Far skal på,arbeid,i banken og må tidlig av garde for å nå det .
"Bjørkum, Åshild,Taeko og den gode nyheten,1977",Når far kommer fra,arbeid,", kjenner Taeko og Masao at de er sultne . De henger"
"Bjørkum, Åshild,Taeko og den gode nyheten,1977","sant , det er jo søndag , og far skal ikke på",arbeid,. Da husker hun plutselig at hun skal på søndagsskolen i dag
"Marthinsen, Edgar,Langtidsbrukere av sosialhjelp ved Byåsen sosialtjeneste i 1998,1999",Utsikter til,arbeid,", trygd eller fortsatt sosialhjelp ?"
"Marthinsen, Edgar,Langtidsbrukere av sosialhjelp ved Byåsen sosialtjeneste i 1998,1999",Løsningsfokusert,arbeid,47


## Trendplots

Any word can be plotted and compared; the function *get_ngram* extracts total frequencies from the texts. For a simpler interface, one may also go to the web service https://www.nb.no/sp_tjenester/beta/ngram_1/#ngram/query?terms=trend&lang=all&case_sens=0&freq=rel&corpus=bok


In [6]:
# Total frequencies for 'arbeid' in the 'bok' corpus. This is the word examined in the
# concordance section, and the one the variable name refers to.
arbeid_ng = get_ngram('arbeid', {'corpus':'bok'})



AttributeError: 'list' object has no attribute 'values'

## Relativization 

In order to plot relative trends, one needs to divide the total frequency by some other number that varies over the years, such as the frequency of all words. Here we use the relative frequency of one trend over another, and choose high-frequency items: the coordinating conjunction *og* and the full stop.

In [None]:
# Reference series for relativization: the conjunction 'og' and the full stop '.',
# both requested from the 'bok' corpus.
coord_ng = get_ngram('og',{'corpus':'bok'})
punct_ng = get_ngram('.', {'corpus': 'bok'})

In [None]:
# Plot the 'og' series divided by the '.' series, then save the current figure
# to turku.png at 300 dpi.
plotrelative(coord_ng,punct_ng)
plt.savefig('turku.png', dpi=300)

## Relative plot

The first trend line can be relativized using *coord_ng* and *punct_ng*

In [None]:
# Relativize the word trend against the sum of the 'og' and '.' series.
plotrelative(arbeid_ng, coord_ng + punct_ng)

## Examine identity

Identity is modelled in terms of what is expressed in letters sent by immigrants. For this purpose we try out an approach in which a particular selection of words (words believed to connect to identity dimensions such as family, work and health, among others) is counted in the letters and compared with the counts in other texts. 

### URNs for the texts

In order to specify a particular corpus the function *get_urns* is used. It finds books based on title, author or year, and suggests a couple of reduced URN-identifiers.

In [None]:
# The letters: books whose title matches '%amerika til%', queried with year 1980.
# NOTE(review): whether 'year' is an exact year or a start year is not visible
# here; confirm against the API.
a_urns = get_urns({'title':'%amerika til%', 'year':1980})
# Keep only the first field of each hit (presumably the reduced URN; verify).
amerika_urns = [x[0] for x in a_urns]
# Display the full result.
a_urns

In [None]:
# A wider set: books whose title matches '%amerika%', queried with year 1950 and 'next' 50.
# NOTE(review): 'next' presumably extends the query over the following 50 years;
# confirm against the API.
a_urns2 = get_urns({'title':'%amerika%','year':1950, 'next':50})
# Keep only the first field of each hit (presumably the reduced URN; verify).
amerika_urns2 = [x[0] for x in a_urns2]
# Display the full result.
a_urns2

Note that the set of URNs that are taken along are just the reduced ones

In [None]:
# Look up books whose title matches '%kaleval%'; the result is shown as the cell output.
get_urns({'title':'%kaleval%'})

### URNs can be inspected for frequency distribution

The command is *get_urnfreq* which simply takes a reduced URN and returns the top N words.

In [None]:
# Request the top 100 entries for one reduced URN; cutoff stays at its default of 10.
get_urnfreq(2008090300033, top=100)

In [None]:
# Convert the top-1500 result to a DataFrame for further processing: the keys of the
# dict become the index, and the rows are sorted by column 0 in descending order.

urnfreq = pd.DataFrame.from_dict(
    dict(
        get_urnfreq(2008090300033, top=1500)), 
                       orient='index').sort_values(by=0, ascending=False)


### Plot frequencies as barcharts

Commands may look cryptic - they are here just to show that the visuals are produced by a single line. In practice a more user friendly interface is preferred.

In [None]:
# Bar chart of the first 30 rows, i.e. the highest values, since urnfreq is sorted descending.
urnfreq.iloc[:30].plot(kind='bar', figsize=(15,10), legend=False, fontsize=18);

In [None]:
# Horizontal bar chart of rows 300-349, re-sorted in ascending order of column 0.
urnfreq.iloc[300:350].sort_values(by=0).plot(kind='barh',figsize=(18,18), fontsize=20)

## URN-sets for comparison

In [None]:
# Result sets for comparison: one query by author pattern 'Hamsun%knut%' with year 1955,
# and two queries restricted only by year (1980 and 1920).
h_urn = get_urns({'author':'Hamsun%knut%', 'year':1955})
u1980 = get_urns({'year':1980})
u1920 = get_urns({'year':1920})

In [None]:
# Keep the first field of each hit (the notebook treats these as the reduced URNs).
rand_urns = [x[0] for x in u1980[:60]]
# The Hamsun sample must come from the Hamsun query (h_urn). It used to be sliced
# from the unrelated 1920 result set, which left h_urn unused and put non-Hamsun
# books under the 'Hamsun' label.
hamsun_urns = [x[0] for x in h_urn[:5]]
old_urns = [x[0] for x in u1920[2:38]]

In [None]:
# Inspect the identifiers selected into old_urns.
old_urns

### Define a collection of marker words

These words may function as markers of identity, and serve as a comparison

In [None]:
# Marker words to look up in every URN set. Several words appear in more than one
# spelling or capitalization (e.g. 'reise'/'rejse', 'rikdom'/'rigdom'/'Rigdom'),
# presumably to catch older orthography; verify against the texts.
embed_words = ['arbeid', 
               'arbeidet', 'arbeide',
               'land','landet','hjem','hjemme','hjemlandet',
               'rejse', 'reise','sjøen','hav','havet',
               'jorda','åkeren', 
               'gårdbruker','Gaardbruger',
               'slite', 'streve', 
               'familie','familien','kone','bror','broren','søster',
               'venn', 'venner','kamerat','kameraten','venninne','venninnen',
               'amerikanerne','amerikanere','landsmenn', 'landsmand','landsmann','engelsk', 'engelskmann',
               'indianer','indianere',
              'fattige','rike','fattigdom','rikdom','rigdom', 'Rige','rige','Fattige', 'Rigdom']
# High-frequency reference items, fetched together with the marker words.
hi_words = ['og','.','i',]
# Full list sent to the API: marker words followed by the reference items.
words = embed_words + hi_words

### Get data for the words 

Each word is extracted from each urnset. The function *get_words_from_urnset* returns, for every URN in a set, the frequencies of the given words.

In [None]:
# Word frequencies per URN for the first letter set (amerika_urns).
amerika_brev_urn = get_words_from_urnset(words, amerika_urns)

In [None]:
# Word frequencies per URN for the wider 'amerika' set (amerika_urns2).
amerika_urn2 = get_words_from_urnset(words, amerika_urns2)

In [None]:
# Word frequencies per URN for the set stored in hamsun_urns.
hamsun_urn = get_words_from_urnset(words, hamsun_urns)

In [None]:
# Word frequencies per URN for the 1980 sample (rand_urns).
rand_urn = get_words_from_urnset(words, rand_urns)

In [None]:
# Word frequencies per URN for the 1920 sample (old_urns).
old_urn = get_words_from_urnset(words, old_urns)

## The letters

Here we look at the data from the letters using a heatmap. Each URN is represented by a column, and the individual words from the marker set are coloured according to their frequency within that column.

In [None]:
# One column per URN of the wider 'amerika' set. Only the marker words are kept as rows,
# so the high-frequency reference items are left out.
heatmap(
pd.DataFrame(amerika_urn2).loc[embed_words]
)

## The data can be summarized over rows:

In [None]:
# Sum every marker word over all URNs (axis=1) and show the totals as a one-column heatmap.
heatmap(pd.DataFrame(pd.DataFrame(amerika_urn2).loc[embed_words].sum(axis=1)))

### Repeat the process for the other urn_sets

In [None]:
# Same per-URN heatmap for another set; swap in any of the frequency dicts built above.
heatmap(
pd.DataFrame(
    hamsun_urn  # <=== fill in urnset here
).loc[embed_words]
)

### Compare the URN-sets with each other

In the comparison all the urnsets are aggregated to one column.

In [None]:
# Collapse each URN set to one column of marker-word totals and put the
# five columns side by side.
result = pd.DataFrame(
    pd.concat(
        [
            pd.DataFrame(urn_freqs).loc[embed_words].sum(axis = 1)
            for urn_freqs in (amerika_brev_urn, amerika_urn2, hamsun_urn, rand_urn, old_urn)
        ],
        axis=1
    )
)

# Short column labels, in the same order as the URN sets above

result.columns = 'A-Brev Amerika Hamsun Rand Old'.split()
heatmap(result)

## Comparing by way of division

In [None]:
# Ratio of marker-word totals: the letter set divided by the Hamsun set.
# numpy (np) comes from the import cell at the top of the notebook.
comparo = pd.DataFrame(
    pd.DataFrame(
        amerika_brev_urn).loc[embed_words].sum(axis=1)/pd.DataFrame(
        hamsun_urn).loc[embed_words].sum(axis=1))
# Division by a zero total gives +/-inf (or NaN for 0/0); drop those rows so they
# do not distort the summary statistics.
comparo=comparo.replace([np.inf, -np.inf], np.nan).dropna()
comparo.describe()