
# Data and computations for paper 

Lars G. Johnsen
National Library of Norway

In [1]:
import dhlab.nbtext as nb
import dhlab.module_update as mu
mu.update('collocations')
from collocations import dist_coll_urn, urn_coll, calculate_midpoint, dist
import pandas as pd

Updated file `C:\Users\yoons\Documents\GitHub\newspapers_coll_conc\collocations.py`

In [2]:
def large_corpus_dist_coll(collword, urns = None, after= 10, before = 0, n = 300):
    """Collect frequency and distance collocations for `collword`, batch by batch.

    `urns` is walked in consecutive slices of at most `n` items. `urn_coll` is
    called once per slice, and the per-slice 'freq' and 'dist' results are
    combined into a single table.

    Parameters
    ----------
    collword : word handed to `urn_coll` as the collocation target.
    urns : sliceable sequence of URNs, or None. None and empty sequences
        produce an empty result instead of raising.
    after, before : window arguments forwarded to `urn_coll`; they are also
        passed to `calculate_midpoint` when computing 'dist_score'.
    n : maximum number of URNs per `urn_coll` call; must be at least 1.

    Returns
    -------
    pandas.DataFrame with the columns 'freq' (row-wise sum over batches),
    'dist' (row-wise mean over batches) and 'dist_score' (the result of
    `dist(...)` rounded to two decimals).

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive batch size")
    # Without URNs there is nothing to query; pd.concat would fail on an empty list.
    if urns is None or len(urns) == 0:
        return pd.DataFrame(columns=['freq', 'dist', 'dist_score'], dtype=float)

    freq_parts = []
    dist_parts = []
    for start in range(0, len(urns), n):
        batch = urn_coll(collword, urns = urns[start:start + n], after= after, before = before)
        freq_parts.append(nb.frame(batch['freq']))
        dist_parts.append(nb.frame(batch['dist']))

    coll = pd.DataFrame()
    coll['freq'] = pd.concat(freq_parts, axis=1, sort=False).sum(axis=1)
    # NOTE(review): the per-batch distances are averaged with equal weight per
    # batch (missing values skipped), not weighted by batch frequency - confirm
    # that this is the intended aggregate.
    coll['dist'] = pd.concat(dist_parts, axis = 1, sort=False).mean(axis=1)
    coll['dist_score'] = round(dist(coll['dist'], calculate_midpoint(before, after), coll['freq']), 2)
    return coll

In [3]:
def large_corpus_coll(collword, urns = None, after= 5, before = 5, n = 300):
    """Sum collocation counts for `collword` over a URN list processed in batches.

    `urns` is walked in consecutive slices of at most `n` items, `nb.urn_coll`
    is called once per slice, and the per-slice results are added row-wise.

    Parameters
    ----------
    collword : word handed to `nb.urn_coll` as the collocation target.
    urns : sliceable sequence of URNs, or None. None and empty sequences
        produce an empty result instead of raising.
    after, before : window arguments forwarded to `nb.urn_coll`.
    n : maximum number of URNs per `nb.urn_coll` call; must be at least 1.

    Returns
    -------
    pandas.DataFrame with a single column (labelled 0) holding the summed counts.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive batch size")
    # Without URNs there is nothing to query; pd.concat would fail on an empty list.
    if urns is None or len(urns) == 0:
        return pd.DataFrame(pd.Series(dtype=float))

    batches = [
        nb.urn_coll(collword, urns=urns[start:start + n], after= after, before = before)
        for start in range(0, len(urns), n)
    ]
    summed = pd.concat(batches, axis=1, sort=False).sum(axis=1)
    return pd.DataFrame(summed)

In [4]:
def make_coll_df(small, large, tot):
    """Combine a small-window and a large-window collocation with a reference.

    `small` and `large` must each have a 'freq' column; the reference `tot`
    must have a column called 'tot'.

    The result is indexed like `small['freq']` and has the columns:
    'small' and 'large' (the two frequencies), 'srel' and 'lrel' (each
    frequency divided by its column sum), 'ratio' (srel / lrel), 'adjusted'
    (small ** 0.01 * ratio) and 'nb' (small / tot.tot).
    """
    table = pd.DataFrame()
    table['small'] = small['freq']
    # Assignment aligns `large` to the rows of `small`; the column sum used for
    # 'lrel' is therefore taken over those rows only.
    table['large'] = large['freq']
    table = table.assign(
        srel=lambda t: t['small'] / t['small'].sum(),
        lrel=lambda t: t['large'] / t['large'].sum(),
        ratio=lambda t: t['srel'] / t['lrel'],
        adjusted=lambda t: t['small'] ** 0.01 * t['ratio'],
        nb=lambda t: t['small'] / tot.tot,
    )
    return table

### Reference corpus

The reference is the total counts from approximately 450 000 books from nb.no

# Corpus

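The corpus used for the collocations is described as a sample of about 800 books of fiction, Dewey Decimal code 813. In the cells below it is built from up to `corpus_size` books that contain the collocation word within the chosen period, keeping only the books marked `nob` in the metadata.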

In [5]:
# Distance parameters: each value is half of the actual window size.
smd = 5                  # small distance
lmd = 10                 # large distance
collword = 'skrive'      # word whose collocations are computed
corpus_size = 1000       # passed as `limit` when the corpus is fetched
period = (1980, 2000)    # passed as `period` when the corpus is fetched

In [6]:
def filenames(smd = 5, lmd = 10, collword = 'motor', corpus_size= 1000, period = (1980, 2000)):
    """Return the CSV file names for the four collocation tables.

    Each name is the '_'-joined sequence
    collword, distance, corpus_size, period (joined with '-'), before, after, '.csv'
    and the result maps 'small_right', 'large_right', 'small_left' and
    'large_left' to their names. Window sizes are twice `smd` or `lmd`.

    NOTE(review): the distance field is `smd` for both *_right names and `lmd`
    for both *_left names, and joining '.csv' with '_' leaves an underscore
    before the extension. Both are kept because existing files use these
    names - confirm before changing.
    """
    span = '-'.join(str(part) for part in period)

    def build(distance, before, after):
        # All pieces are stringified first, then glued with underscores.
        pieces = [collword, distance, corpus_size, span, before, after, '.csv']
        return '_'.join(str(piece) for piece in pieces)

    small_window = int(2*smd)
    large_window = int(2*lmd)
    return {
        'small_right': build(smd, 0, small_window),
        'large_right': build(smd, 0, large_window),
        'small_left': build(lmd, small_window, 0),
        'large_left': build(lmd, large_window, 0),
    }

In [7]:
# Resolve the four output file names for the current configuration and show them.
fnames = filenames(
    smd=smd,
    lmd=lmd,
    collword=collword,
    corpus_size=corpus_size,
    period=period,
)
fnames

{'small_right': 'skrive_5_1000_1980-2000_0_10_.csv',
 'large_right': 'skrive_5_1000_1980-2000_0_20_.csv',
 'small_left': 'skrive_10_1000_1980-2000_10_0_.csv',
 'large_left': 'skrive_10_1000_1980-2000_20_0_.csv'}

In [8]:
# Fetch at most `corpus_size` books from `period` for the collocation word.
corpus = nb.book_corpus(
    words=[collword],
    period=period,
    limit=corpus_size,
)

In [9]:
# Keep only the metadata records whose field 8 equals 'nob' and name the first
# column 'urn'. NOTE(review): field 8 looks like the language code - confirm.
# This cell rebinds `corpus`, so it is meant to run once, right after the fetch.
corpus = nb.frame(
    [record for record in nb.metadata(corpus) if record[8] == 'nob'],
    ['urn'],
)

In [10]:
# Number of books left after filtering, followed by a preview of the metadata.
print(corpus.shape[0])
corpus.head()

807


Unnamed: 0,urn,1,2,3,4,5,6,7,8,9
0,2014102008058,"Oldeide, Liv Mathilde",1995,Perlestein og tåresmil,,O.E. Eilertsen forl.,fiction,5915c7586cde2385c048c2308790aa98,nob,
1,2007080804022,"Rem, Håvard",1991,Galgeland,dikt,Cappelen,fiction,c93a65f7c701efb126b78847ff7fc82a,nob,
2,2014072208029,"Ytterland, Vegard",1999,Skolen - en sentral arena for barns bevegelses...,,Høgskolen i Oslo,unknown,b2bb134e9e2bd74e85b25aec435c1750,nob,
3,2008011601089,"Verne, Jules",1993,Tsarens kurer,,Gyldendal,fiction,852d01ff7bcd443335877a6cecdea5a6,nob,fre
4,2008060904086,,1990,Forbrukerrådets håndbok i personlig økonomi,,Forbrukerrådet,notfiction,6662485bac88c25a178e7fbe78e5e125,nob,


Set up the distance parameters and `collword`: `smd` is the small distance and `lmd` is the large distance. These values are half of the actual window size and are used to compute a normalized score, called `ascore`. See below.

In [11]:
# Four collocation runs over the whole corpus; the printed numbers mark progress.
# a1/a2 look only after the word (small and large window), b1/b2 only before it.
# NOTE(review): nb.frame is given the single name 'freq' - check whether the
# remaining columns keep their names in the result.
a1 = nb.frame(
    large_corpus_dist_coll(collword, urns=list(corpus.urn), after=int(2*smd), before=0),
    'freq',
)
print(1)
a2 = nb.frame(
    large_corpus_dist_coll(collword, urns=list(corpus.urn), after=int(2*lmd), before=0),
    'freq',
)
print(2)
b1 = nb.frame(
    large_corpus_dist_coll(collword, urns=list(corpus.urn), after=0, before=int(2*smd)),
    'freq',
)
print(3)
b2 = nb.frame(
    large_corpus_dist_coll(collword, urns=list(corpus.urn), after=0, before=int(2*lmd)),
    'freq',
)




1
2
3


In [12]:
# Write each collocation table to the CSV file named for it in `fnames`.
for table, key in (
    (a1, 'small_right'),
    (a2, 'large_right'),
    (b1, 'small_left'),
    (b2, 'large_left'),
):
    table.to_csv(fnames[key])