
# Data and computations for paper 

Lars G. Johnsen
National Library of Norway

In [1]:
import dhlab.nbtext as nb
import dhlab.module_update as mu
mu.update('collocations')
from collocations import dist_coll_urn
import pandas as pd

Updated file `D:\documents\GitHub\newspapers_coll_conc\collocations.py`

In [2]:
def df_jaccard(df, col1, col2, number=100, asc=False):
    s1 = df.sort_values(by=col1, ascending=asc)[:number].index
    s2 = df.sort_values(by=col2, ascending=asc)[:number].index
    
    #print(len(set(s1)&set(s2)))
    
    return jaccard(s1, s2)

In [3]:
def jaccard(s1, s2):
    return len(set(s1)&set(s2))/len(set(s1)|set(s2))

In [4]:
def make_coll_df(small, large, tot):
    """take two collocations, small and large, and a reference tot. The reference must av a column call tot as well"""
    coll = pd.DataFrame()
    coll['small'] = small['freq']
    coll['large'] = large['freq']
    #coll['ratio'] = coll['small']/coll['large']
    coll['srel'] = coll.small/coll.small.sum()
    coll['lrel'] = coll.large/coll.large.sum()
    coll['ratio'] = coll.srel/coll.lrel
    coll['nb'] = coll.small/tot.tot
    return coll

### Reference corpus

The reference is the total counts from approximately 450 000 books from nb.no

In [5]:
tot = nb.frame(nb.totals(50000), 'tot')

In [6]:
nb.normalize_corpus_dataframe(tot)
tot.head()

Unnamed: 0,tot
.,0.058921
",",0.051453
og,0.025297
i,0.021425
det,0.012728


# Corpus

The corpus for doing collocations is a sample of 800 books from fictional literature, dewey decimal code 813.

In [7]:
corpus = nb.book_corpus(ddk="813%", period=(1980, 2000), limit = 5000)

In [8]:
corpus

Unnamed: 0,urn,author,title,year
0,2010041503037,"Snelling, Lauraine",DL på flyttefot,1999
1,2009021004144,"Zach, Cheryl",Tause tårer,2000
2,2011021808094,"Boswell, Barbara",Forførende blikk,1996
3,2010040603082,"Thomas, Jerry D.",Slangestien -og andre spennende fortellinger,1999
4,2010070208014,"Clancy, Tom",Rainbow 6,1999
...,...,...,...,...
4995,2015072308195,"Dreyer, Eileen",Ut av mørket,1993
4996,2008073104060,"Hart, Alison","Ta tøylene, Sara!",1996
4997,2013010708129,"Seger, Maura",Nattens kvinne,1995
4998,2008090100042,"Turow, Scott",Blomsterbarnas arv,1999


Set up the distance parameters and collword, smd is small distance while lmd is large distance. These values are half of the actual window, and used to make a normalized score, call ascore. See below.

In [9]:
smd = 5
lmd = 10
collword = 'demokrati'

In [10]:
def large_corpus_coll(collword, urns = None, after= 5, before = 5, n = 300):
    colls = []
    for i in range(0, len(urns), n):
        colls.append(nb.urn_coll(collword, urns=urns[i:i + n], after= after, before = before))
    coll = pd.concat(colls, axis=1, sort=False).sum(axis=1)
    return pd.DataFrame(coll)

In [11]:
a1 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after= int(2*smd), before = 0), 'freq')
a2 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after= int(2*lmd), before = 0), 'freq')

In [12]:
b1 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after=0, before = int(2*smd)), 'freq')
b2 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after=0, before = int(2*lmd)), 'freq')

## Create collocation dataframe 

Based on data from a1 and a2, and b1 and b2. The name for the collocations after is `coll` while `collb` is for the collocates coming before.

In [13]:
coll = make_coll_df(a1, a2, tot)
collb = make_coll_df(b1, b2, tot)

## Sorting 

Sorting on the reference is by column 'nb'

In [36]:
collb[collb.small > 1].sort_values(by='ratio', ascending=False)[:20][:20].fillna(0).style.background_gradient()

Unnamed: 0,small,large,srel,lrel,ratio,nb
),9,9,0.00179641,0.00104871,1.71297,1122.35
øst,3,3,0.000598802,0.000349569,1.71297,91237.3
fritt,3,3,0.000598802,0.000349569,1.71297,68138.6
ante,3,3,0.000598802,0.000349569,1.71297,168114.0
likhet,6,6,0.0011976,0.000699138,1.71297,197307.0
mitt,7,7,0.00139721,0.000815661,1.71297,39089.7
helst,3,3,0.000598802,0.000349569,1.71297,23893.0
generasjoner,3,3,0.000598802,0.000349569,1.71297,334283.0
stemmer,3,3,0.000598802,0.000349569,1.71297,66633.1
rene,3,3,0.000598802,0.000349569,1.71297,51874.3


In [15]:
coll[coll.small < coll.large].sort_values(by='ratio', ascending=False)[:20].fillna(0).style.background_gradient()

Unnamed: 0,small,large,srel,lrel,ratio,nb
Kina,12,13,0.00239521,0.0015062,1.59023,569233.0
hvor,9,10,0.00179641,0.00115861,1.55048,10538.4
frihet,9,10,0.00179641,0.00115861,1.55048,269729.0
landet,15,17,0.00299401,0.00196964,1.52008,85319.1
leve,6,7,0.0011976,0.00081103,1.47665,67087.3
',6,7,0.0011976,0.00081103,1.47665,2436.85
Denne,5,6,0.000998004,0.000695169,1.43563,17956.7
holde,5,6,0.000998004,0.000695169,1.43563,23385.6
praksis,5,6,0.000998004,0.000695169,1.43563,56701.5
heller,5,6,0.000998004,0.000695169,1.43563,17116.7


# collb

In [25]:
collb[collb.small < collb.large].sort_values(by='ratio', ascending=False)[:40].fillna(0).style.background_gradient()

Unnamed: 0,small,large,srel,lrel,ratio,nb
lever,22,23,0.00439122,0.00268003,1.6385,254043.0
vårt,16,17,0.00319361,0.00198089,1.61221,141160.0
verdens,11,12,0.00219561,0.00139828,1.57023,246423.0
sant,9,10,0.00179641,0.00116523,1.54168,97273.3
frihet,24,27,0.00479042,0.00314612,1.52264,719277.0
faktisk,8,9,0.00159681,0.00104871,1.52264,88205.9
land,8,9,0.00159681,0.00104871,1.52264,31375.4
Dette,8,9,0.00159681,0.00104871,1.52264,11978.9
virkelig,13,15,0.00259481,0.00174784,1.48458,84972.5
ønsket,6,7,0.0011976,0.000815661,1.46826,60894.0


In [17]:
collb.sort_values(by='nb', ascending=False)[:40].fillna(0).style.background_gradient()

Unnamed: 0,small,large,srel,lrel,ratio,nb
Reagan,5,5,0.000998004,0.000582615,1.71297,3348450.0
Ekte,3,3,0.000598802,0.000349569,1.71297,2237290.0
sosialisme,4,5,0.000798403,0.000582615,1.37038,1994910.0
liberalt,2,2,0.000399202,0.000233046,1.71297,1875720.0
føling,2,2,0.000399202,0.000233046,1.71297,1833690.0
styreform,2,2,0.000399202,0.000233046,1.71297,1827140.0
nazister,2,2,0.000399202,0.000233046,1.71297,1819560.0
opphøyd,2,2,0.000399202,0.000233046,1.71297,1728420.0
kommunismen,4,4,0.000798403,0.000466092,1.71297,1721630.0
russere,4,5,0.000798403,0.000582615,1.37038,1671760.0


# coll

In [18]:
coll.sort_values(by='ratio', ascending=False)[:10].fillna(0).style.background_gradient()

Unnamed: 0,small,large,srel,lrel,ratio,nb
rett,7,7,0.00139721,0.00081103,1.72275,19487.3
hardt,7,7,0.00139721,0.00081103,1.72275,105759.0
Hadde,1,1,0.000199601,0.000115861,1.72275,21164.6
MISTRY,1,1,0.000199601,0.000115861,1.72275,0.0
ROHINTON,1,1,0.000199601,0.000115861,1.72275,0.0
Russiawallaen,1,1,0.000199601,0.000115861,1.72275,0.0
avgjør,1,1,0.000199601,0.000115861,1.72275,103268.0
avlegs,1,1,0.000199601,0.000115861,1.72275,0.0
beseire,1,1,0.000199601,0.000115861,1.72275,665170.0
pris,2,2,0.000399202,0.000231723,1.72275,33898.8


In [19]:
coll.sort_values(by='nb', ascending=False)[['nb', 'mass_dist','combo']][:10].fillna(0).style.background_gradient()

KeyError: "['mass_dist', 'combo'] not in index"

# jaccard similarity

# for collb

In [None]:
jaccard_scoresb = nb.frame({'ratio': {x:df_jaccard(collb, 'nb', 'ratio', x) for x in range(5, 210, 5)},
                            'srel': {x:df_jaccard(collb, 'nb', 'srel', x) for x in range(5, 210, 5)},
                           'lrel': {x:df_jaccard(collb, 'nb', 'lrel', x) for x in range(5, 210, 5)}
                          }).transpose()

In [None]:
jaccard_scoresb.plot(title='Jaccard similarity of reference corpus');

# for coll

In [None]:
jaccard_scores = nb.frame({'ratio': {x:df_jaccard(coll, 'nb', 'ratio', x) for x in range(5, 210, 5)},
                            'srel': {x:df_jaccard(coll, 'nb', 'srel', x) for x in range(5, 210, 5)},
                           'lrel': {x:df_jaccard(coll, 'nb', 'lrel', x) for x in range(5, 210, 5)}
                          }).transpose()

In [None]:
jaccard_scores.plot();