# Code for collocations in newspapers

Startup code: run these cells first so that the `dhlab_v2` module, the `util` helpers and the notebook styling are loaded.

In [1]:
# `mu` is used twice in this notebook: mu.update(...) here and mu.css() in a later cell.
import dhlab.module_update as mu
# NOTE(review): presumably fetches or refreshes the dhlab_v2 module so that the
# import on the next line picks up the current version — confirm; keep this order.
mu.update('dhlab_v2', silent = True)
import dhlab_v2 as d2

import pandas as pd

from random import sample

# Project-local helpers; collocation and coll_dist are the ones called in the cells below.
from util import sampling, sort, check_words, collocation, coll_dist

In [2]:
import util

In [3]:
# NOTE(review): presumably applies dhlab's CSS styling to the notebook — confirm against dhlab.module_update.
mu.css()

# Setting things up
Set up the data for the analysis: the reference totals, which are passed to the collocation functions imported from `util` above.

In [4]:
# Reference totals; passed as `totals=` to the collocation helpers further down.
# NOTE(review): 200_000 is presumably the number of most frequent words to fetch — confirm.
tot = d2.totals(200_000)

With collocations, the association between words is computed using PMI (pointwise mutual information). With probabilities estimated as proportions of frequency, it takes the form: $pmi(x,y) = \frac{p(x,y)}{p(x)\,p(y)} = \frac{p(x|y)}{p(x)} = \frac{p(y|x)}{p(y)}$. (The textbook definition takes the logarithm of this ratio; the values reported in this notebook appear to be the raw ratio.) It is a probabilistic version of relevance: $y$ is relevant for $x$ and vice versa. PMI is used instead of $\text{tf-idf}$ for computing associations between words.

The PMI values are computed on normalized frequencies, which means that the actual number can be interpreted as a disproportion: how many times more often a word occurs near the target word than its overall frequency would predict.

## Build a corpus

The target corpus consists of newspapers published in Norway between 2000 and 2022 (the `from_year` and `to_year` arguments below), capped at 300,000 documents.

In [20]:
# Request newspaper ('digavis') documents for the given year range, at most `limit` of them.
corpus = d2.document_corpus(
    doctype="digavis",
    from_year=2000,
    to_year=2022,
    limit=300000,
)

Check how many documents there are in the corpus

In [21]:
# Number of documents in the corpus. If this equals the `limit` passed to
# document_corpus, the query was capped and more matching documents may exist.
len(corpus)

300000

In [7]:
# Pick up edits to util.py without restarting the kernel.
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement.
import importlib

importlib.reload(util)
# reload() does not rebind names that were imported with `from util import ...`,
# so re-import them; otherwise the cells below keep calling the old functions.
from util import sampling, sort, check_words, collocation, coll_dist

<module 'util' from 'C:\\Users\\yoons\\Documents\\Github\\avisconc\\util.py'>

# Collocations

Try out different words to get a feel for the results.

In [24]:
# Draw the working sample with a fixed seed so that the sample, and every
# collocation table computed from it, is identical on each Restart & Run All.
# NOTE(review): assumes `corpus` is a pandas DataFrame (DataFrame.sample accepts
# random_state) — confirm. Previously recorded outputs came from an unseeded draw.
samplecorp = corpus.sample(30000, random_state=42)

In [25]:
# One collocation table per target word, keyed by the word itself.
coll = {
    word: coll_dist(word, corpus=samplecorp, totals=tot, window=20)
    for word in "naturlig overnaturlig drømmefanger overtro".split()
}

In [54]:
# Add one more target word, computed with the same sample, totals and window
# as the entries already stored in `coll`.
coll.update(
    unaturlig=coll_dist(
        'unaturlig',
        corpus=samplecorp,
        totals=tot,
        window=20,
    )
)

In [60]:
# Rows for 'unaturlig' ordered by the ('before', 'bdist') column, smallest first; show the top 20.
(
    coll['unaturlig']
    .sort_values(by=('before', 'bdist'), ascending=True)
    .head(20)
)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
føles,37.0,3.179487,162235.0,91.044544,5.0,13.0,162235.0,12.303507
Ikke,75.0,3.311688,2846584.0,10.518021,15.0,15.588235,2846584.0,2.103637
helt,160.0,3.91358,7896580.0,8.088681,46.0,12.375,7896580.0,2.325532
derfor,55.0,4.561404,7726932.0,2.841531,16.0,12.388889,7726932.0,0.82664
ikke,1235.0,4.692805,99731314.0,4.943473,353.0,10.980282,99731314.0,1.413015
vel,62.0,4.8125,7301331.0,3.389896,10.0,14.583333,7301331.0,0.546766
virker,32.0,5.0,1409824.0,9.061119,4.0,14.166667,1409824.0,1.132657
kanskje,61.0,5.412698,4985306.0,4.884665,28.0,11.366667,4985306.0,2.242176
ville,118.0,5.425,12981990.0,3.628587,42.0,12.522727,12981990.0,1.291551
skummelt,9.0,5.454545,34056.0,105.49834,,,34056.0,


In [52]:
# Keep rows whose ('after', 'bdist') value is below 5, then take the 30 rows
# with the highest ('after', 'counts').
naturlig = (
    coll['naturlig']
    .loc[lambda table: table[('after', 'bdist')] < 5]
    .sort_values(by=('after', 'counts'), ascending=False)
    .head(30)
)

In [58]:
# Keep rows whose ('after', 'bdist') value is below 5, then take the 30 rows
# with the highest ('after', 'counts').
unaturlig = (
    coll['unaturlig']
    .loc[lambda table: table[('after', 'bdist')] < 5]
    .sort_values(by=('after', 'counts'), ascending=False)
    .head(30)
)

In [53]:
# List the selected words (the index of `naturlig`) on one comma-separated line.
print(*naturlig.index, sep=', ')

nok, del, valg, helaften, hører, utvikling, avgang, konsekvens, forklaring, faller, spørre, lys, sammenligne, død, søke, vis, favoritt, skuffet, midtpunkt, autoritet, samlingspunkt, reaksjon, nedslagsfelt, samarbeidspartner, invitere, førstevalg, forlengelse, stoppested, årsak, oppfølging


In [59]:
# List the selected words (the index of `unaturlig`) on one comma-separated line.
print(*unaturlig.index, sep=', ')

høyt, høye, høy, dødsfall, tenke, ettersom


In [39]:
# Rows for 'overtro' ordered by the ('before', 'bdist') column, smallest first; show the top 20.
(
    coll['overtro']
    .sort_values(by=('before', 'bdist'), ascending=True)
    .head(20)
)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
gammel,80.0,1.987805,1987643.0,40.360246,2.0,15.5,1987643.0,1.008879
Tro,147.0,2.597315,245233.0,601.091548,1.0,15.333333,245233.0,4.088542
17.30,63.0,3.753846,7789.0,8110.75053,1.0,20.0,7789.0,128.725817
10.40,23.0,4.2,3434.0,6716.294687,,,3434.0,
mye,45.0,4.297872,6802869.0,6.633193,17.0,12.526316,6802869.0,2.505556
Dagens,11.0,4.384615,222048.0,49.67616,1.0,18.333333,222048.0,4.515444
ettersom,11.0,4.538462,466259.0,23.657435,,,466259.0,
nytte,12.0,4.571429,923670.0,13.027666,2.0,16.5,923670.0,2.171003
Ifølge,14.0,4.9375,556900.0,25.208849,4.0,12.333333,556900.0,7.201619
dårligere,10.0,5.0,331550.0,30.244971,,,331550.0,


In [41]:
# Rows for 'drømmefanger' ordered by the ('before', 'bdist') column, smallest first; show the top 20.
(
    coll['drømmefanger']
    .sort_values(by=('before', 'bdist'), ascending=True)
    .head(20)
)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
egen,6.0,5.75,5057558.0,53.902448,1.0,18.333333,5057558.0,8.983743
«,6.0,6.875,69483522.0,3.923445,,,69483522.0,
din,5.0,7.142857,3767328.0,60.302412,,,3767328.0,
en,13.0,8.733333,176861805.0,3.3397,3.0,16.2,176861805.0,0.7707
",",14.0,10.375,822864040.0,0.773033,20.0,12.227273,822864040.0,1.104333
Lag,7.0,10.444444,494655.0,642.974499,,,494655.0,
på,11.0,10.461538,170778213.0,2.926566,5.0,10.428571,170778213.0,1.330257
årets,3.0,10.6,220329.0,618.653826,,,220329.0,
om,5.0,10.714286,84917324.0,2.675296,4.0,12.833333,84917324.0,2.140237
vev,3.0,11.0,140949.0,967.06879,,,140949.0,


In [28]:
# Rows for 'naturlig' ordered by the ('after', 'bdist') column, smallest first; show the top 20.
(
    coll['naturlig']
    .sort_values(by=('after', 'bdist'), ascending=True)
    .head(20)
)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
avgang,24.0,12.307692,111322.0,3.50853,501.0,1.512922,111322.0,73.24757
førstevalg,4.0,13.166667,6926.0,9.398802,102.0,1.634615,6926.0,239.692354
nedslagsfelt,4.0,11.0,25426.0,2.560218,110.0,1.830357,25426.0,70.412723
nok,706.0,11.060734,8487341.0,1.353718,9134.0,1.876861,8487341.0,17.515634
konsekvens,24.0,11.769231,268622.0,1.454001,402.0,1.940594,268622.0,24.356842
hundefor,,,,,38.0,1.95,2388.0,258.991654
midtpunkt,4.0,13.5,59160.0,1.10034,142.0,2.0,59160.0,39.065796
fortsettelse,9.0,13.454545,75126.0,1.949608,59.0,2.016393,75126.0,12.781983
forklaring,33.0,12.457143,611743.0,0.87789,322.0,2.092593,611743.0,8.566893
forekommende,1.0,15.666667,86603.0,0.187915,40.0,2.119048,86603.0,7.517329


In [28]:
# NOTE(review): verbatim repeat of the preceding cell (same code, same In [28]
# counter, same output) — candidate for removal.
coll['naturlig'].sort_values(by=('after', 'bdist'), ascending = True).head(20)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
avgang,24.0,12.307692,111322.0,3.50853,501.0,1.512922,111322.0,73.24757
førstevalg,4.0,13.166667,6926.0,9.398802,102.0,1.634615,6926.0,239.692354
nedslagsfelt,4.0,11.0,25426.0,2.560218,110.0,1.830357,25426.0,70.412723
nok,706.0,11.060734,8487341.0,1.353718,9134.0,1.876861,8487341.0,17.515634
konsekvens,24.0,11.769231,268622.0,1.454001,402.0,1.940594,268622.0,24.356842
hundefor,,,,,38.0,1.95,2388.0,258.991654
midtpunkt,4.0,13.5,59160.0,1.10034,142.0,2.0,59160.0,39.065796
fortsettelse,9.0,13.454545,75126.0,1.949608,59.0,2.016393,75126.0,12.781983
forklaring,33.0,12.457143,611743.0,0.87789,322.0,2.092593,611743.0,8.566893
forekommende,1.0,15.666667,86603.0,0.187915,40.0,2.119048,86603.0,7.517329


In [28]:
# NOTE(review): verbatim repeat of the two preceding cells (same code, same
# In [28] counter, same output) — candidate for removal.
coll['naturlig'].sort_values(by=('after', 'bdist'), ascending = True).head(20)

place,before,before,before,before,after,after,after,after
kind,counts,bdist,freq,pmi,counts,bdist,freq,pmi
avgang,24.0,12.307692,111322.0,3.50853,501.0,1.512922,111322.0,73.24757
førstevalg,4.0,13.166667,6926.0,9.398802,102.0,1.634615,6926.0,239.692354
nedslagsfelt,4.0,11.0,25426.0,2.560218,110.0,1.830357,25426.0,70.412723
nok,706.0,11.060734,8487341.0,1.353718,9134.0,1.876861,8487341.0,17.515634
konsekvens,24.0,11.769231,268622.0,1.454001,402.0,1.940594,268622.0,24.356842
hundefor,,,,,38.0,1.95,2388.0,258.991654
midtpunkt,4.0,13.5,59160.0,1.10034,142.0,2.0,59160.0,39.065796
fortsettelse,9.0,13.454545,75126.0,1.949608,59.0,2.016393,75126.0,12.781983
forklaring,33.0,12.457143,611743.0,0.87789,322.0,2.092593,611743.0,8.566893
forekommende,1.0,15.666667,86603.0,0.187915,40.0,2.119048,86603.0,7.517329


In [16]:
# Collocations for "naturlig" on a fresh 10,000-document sample. The seed is
# fixed so the table below is reproducible from run to run.
# NOTE(review): assumes `corpus` is a pandas DataFrame (DataFrame.sample accepts
# random_state) — confirm.
# NOTE(review): this rebinds `coll`, which the earlier cells use as a dict of
# tables keyed by word; re-running those cells after this one will not work.
coll = collocation("naturlig", corpus=corpus.sample(10000, random_state=42), totals=tot)

In [16]:
# NOTE(review): repeats the collocation call of the preceding cell (same In [16]
# counter) — candidate for removal. The seed is fixed so that re-running it
# yields the same table instead of a new random sample each time.
# NOTE(review): assumes `corpus` is a pandas DataFrame (DataFrame.sample accepts
# random_state) — confirm.
# NOTE(review): this rebinds `coll`, which the earlier cells use as a dict of
# tables keyed by word; re-running those cells after this one will not work.
coll = collocation("naturlig", corpus=corpus.sample(10000, random_state=42), totals=tot)

In [18]:
# Rows with pmi above 20, most frequent first; show the top 50.
(
    coll
    .loc[lambda table: table.pmi > 20]
    .sort_values(by='counts', ascending=False)
    .head(50)
)

Unnamed: 0,counts,freq,pmi
–,1612.0,32303.0,1207.864692
NRK,230.0,233171.0,23.875331
nyheter,169.0,143270.0,28.551409
avgang,166.0,111322.0,36.093018
Ap,143.0,145338.0,23.81513
¦,102.0,110036.0,22.436829
Daglig,97.0,76822.0,30.562032
FOTO,96.0,34193.0,67.956364
spillere,96.0,89135.0,26.068682
Frp,94.0,17135.0,132.782197
