# Code for collocations in newspapers

Some startup code, make sure to activate

In [112]:
import dhlab.module_update as mu
mu.update('dhlab_v2', silent = True)
import dhlab_v2 as d2

import pandas as pd

In [113]:
mu.css("css.css")

In [114]:
from random import sample

In [115]:
def sampling(a, b):
    res = a
    if b < len(a):
        res = sample(a, b)
    return res

In [116]:
def sort(df, col = None, up = True):
    if col is None:
        col = df.columns[0]
    return df.sort_values(by = col, ascending = (up == False))
    

In [117]:
def check_words(df, word_parts):
    """ do a df.loc[] on matches for word_parts - a kindof regex search through matches"""
    return pd.concat([df.loc[i] for i in [df[a].index for a in [df.index.str.contains(w) for w in word_parts]]])

# Setting things up
Set up the data for analysis, totals and a collocation function

In [118]:
tot = d2.totals(200000)

With collocations, the information association is computed using PMI (pointwise mutual information), with probabilites as proportions of frequency, it takes the form: $pmi(x,y) = \frac{p(x|y)}{p(x)} = \frac{p(y|x)}{p(y)}$. It is a probabilistic version of relevance. That $y$ is relevant for $x$ and vice versa. PMI is used instead of $\text{tf-idf}$ for computing associations between words.

The PMI-values are computed on normalized frequencies, which means that the actual number can be interpreted as a disproportion number.

In [18]:
def collocation(word, before = 20, after = 20, corpus = None, samples = 500000, totals = tot ):
    """ Collocations from counts - no distance here"""
    coll = d2.urn_collocation(urns = sampling(list(corpus.urn), samples), word = word, before = before, after = after)
    combo = pd.concat([coll, totals], axis = 1)
    combo.freq = combo.freq.fillna(min(combo.freq))
    combo.counts = combo.counts.fillna(min(combo.counts))
    combo['pmi'] = (combo.counts/combo.counts.sum())**1.02/(combo.freq/combo.freq.sum())
    return combo

## Build a corpus

The target corpus consists of newspapers published in Norway between 2020 and 2021

In [9]:
corpus = d2.document_corpus(doctype = 'digavis', from_year = 2020, to_year = 2021, limit = 100000)

Check how many documents there are in the corpus

In [10]:
len(corpus)

43490

Substitute the red string below with any substring describing a newspaper, to see how many are included

In [11]:
len(corpus[corpus.urn.str.contains('vestnytt')])

138

In [111]:
d2.concordance(urns = list(corpus.urn), words = """ "av korona"  """).apply(make_link).sample(5).style

NameError: name 'make_link' is not defined

In [103]:
concs = d2.concordance(urns = list(corpus.urn), words = """ sykkelritt korona  """)

In [104]:
concs.sample(min(5, len(concs))).style

Unnamed: 0,docid,urn,conc
25,203313043,URN:NBN:no-nb_digavis_gjengangeren_null_null_20210421_0_0_1,"... La Flèche Wallonne , kvinner Sykkelritt . Direkte . 12.35 Proffjentene ( r ) 12.55 UEFA – Ett år med korona ( r ) 14.00 Sykkel : Endagsritt : La Flèche Wallonne..."
14,202879367,URN:NBN:no-nb_digavis_vestnytt_null_null_20201103_33_83_1,... Bilen kjem mellom anna til å bli brukt under leiteaksjonar og arrangement slik som sykkelritt . No i korona-tida har den gamle Røde Kors-bilen...
23,203322943,URN:NBN:no-nb_digavis_helgelendingen_null_null_20210421_91_59_1,"... La Flèche Wallonne , kvinner Sykkelritt . Direkte . 12.35 Proffjentene ( r ) 12.55 UEFA – Ett år med korona ( r ) 14.00 Sykkel : Endagsritt : La Flèche Wallonne..."
3,203368924,URN:NBN:no-nb_digavis_ostlandetsblad_null_null_20210421_114_75_1,"... La Flèche Wallonne , kvinner Sykkelritt . Direkte . 12.35 Proffjentene ( r ) 12.55 UEFA – Ett år med korona ( r ) 14.00 Sykkel : Endagsritt : La Flèche Wallonne..."
4,202498786,URN:NBN:no-nb_digavis_lindesnes_null_null_20200811_132_94_1,... Tour of Norway for kids har holdt på siden 2005 og de arrangerer sykkelritt for barn i hele Norge . – Vi er innom femti steder i...


# Collocations

Try out different words to get a feel of things

In [12]:
def coll_dist(word, window = 10, corpus = None):
    before = collocation(word = word,before = window, after = 0, corpus = corpus)
    after = collocation(word = word,before = 0, after = window, corpus = corpus)
    before.columns = pd.MultiIndex.from_tuples([('before', x) for x in before.columns], names = ["place", "kind"])
    after.columns = pd.MultiIndex.from_tuples([('after', x) for x in after.columns], names = ["place", "kind"])
    result = pd.concat([before, after], axis = 1)
    return result

In [19]:
dcoll = coll_dist('korona', corpus = corpus, window = 20)

In [105]:
from math import log, sqrt, exp

In [63]:
dcoll[('before','score')] = (dcoll[('before','counts')].apply(log))*(dcoll[('before','bdist')]**-1)

In [64]:
dcoll[('after','score')] = (dcoll[('after','counts')].apply(log))*(dcoll[('after','bdist')]**-1)

In [110]:
dcoll[dcoll[('before', 'bdist') ] < 5].sort_values(by = ('before', 'counts'), ascending = True).head(50).style.background_gradient()

place,before,before,before,before,before,after,after,after,after,after,score,before,after
kind,counts,dist,bdist,freq,pmi,counts,dist,bdist,freq,pmi,Unnamed: 11_level_1,score,score
upåvirket,12.0,26.0,4.714286,28627.0,2.81841,2.0,23.0,15.75,28627.0,0.449276,0.527101,0.527101,0.044009
ložžejit,12.0,12.0,3.714286,2388.0,33.786688,,,,,,0.669013,0.669013,
bridgespilling,13.0,26.0,4.4,2388.0,36.660888,,,,,,0.582943,0.582943,
utbredelsen,13.0,28.0,4.533333,49976.0,1.751765,1.0,2.0,14.0,49976.0,0.126904,0.565798,0.565798,0.0
kurere,13.0,28.0,4.533333,19406.0,4.511295,3.0,42.0,16.4,19406.0,1.002227,0.565798,0.565798,0.066989
betviler,13.0,26.0,4.4,6585.0,13.294791,2.0,35.0,18.75,6585.0,1.95314,0.582943,0.582943,0.036968
skumlere,13.0,26.0,4.4,2388.0,36.660888,1.0,18.0,19.333333,,,0.582943,0.582943,0.0
hater,13.0,13.0,3.533333,99631.0,0.878704,11.0,110.0,11.538462,99631.0,0.734623,0.725929,0.725929,0.207818
kunnskapom,14.0,38.0,4.875,2388.0,39.539516,3.0,21.0,12.2,,,0.541345,0.541345,0.09005
Informasjonstelefon,15.0,43.0,4.882353,2388.0,42.422264,1.0,11.0,17.0,,,0.554661,0.554661,0.0


In [62]:
dcoll.loc['hater']

place   kind  
before  counts       13.000000
        dist         13.000000
        bdist         3.533333
        freq      99631.000000
        pmi           0.878704
after   counts       11.000000
        dist        110.000000
        bdist        11.538462
        freq      99631.000000
        pmi           0.734623
score                 0.725929
before  score        -0.036596
after   score        -0.026728
Name: hater, dtype: float64

In [61]:
dcoll[[("before", 'pmi'), ('before', 'counts'),('after','counts'),('after','pmi')]].sort_values(by = ('after', 'pmi'), ascending = False).head(20)

place,before,before,after,after
kind,pmi,counts,counts,pmi
–,4502.544806,18675.0,19116.0,4571.09868
karantene,953.856945,1398.0,1961.0,1335.41574
2021,1362.19109,1457.0,818.0,749.44726
NYHETER,1250.587594,515.0,313.0,746.018128
ahte,687.093169,511.0,418.0,554.944478
smittevern,285.781044,301.0,569.0,542.417283
viruset,171.572684,385.0,1076.0,485.23278
leat,456.55796,570.0,582.0,462.326114
munnbind,319.592702,244.0,340.0,444.417488
FHI,424.117757,223.0,226.0,426.215773
