# Collocations and concordances for newspapers

In [1]:
import dhlab.nbtext as nb
import dhlab.module_update as mu
mu.update('collocations')
from collocations import dist_coll_urn
import pandas as pd

Updated file `D:\Documents\GitHub\newspapers_coll_conc\collocations.py`

In [2]:
tot = nb.frame(nb.totals(50000), 'tot')

In [3]:
corpus = nb.book_corpus(ddk="813%", period=(1980, 2000), limit = 400)

In [4]:
corpus

Unnamed: 0,urn,author,title,year
0,2014022808345,"Hart, Mallory Dorn",duft av sjasmin,1993
1,2014072208257,"Cord, Barry",Skurker må dø,2000
2,2013083008212,"James, Ellen",Jakten på den rette,1999
3,2015011908049,"Puzo, Mario",fjerde K,1992
4,2016071108131,"Kienzle, William",Rosenkrans-mordene,1987
...,...,...,...,...
395,2008062501004,"Shuler, Linda Lay",Ørnens stemme,1996
396,2008071504131,"Greenburg, Dan",Oldefar i tissekassen,1997
397,2010092808140,"Murphy, Warren",Katastrofene,1990
398,2010081108124,"Finder, Joseph",Farlige krefter,1996


In [5]:
def make_coll_df(small, large, tot):
    """take two collocations, small and large, and a reference tot. The reference must av a column call tot as well"""
    coll = pd.DataFrame()
    coll['small'] = small['freq']
    coll['large'] = large['freq']
    coll['ratio'] = coll['small']/coll['large']
    coll['srel'] = coll.small/coll.small.sum()
    coll['lrel'] = coll.large/coll.large.sum()
    coll['ratio_rel'] = coll.srel/coll.lrel
    coll['nb'] = coll.small/tot.tot
    coll['combo'] = small.ascore*large.ascore
    coll['mass_dist'] = coll.srel**coll.combo * coll.ratio_rel
    return coll

Set up the distance parameters and collword, smd is small distance while lmd is large distance. These values are half of the actual window, and used to make a normalized score, call ascore. See below.

In [6]:
smd = 2.5
lmd = 5
collword = 'spiser'

In [7]:
a1 = dist_coll_urn(collword, urns=list(corpus.urn), after= int(2*smd), before = 0)
a2 = dist_coll_urn(collword, urns=list(corpus.urn), after= int(2*lmd), before = 0)

In [8]:
b1 = dist_coll_urn(collword, urns=list(corpus.urn), after=0, before = int(2*smd))
b2 = dist_coll_urn(collword, urns=list(corpus.urn), after=0, before = int(2*lmd))

In [9]:
a1['ascore'] = smd/a1.score
a2['ascore'] = lmd/a2.score

In [10]:
b1['ascore'] = smd/b1.score
b2['ascore'] = lmd/b2.score

In [11]:
 b1.loc['nok']

freq      1.000000
dist     -3.000000
score    -3.000000
ascore   -0.833333
Name: nok, dtype: float64

Create collocation dataframe based on data from a1 and a2

In [12]:
coll = make_coll_df(a1, a2, tot)
collb = make_coll_df(b1, b2, tot)

In [13]:
coll[coll.small > 3].sort_values(by='mass_dist', ascending=False)[:20]

Unnamed: 0,small,large,ratio,srel,lrel,ratio_rel,nb,combo,mass_dist
lunsj,49.0,52.0,0.942308,0.00982,0.005987,1.640069,0.000204,4.740774,7.775195
middag,87.0,91.0,0.956044,0.017435,0.010478,1.663976,0.000134,4.368949,7.269828
kveldsmat,5.0,5.0,1.0,0.001002,0.000576,1.740481,0.000151,3.889719,6.769981
grøten,4.0,4.0,1.0,0.000802,0.000461,1.740481,0.000106,3.013864,5.245573
frokost,45.0,48.0,0.9375,0.009018,0.005527,1.631701,0.000161,3.187149,5.200475
hos,15.0,15.0,1.0,0.003006,0.001727,1.740481,2e-06,2.880184,5.012906
jo,12.0,14.0,0.857143,0.002405,0.001612,1.491841,1e-06,3.28084,4.894491
salat,5.0,5.0,1.0,0.001002,0.000576,1.740481,4.7e-05,2.104377,3.662628
kaker,4.0,4.0,1.0,0.000802,0.000461,1.740481,2.8e-05,1.929012,3.357409
opp,34.0,41.0,0.829268,0.006814,0.004721,1.443326,1e-06,2.278112,3.288057


In [14]:
collb.sort_values(by='mass_dist', ascending=False)[:20]

Unnamed: 0,small,large,ratio,srel,lrel,ratio_rel,nb,combo,mass_dist
lunsj,2.0,2.0,1.0,0.000401,0.000228,1.759719,8.33441e-06,4.740774,8.342433
lunsjen,1.0,1.0,1.0,0.0002,0.000114,1.759719,2.186366e-05,1.923077,3.384076
langsomt,1.0,1.0,1.0,0.0002,0.000114,1.759719,8.077864e-07,1.923077,3.384076
kvelds,1.0,1.0,1.0,0.0002,0.000114,1.759719,2.520161e-05,1.923077,3.384076
vanligvis,2.0,2.0,1.0,0.000401,0.000228,1.759719,1.607924e-06,1.923077,3.384076
all,1.0,1.0,1.0,0.0002,0.000114,1.759719,1.72789e-07,1.733703,3.050831
egen,3.0,3.0,1.0,0.000601,0.000342,1.759719,4.812665e-07,1.692368,2.978093
hel,1.0,1.0,1.0,0.0002,0.000114,1.759719,8.923823e-07,1.674167,2.946064
tidlig,4.0,5.0,0.8,0.000802,0.000569,1.407776,2.018621e-06,2.07655,2.923316
jo,2.0,4.0,0.5,0.000401,0.000456,0.87986,2.271616e-07,3.28084,2.886679


In [15]:
coll.sort_values(by='nb', ascending=False)[:20]

Unnamed: 0,small,large,ratio,srel,lrel,ratio_rel,nb,combo,mass_dist
lunsj,49.0,52.0,0.942308,0.00982,0.005987,1.640069,0.000204,4.740774,7.775195
frokost,45.0,48.0,0.9375,0.009018,0.005527,1.631701,0.000161,3.187149,5.200475
kveldsmat,5.0,5.0,1.0,0.001002,0.000576,1.740481,0.000151,3.889719,6.769981
middag,87.0,91.0,0.956044,0.017435,0.010478,1.663976,0.000134,4.368949,7.269828
grøten,4.0,4.0,1.0,0.000802,0.000461,1.740481,0.000106,3.013864,5.245573
iskrem,3.0,3.0,1.0,0.000601,0.000345,1.740481,7.8e-05,2.431528,4.232028
pizza,3.0,3.0,1.0,0.000601,0.000345,1.740481,6.2e-05,2.994012,5.211021
drikker,21.0,22.0,0.954545,0.004208,0.002533,1.661368,6e-05,1.279257,2.125316
pai,2.0,2.0,1.0,0.000401,0.00023,1.740481,5.7e-05,1.333333,2.320641
pasta,3.0,3.0,1.0,0.000601,0.000345,1.740481,5.5e-05,0.958515,1.668278


In [16]:
coll.sort_values(by='mass_dist', ascending=False)[:20]

Unnamed: 0,small,large,ratio,srel,lrel,ratio_rel,nb,combo,mass_dist
lunsj,49.0,52.0,0.942308,0.00982,0.005987,1.640069,0.000204193,4.740774,7.775195
middag,87.0,91.0,0.956044,0.017435,0.010478,1.663976,0.0001342597,4.368949,7.269828
kveldsmat,5.0,5.0,1.0,0.001002,0.000576,1.740481,0.0001514417,3.889719,6.769981
grøten,4.0,4.0,1.0,0.000802,0.000461,1.740481,0.0001059406,3.013864,5.245573
pizza,3.0,3.0,1.0,0.000601,0.000345,1.740481,6.217359e-05,2.994012,5.211021
frokost,45.0,48.0,0.9375,0.009018,0.005527,1.631701,0.0001612759,3.187149,5.200475
hos,15.0,15.0,1.0,0.003006,0.001727,1.740481,1.553345e-06,2.880184,5.012906
jo,12.0,14.0,0.857143,0.002405,0.001612,1.491841,1.36297e-06,3.28084,4.894491
insekter,3.0,3.0,1.0,0.000601,0.000345,1.740481,1.792522e-05,2.558278,4.452633
iskrem,3.0,3.0,1.0,0.000601,0.000345,1.740481,7.804573e-05,2.431528,4.232028


In [17]:
a1.sort_values(by='ascore', ascending=False)[:30]

Unnamed: 0,freq,dist,score,ascore
lunsj,49.0,1.38,1.41,1.77305
dem,16.0,1.36,1.46,1.712329
jo,12.0,1.36,1.5,1.666667
middag,87.0,1.51,1.53,1.633987
kveldsmat,5.0,1.2,1.56,1.602564
nok,7.0,1.33,1.57,1.592357
pizza,3.0,1.0,1.67,1.497006
senere,3.0,1.0,1.67,1.497006
alltid,6.0,1.4,1.67,1.497006
grøten,4.0,1.33,1.75,1.428571


In [18]:
a2.sort_values(by='ascore', ascending=False)[:30]

Unnamed: 0,freq,dist,score,ascore
lunsj,52.0,1.8,1.87,2.673797
middag,91.0,1.83,1.87,2.673797
kveldsmat,5.0,1.2,2.06,2.427184
frokost,48.0,2.05,2.12,2.358491
hos,15.0,1.93,2.17,2.304147
grøten,4.0,1.33,2.37,2.109705
pizza,3.0,1.0,2.5,2.0
jo,14.0,2.31,2.54,1.968504
insekter,3.0,1.25,2.67,1.872659
salat,5.0,2.0,2.7,1.851852


In [19]:
coll['combo'] = a1.ascore*a2.ascore

In [20]:
coll['mass_dist'] = coll.combo * coll.ratio_rel

In [21]:
coll.sort_values(by='mass_dist', ascending=False)[['nb', 'mass_dist']][:10].fillna(0).style.background_gradient()

Unnamed: 0,nb,mass_dist
lunsj,0.000204193,7.7752
middag,0.00013426,7.26983
kveldsmat,0.000151442,6.76998
grøten,0.000105941,5.24557
pizza,6.21736e-05,5.21102
frokost,0.000161276,5.20047
hos,1.55335e-06,5.01291
jo,1.36297e-06,4.89449
insekter,1.79252e-05,4.45263
iskrem,7.80457e-05,4.23203


In [22]:
top10 = coll.sort_values(by='mass_dist', ascending=False)[['nb', 'combo', 'mass_dist']][:10]

In [23]:
top10['eng'] = ''
top10['eng'] = ['dinner', 'supper (alt)', 'lunch', 'supper', 'cake', 'breakfast', 'dinner',
       'pizza', 'breakfast.', 'proper']

In [24]:
top10 = top10[['eng', 'nb', 'mass_dist']].fillna(0)

In [25]:
top10

Unnamed: 0,eng,nb,mass_dist
lunsj,dinner,0.000204,7.775195
middag,supper (alt),0.000134,7.269828
kveldsmat,lunch,0.000151,6.769981
grøten,supper,0.000106,5.245573
pizza,cake,6.2e-05,5.211021
frokost,breakfast,0.000161,5.200475
hos,dinner,2e-06,5.012906
jo,pizza,1e-06,4.894491
insekter,breakfast.,1.8e-05,4.452633
iskrem,proper,7.8e-05,4.232028


In [26]:
coll.sort_values(by='mass_dist', ascending=False)[['nb', 'combo', 'mass_dist']][:10].fillna(0).to_csv()

',nb,combo,mass_dist\r\nlunsj,0.00020419304160120683,4.7407744529146285,7.775195313914806\r\nmiddag,0.00013425967364096803,4.368949005627206,7.269827619870266\r\nkveldsmat,0.00015144172522413375,3.889718695543938,6.769981336833486\r\ngrøten,0.00010594062028233176,3.0138637733574445,5.245572519360603\r\npizza,6.217358865953743e-05,2.9940119760479043,5.211020844083376\r\nfrokost,0.00016127587133769376,3.1871494135645073,5.200474572296045\r\nhos,1.5533451339237218e-06,2.8801843317972353,5.012905996324447\r\njo,1.3629695927162906e-06,3.2808398950131235,4.894490894049066\r\ninsekter,1.7925215998852785e-05,2.5582775628824628,4.452633393513866\r\niskrem,7.804573480059315e-05,2.4315281668222846,4.23202848273578\r\n'

In [27]:
%%HTML
<table border="1" style="border-collapse: separate;" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>nb</th>\n      <th>combo</th>\n      <th>mass_dist</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>middag</th>\n      <td>0.000798</td>\n      <td>8.281985</td>\n      <td>14.886919</td>\n    </tr>\n    <tr>\n      <th>aftens</th>\n      <td>0.000403</td>\n      <td>8.018475</td>\n      <td>14.719922</td>\n    </tr>\n    <tr>\n      <th>lunsj</th>\n      <td>0.001513</td>\n      <td>8.194035</td>\n      <td>14.638931</td>\n    </tr>\n    <tr>\n      <th>kveldsmat</th>\n      <td>0.000575</td>\n      <td>7.038288</td>\n      <td>12.274517</td>\n    </tr>\n    <tr>\n      <th>kake</th>\n      <td>0.000036</td>\n      <td>4.883385</td>\n      <td>8.964678</td>\n    </tr>\n    <tr>\n      <th>frokost</th>\n      <td>0.000591</td>\n      <td>4.980477</td>\n      <td>8.720121</td>\n    </tr>\n    <tr>\n      <th>middag.</th>\n      <td>0.000000</td>\n      <td>4.699248</td>\n      <td>8.626649</td>\n    </tr>\n    <tr>\n      <th>pizza</th>\n      <td>0.000414</td>\n      <td>4.931939</td>\n      <td>8.230738</td>\n    </tr>\n    <tr>\n      <th>frokost,</th>\n      <td>0.000000</td>\n      <td>3.955696</td>\n      <td>7.261673</td>\n    </tr>\n    <tr>\n      <th>skikkelig</th>\n      <td>0.000010</td>\n      <td>3.843552</td>\n      <td>7.055804</td>\n    </tr>\n  </tbody>\n</table>

Unnamed: 0,nb,combo,mass_dist
middag,0.000798,8.281985,14.886919
aftens,0.000403,8.018475,14.719922
lunsj,0.001513,8.194035,14.638931
kveldsmat,0.000575,7.038288,12.274517
kake,3.6e-05,4.883385,8.964678
frokost,0.000591,4.980477,8.720121
middag.,0.0,4.699248,8.626649
pizza,0.000414,4.931939,8.230738
"frokost,",0.0,3.955696,7.261673
skikkelig,1e-05,3.843552,7.055804
