In [4]:
import dhlab.nbtext as nb
import dhlab.module_update as mu
mu.update('collocations', overwrite=True)
from collocations import urn_coll, collocation
from numpy import log

In [137]:

from matplotlib import colors

# Colormap name kept at module level. show_frame below takes its own `cmap`
# parameter (also defaulting to 'Blues'), so no function visible in this
# notebook reads this global.
cmap = "Blues"

def inspect(harry, w):
    """Return the entry labelled ``w`` from ``harry`` using a ``.loc`` lookup.

    Note: the name shadows the standard-library ``inspect`` module inside
    this notebook's namespace.
    """
    return harry.loc[w]

def check(word, frames):
    """Collect the 'score' of ``word`` from every frame whose index contains it.

    Parameters:
        word: index label to look up.
        frames: mapping from a key to a frame that has an ``index`` and a
            'score' column.

    Returns:
        dict mapping each key of ``frames`` whose frame contains ``word`` to
        that word's score; frames without the word are left out.
    """
    found = dict()
    for key in frames:
        if word in frames[key].index:
            found[key] = inspect(frames[key]['score'], word)
    return found

def dist(obs_mean, expected, freq):
    """Pull an observed mean distance towards the expected distance.

    Computes ``obs_mean + (expected - obs_mean) / freq``. With ``freq == 1``
    the result equals ``expected``; the larger ``freq`` is, the closer the
    result stays to ``obs_mean``. Works on scalars and, element-wise, on
    pandas Series / numpy arrays.

    Parameters:
        obs_mean: observed mean distance (scalar or array-like).
        expected: expected distance used as the shrinkage target.
        freq: number of observations behind ``obs_mean`` (must be non-zero).

    Returns:
        The adjusted distance, same shape as the inputs.
    """
    # The divisor used to be derived as
    #   obs_mean / (obs_mean - ((freq - 1) / freq) * obs_mean)
    # which is algebraically just `freq`, but evaluates 0/0 whenever
    # obs_mean is 0 (ZeroDivisionError for scalars, NaN for Series).
    # Dividing by `freq` directly gives the same score without that hole.
    return obs_mean + (expected - obs_mean) / freq


def create_frame(coll, expected):
    """Turn one collocation result into a frame with an added 'score' column.

    ``coll`` is passed to ``nb.frame``, transposed, and passed to ``nb.frame``
    again together with the names 'freq', 'doc' and 'dist'
    (presumably these become the column names - confirm against dhlab.nbtext).
    The 'score' column is ``dist`` applied to the 'dist' and 'freq' columns.

    Parameters:
        coll: raw collocation data for one time window.
        expected: expected distance handed on to ``dist``.

    Returns:
        The frame, including the new 'score' column.
    """
    column_names = 'freq doc dist'.split()
    transposed = nb.frame(coll).transpose()
    table = nb.frame(transposed, column_names)
    table['score'] = dist(table['dist'], expected, table['freq'])
    return table

def colls2df(colls, expected):
    """Apply ``create_frame`` to every entry of ``colls``.

    Parameters:
        colls: mapping from a key (e.g. a time window) to raw collocation data.
        expected: expected distance, forwarded unchanged to ``create_frame``.

    Returns:
        dict with the same keys as ``colls`` and the scored frames as values.
    """
    return {key: create_frame(colls[key], expected) for key in colls}

def calculate_midpoint(before, after):
    """Return the midpoint of a context window given its two extents.

    The result is ``(after - before + shift) / 2`` where ``shift`` is
    +1 when ``before`` is 0, otherwise -1 when ``after`` is 0, otherwise 0.
    For example (0, 10) gives 5.5 and (10, 0) gives -5.5.
    """
    # `before == 0` is checked first, so (0, 0) takes the +1 shift.
    shift = 1 if before == 0 else (-1 if after == 0 else 0)
    return (after - before + shift) / 2
    
def make_collocations(word, period=(1945, 1990), step = 3, before = 0, after = 10, corpus = 'avis'):
    """Fetch collocations of ``word`` per time window and score them.

    One ``collocation`` request is made for every window
    ``(year, year + step)`` with ``year`` running over
    ``range(period[0], period[1], step)``. Note that the last window can
    extend past ``period[1]`` (e.g. 1989-1992 for period=(1965, 1990)).

    A failing request is retried once, since the server may have loaded the
    data in the meantime. If the retry also fails, the window is reported
    together with the error and left out of the result.

    Parameters:
        word: target word.
        period: (first year, end year) pair controlling the window starts.
        step: window length in years.
        before: value passed as ``before`` to ``collocation``.
        after: value passed as ``after`` to ``collocation``.
        corpus: corpus name passed to ``collocation`` (default 'avis').

    Returns:
        (colls_df, scores): the dict of per-window frames produced by
        ``colls2df`` and the combined table produced by ``score_df``.
    """
    colls = dict()
    for year in range(period[0], period[1], step):
        window = (year, year + step)
        print('behandler: ', year, year + step)
        for attempt in range(2):
            try:
                colls[window] = collocation(word, yearfrom = year, yearto = year + step, corpus = corpus, before = before, after = after)
                break
            # Catch Exception rather than everything, so that Ctrl-C
            # (KeyboardInterrupt) still stops a long-running loop.
            except Exception as exc:
                if attempt == 0:
                    # try again - things may have loaded on the server...
                    print('prøver en gang til for: ', window)
                else:
                    print('klarte ikke: ', window, exc)
    colls_df = colls2df(colls, calculate_midpoint(before, after))
    return colls_df, score_df(colls_df)


def score_df(df):
    """Combine the 'score' column of every frame in ``df`` into one table.

    ``df`` is a mapping from a key to a frame with a 'score' column. The
    collected columns are passed to ``nb.frame`` and the result is transposed.
    """
    scores_by_key = {x: df[x]['score'] for x in df}
    return nb.frame(scores_by_key).transpose()
def display_vals(kr_df, word, clip = 0):
    """Return the row labelled ``word`` after masking values below ``clip``.

    Cells of ``kr_df`` that are not ``>= clip`` become NaN before the row is
    selected with ``.loc``.
    """
    masked = kr_df[kr_df >= clip]
    return masked.loc[word]

def show_frame(df, colnum = 0,  clip = 0, fillval= 10, cmap = 'Blues', up = True, axis=0, first_row=0, number_of_rows = 20): 
    if up == True:
        cmap = cmap + '_r'
        dfc = df[df >= clip]
    else:
        dfc = df[df <= clip]
    return dfc.sort_values(by = df.columns[colnum], ascending=up)[first_row:first_row + number_of_rows].fillna(fillval).style.background_gradient(cmap=cmap,axis=axis)

In [138]:
# 'helse' with before=0, after=10 (what make_scores calls the right context),
# in the default 3-year windows starting 1965.
# helse_r: dict of per-window frames; helse_right: combined score table.
helse_r, helse_right = make_collocations('helse', period=(1965, 1990), before=0, after=10)

behandler:  1965 1968
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992


In [139]:
# 'helse' with before=10, after=0 (what make_scores calls the left context),
# in the default 3-year windows starting 1965.
# helse_l: dict of per-window frames; helse_left: combined score table.
helse_l, helse_left = make_collocations('helse', period=(1965, 1990), before=10, after=0)

behandler:  1965 1968
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992


In [157]:
def make_scores(word, period=(1965, 1990), before=10, after=10):
    """Run ``make_collocations`` twice: right context first, then left.

    The right-context run uses ``before=0, after=after``; the left-context
    run uses ``before=before, after=0``. A header line is printed before each.

    Returns:
        (right frames, right score table, left frames, left score table).
    """
    right_window = (0, after)
    print('Right context:', right_window)
    right_frames, right_scores = make_collocations(
        word, period=period, before=right_window[0], after=right_window[1]
    )

    left_window = (before, 0)
    print('Left context:', left_window)
    left_frames, left_scores = make_collocations(
        word, period=period, before=left_window[0], after=left_window[1]
    )

    return right_frames, right_scores, left_frames, left_scores

In [158]:
# Right- and left-context collocation scores for 'HIV', 10 positions each side.
# *_r / *_l are the per-window frame dicts, *_right / *_left the score tables.
hiv_r, hiv_right, hiv_l, hiv_left = make_scores('HIV', period=(1965, 1990), before=10, after=10)

Right context: (0, 10)
behandler:  1965 1968
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992
Left context: (10, 0)
behandler:  1965 1968
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992


In [169]:
# Right- and left-context collocation scores for 'sykdom', 10 positions each side.
# *_r / *_l are the per-window frame dicts, *_right / *_left the score tables.
sykdom_r, sykdom_right, sykdom_l, sykdom_left = make_scores('sykdom', period=(1965, 1990), before=10, after=10)

Right context: (0, 10)
behandler:  1965 1968
prøver en gang til for:  (1965, 1968)
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992
prøver en gang til for:  (1989, 1992)
Left context: (10, 0)
behandler:  1965 1968
behandler:  1968 1971
behandler:  1971 1974
behandler:  1974 1977
behandler:  1977 1980
behandler:  1980 1983
behandler:  1983 1986
behandler:  1986 1989
behandler:  1989 1992


In [168]:
# Left context of 'HIV': keep scores <= 0 (up=False), sort descending on the
# column at position 8, show missing cells as -10; axis=1 is forwarded to
# background_gradient.
show_frame(hiv_left, colnum=8, fillval=-10, up=False, axis=1)

Unnamed: 0_level_0,1965,1968,1971,1974,1977,1980,1983,1986,1989
Unnamed: 0_level_1,1968,1971,1974,1977,1980,1983,1986,1989,1992
102,-10.0,-5.5,-5.5,-10,-10,-10.0,-10.0,-5.5,-1.19565
711,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-10.0,-1.20663
71102,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-10.0,-1.23684
PÅL,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-10.0,-1.64286
731,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-10.0,-1.64286
pågår,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-7.25,-1.64286
Om,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-4.8,-1.72077
katteslektens,-10.0,-10.0,-10.0,-10,-10,-10.0,-10.0,-1.75,-1.75
MED,-10.0,-5.5,-10.0,-10,-10,-5.75,-10.0,-2.125,-1.75641
omkring,-10.0,-10.0,-10.0,-10,-10,-10.0,-3.61111,-2.31161,-1.79644


In [175]:
# Right context of 'sykdom': keep scores >= 0 (up=True), sort ascending on the
# column at position 4, missing cells get the default fill value 10; axis=1 is
# forwarded to background_gradient.
show_frame(sykdom_right, colnum=4, up=True, axis=1)

Unnamed: 0_level_0,1965,1968,1971,1974,1977,1980,1983,1986,1989
Unnamed: 0_level_1,1968,1971,1974,1977,1980,1983,1986,1989,1992
sokes,2.1665,3.60879,2.75561,1.5,1.60332,4.09556,3.9,4.56667,5.35417
Arbeidsgiver,10.0,10.0,10.0,10.0,1.75,10.0,10.0,10.0,5.0
selges,2.8769,2.07018,4.37397,5.21828,1.84781,2.67168,2.33904,2.15359,3.11442
tvang,2.80769,1.75781,1.75,1.97656,1.9,2.60227,2.75,3.98125,3.47348
Luftveisinfeksjoner,10.0,10.0,10.0,1.99,1.90693,10.0,10.0,10.0,10.0
Sykdommer,5.5,5.5,10.0,1.86364,1.92094,5.0,5.5,4.27778,4.94444
Navn,6.75,5.5,10.0,5.5,2.06,5.0,5.5,6.05556,6.5
inntreffer,1.9375,3.61111,2.63846,1.78307,2.08333,3.08333,2.19163,1.70833,3.28571
fødselshjelp,5.5,10.0,10.0,5.5,2.0875,2.10606,10.0,10.0,10.0
bort-,2.21705,2.79499,2.1822,2.10598,2.09661,2.10606,2.58333,2.7,2.56897


In [174]:
# Left context of 'sykdom': keep scores <= 0 (up=False), sort descending on the
# column at position 4, show missing cells as -10; axis=1 is forwarded to
# background_gradient.
show_frame(sykdom_left, colnum=4, fillval=-10, up=False, axis=1)

Unnamed: 0_level_0,1965,1968,1971,1974,1977,1980,1983,1986,1989
Unnamed: 0_level_1,1968,1971,1974,1977,1980,1983,1986,1989,1992
barns,-4.32143,-3.61111,-1.46639,-1.2192,-1.10546,-2.29135,-1.3314,-1.64657,-1.4335
kv.,-5.5,-5.5,-10.0,-10.0,-1.12162,-1.15,-10.0,-10.0,-5.5
Grunnet,-1.05643,-1.08437,-1.13991,-1.55547,-1.14016,-1.08892,-1.10142,-1.19409,-1.24697
tids,-1.13159,-1.12234,-1.88912,-1.18526,-1.18214,-1.12586,-1.24915,-1.4557,-1.12367
kronisk,-2.27868,-1.56216,-2.2781,-2.07047,-1.21948,-1.76335,-1.64158,-1.50791,-1.86023
revmatisk,-1.79653,-1.9,-1.45,-1.24,-1.23684,-1.5625,-1.93939,-1.60084,-1.12162
dødelig,-2.2,-1.58681,-1.62078,-1.4418,-1.26327,-1.61393,-1.43135,-1.13639,-1.23296
smittsom,-1.22024,-1.19566,-2.15751,-2.20922,-1.33161,-1.21905,-1.29766,-1.31806,-1.94518
mental,-2.9375,-1.40909,-1.83333,-1.24422,-1.34444,-1.45,-1.88095,-3.7,-3.46296
Andersens,-2.0625,-1.40909,-1.64286,-1.94444,-1.375,-4.26543,-1.75,-1.9,-2.68519


In [145]:
# Right context of 'helse': keep scores >= 0 (up=True), sort ascending on the
# column at position 7, missing cells get the default fill value 10; axis=1 is
# forwarded to background_gradient.
show_frame(helse_right, colnum=7, up=True, axis=1)

Unnamed: 0_level_0,1965,1968,1971,1974,1977,1980,1983,1986,1989
Unnamed: 0_level_1,1968,1971,1974,1977,1980,1983,1986,1989,1992
vesenet,10.0,5.5,5.5,10.0,5.5,2.6,1.94444,1.40909,1.75
Nytil,10.0,10.0,10.0,10.0,10.0,10.0,5.5,1.45139,10.0
Nytilsatte,10.0,10.0,10.0,10.0,1.9375,1.67555,2.1,1.69141,10.0
søster,3.57407,2.5,2.78571,4.62,3.16667,2.21875,1.88462,1.75,2.4375
messige,5.5,5.5,3.5,10.0,1.9,2.5,3.16667,1.9,1.7375
messig,3.25,3.25,5.5,3.25,5.5,3.33333,3.33333,1.9,5.5
stasjonen,5.5,5.5,4.94444,3.83333,5.5,2.5,4.5,1.9,1.91667
Of,4.52778,5.5,7.83333,10.0,10.0,3.48611,3.37,1.9,2.94211
løs,1.61265,2.57692,3.26257,2.11222,1.34917,1.4423,1.48446,1.91892,1.88649
sosialtjeneste,10.0,10.0,2.35,2.44805,2.26923,2.67842,2.15341,2.08358,2.55


In [146]:
# Left context of 'helse': keep scores <= 0 (up=False), sort descending on the
# column at position 8, show missing cells as -10; axis=1 is forwarded to
# background_gradient.
show_frame(helse_left, colnum=8, fillval=-10, up=False, axis=1)

Unnamed: 0_level_0,1965,1968,1971,1974,1977,1980,1983,1986,1989
Unnamed: 0_level_1,1968,1971,1974,1977,1980,1983,1986,1989,1992
sviktende,-1.11425,-1.05573,-1.04671,-1.0332,-1.08922,-1.0534,-1.07721,-1.09637,-1.05261
Sviktende,-1.64286,-1.45,-2.125,-1.5,-1.5,-1.64286,-2.125,-3.25,-1.18
mentale,-1.40554,-1.27306,-1.22708,-1.37162,-1.50521,-1.18635,-1.36209,-1.19605,-1.19497
Sinnets,-1.04455,-1.19028,-1.225,-1.11181,-1.48416,-1.16667,-1.34615,-2.125,-1.22304
mental,-1.41196,-1.30697,-1.25833,-1.32824,-1.26391,-1.14278,-1.20821,-1.26512,-1.23661
skrantende,-3.25,-1.40909,-2.5,-2.125,-1.9,-1.5,-1.3,-1.27778,-1.25269
Husdyras,-10.0,-10.0,-10.0,-5.5,-10.0,-10.0,-10.0,-10.0,-1.28125
kroppog,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-1.3
darlig,-1.375,-1.75,-1.75,-2.72222,-2.125,-1.92857,-2.875,-5.875,-1.34615
eiga,-2.5,-3.8125,-2.0,-3.16667,-1.96154,-1.48545,-1.57841,-1.46258,-1.35563


In [150]:
# Left-context scores of the full stop '.' for 'helse', one value per window.
helse_left.loc['.']

1965  1968   -6.422876
1968  1971   -6.257627
1971  1974   -5.909580
1974  1977   -6.241331
1977  1980   -6.415106
1980  1983   -6.523024
1983  1986   -6.595112
1986  1989   -6.445508
1989  1992   -6.427677
Name: ., dtype: float64

In [152]:
# Right-context scores of the full stop '.' for 'helse', one value per window.
helse_right.loc['.']

1965  1968    4.877031
1968  1971    4.904218
1971  1974    4.377833
1974  1977    4.487399
1977  1980    4.756179
1980  1983    4.636051
1983  1986    4.374847
1986  1989    4.566849
1989  1992    5.233406
Name: ., dtype: float64