In [1]:
import dhlab as dh
import dhlab.api.dhlab_api as api

In [19]:
# Reference word-frequency table from dhlab; later cells read its `freq` column.
# NOTE(review): 500000 is presumably the number of top words to fetch — confirm against dh.totals.
tot = dh.totals(500000)

In [21]:
# Relative frequency of each word: its raw count divided by the sum of all counts in `tot`.
tot['tot_rel'] = tot['freq'].div(tot['freq'].sum())

In [None]:
# Bind the corpus frame to `c` so the WordDistance cells further down have a corpus to use.
# WordDistance reads `corpus.urn`, so the frame is assumed to carry a `urn` column.
# NOTE(review): the original cell only displayed the frame, and `c` was not assigned in any
# visible cell; presumably this is the corpus those cells were run against — confirm.
c = dh.Corpus(doctype="digavis", from_year=1950, to_year=1960, limit=25000).frame
c

In [75]:
class WordDistance():
    """Collocation table for one word in a corpus, with association and drift measures.

    The table is stored in ``self.collocation`` (one row per collocate). On top of the
    columns returned by ``api.urn_collocation`` it carries:

    * ``rel``   -- the collocate's share of all collocation counts,
    * ``assoc`` -- ``rel`` divided by the collocate's relative frequency in ``total``.
    """

    # Columns that compute_variance() attaches to self.collocation.
    _DRIFT_COLUMNS = ('drift', 'norm_drift')

    def __init__(self, word, corpus=None, total=None, before=0, after=10, limit=25000):
        """Fetch collocations of ``word`` and compute ``rel`` and ``assoc``.

        Parameters
        ----------
        word : str
            Target word passed to ``api.urn_collocation``.
        corpus : object with a ``urn`` attribute
            Required despite the ``None`` default; ``list(corpus.urn)`` is sent to the API.
        total : DataFrame-like with a ``tot_rel`` column
            Required despite the ``None`` default; reference relative frequencies,
            indexed by word.
        before, after : int
            Window sizes forwarded to ``api.urn_collocation``.
        limit : int
            Currently unused; kept so existing calls keep working.

        Raises
        ------
        ValueError
            If ``corpus`` or ``total`` is missing.
        """
        if corpus is None or total is None:
            raise ValueError(
                "WordDistance requires both `corpus` (with a `urn` column) "
                "and `total` (with a `tot_rel` column)"
            )

        self.corpus = corpus
        res = api.urn_collocation(list(corpus.urn), word=word, before=before, after=after)
        res['rel'] = res.counts / res.counts.sum()

        # Division aligns on the index, so collocates missing from `total` come out as NaN.
        res['assoc'] = res.rel / total.tot_rel
        # For those, fall back to the smallest reference frequency. Series.min() skips NaN,
        # unlike the builtin min(), whose result depends on where a NaN sits.
        res['assoc'] = res['assoc'].fillna(res.rel / total.tot_rel.min())

        self.collocation = res
        self.word = word
        self.before = before
        self.after = after
        self.total = total

    def check(self, words=None):
        """Return the collocation rows for those ``words`` that occur in the table.

        Words not in the index are silently skipped; the order of ``words`` is kept.
        ``None`` or an empty list gives an empty frame with the same columns.
        """
        present = [w for w in (words or []) if w in self.collocation.index]
        if not present:
            return self.collocation.iloc[0:0]
        return self.collocation.loc[present]

    def _drift_frame(self, other, column, own_label, other_label):
        """Inner-join ``column`` of both tables and add ``drift`` / ``norm_drift``."""
        own = self.collocation[[column]].rename(columns={column: own_label})
        theirs = other.collocation[[column]].rename(columns={column: other_label})
        df = own.join(theirs, how='inner')
        df['drift'] = df[other_label] - df[own_label]
        # Half the difference between the two `after` windows. There is no guard for
        # equal windows: the divisor is then 0.0 and norm_drift is not finite.
        expected_drift = (other.after - self.after) / 2
        df['norm_drift'] = df['drift'] / expected_drift
        return df

    def compare_drift(self, other):
        """Compare the ``dist`` column of this table with that of ``other``.

        Returns a new frame for the collocates present in both tables, with columns
        ``dist_self``, ``dist_other``, ``drift`` (other minus self) and ``norm_drift``
        (drift divided by half the difference in ``after``). Nothing is stored on
        either instance.
        """
        return self._drift_frame(other, 'dist', 'dist_self', 'dist_other')

    def compute_variance(self, larger_instance):
        """
        Estimate positional variance via drift in bdist over increasing window size.

        Returns a frame for the collocates present in both tables, with columns
        ``bdist_small``, ``bdist_large``, ``drift`` and ``norm_drift``. As a side
        effect, ``drift`` and ``norm_drift`` are joined onto ``self.collocation``;
        collocates absent from ``larger_instance`` get NaN there.

        Safe to call repeatedly: values from an earlier call are replaced rather than
        colliding in the join.
        """
        df = self._drift_frame(larger_instance, 'bdist', 'bdist_small', 'bdist_large')

        # Drop results of a previous call first; joining onto existing columns of the
        # same name would raise a "columns overlap" ValueError.
        stale = [col for col in self._DRIFT_COLUMNS if col in self.collocation.columns]
        self.collocation = self.collocation.drop(columns=stale).join(
            df[list(self._DRIFT_COLUMNS)]
        )

        return df

In [76]:
# Collocation profile for "spise" with before=0 and after=20, scored against `tot`.
spise = WordDistance("spise", corpus=c, before=0, after=20, total = tot)

In [77]:
# Same word and corpus with a wider window (after=40); used as the "larger" instance for drift.
spise40 = WordDistance("spise", corpus=c, before=0, after=40, total = tot)

In [78]:
# Returns the bdist drift table and, as a side effect, joins `drift` and `norm_drift`
# onto spise.collocation.
spise.compute_variance(spise40)

Unnamed: 0,bdist_small,bdist_large,drift,norm_drift
taremel,1.716981,8.932432,7.215451,0.721545
lunsj,1.956989,3.459184,1.502194,0.150219
kirsebær,3.187500,8.243243,5.055743,0.505574
middag,3.302974,6.168053,2.865079,0.286508
hvalkjøtt,3.586207,9.314286,5.728079,0.572808
...,...,...,...,...
viljesyrke,20.000000,33.333333,13.333333,1.333333
neren,20.000000,33.333333,13.333333,1.333333
Stange,20.000000,33.600000,13.600000,1.360000
bunkevls,20.000000,33.333333,13.333333,1.333333


In [81]:
# The ten collocates with the smallest bdist.
spise.collocation.sort_values(by="bdist").head(10)

Unnamed: 0,counts,dist,bdist,rel,assoc,drift,norm_drift
taremel,51,51,1.716981,0.00025,3882.554836,7.215451,0.721545
lunsj,184,324,1.956989,0.000901,103.282064,1.502194,0.150219
kirsebær,30,62,3.1875,0.000147,106.902615,5.055743,0.505574
middag,536,1737,3.302974,0.002623,62.697531,2.865079,0.286508
hvalkjøtt,27,64,3.586207,0.000132,219.310547,5.728079,0.572808
frokost,215,741,3.599078,0.001052,81.461197,3.948166,0.394817
lunch,33,88,3.657143,0.000162,82.951105,1.142857,0.114286
hatten,32,86,3.705882,0.000157,19.898464,4.109907,0.410991
mette,109,386,3.837838,0.000533,236.749796,1.444213,0.144421
slikkerier,63,217,3.953846,0.000308,932.070773,4.903297,0.49033


| Høy assoc | Lav bdist | Lav norm\_drift | Tolkning                                  |
| --------- | --------- | --------------- | ----------------------------------------- |
| ✅         | ✅         | ✅               | Kjerneord (valens, syntaks)               |
| ✅         | ❌         | ✅               | Diskursivt relevant, tematisk tett        |
| ✅         | ❌         | ❌               | Langdistanse partner, kanskje stilistisk  |
| ❌         | ✅         | ✅               | Syntaktisk nær, men ikke semantisk viktig |
| ❌         | ❌         | ❌               | Støy                                      |


In [91]:
# Look up punctuation marks and function words in the collocation table.
spise.check(". , : og i ? har hatt".split())

Unnamed: 0,counts,dist,bdist,rel,assoc,drift,norm_drift
.,12347,121277,9.824034,0.06043,0.921489,9.858304,0.98583
",",8455,82855,9.801939,0.041381,0.956167,9.80998,0.980998
:,800,9182,11.498753,0.003915,0.40392,10.936599,1.09366
og,5471,53885,9.852914,0.026777,1.240275,10.117693,1.011769
i,4173,45901,11.003832,0.020424,0.94191,10.185898,1.01859
?,876,8097,9.267654,0.004287,1.63743,8.943289,0.894329
har,1357,15053,11.10596,0.006642,1.129871,10.323163,1.032316
hatt,55,586,10.982456,0.000269,1.158318,10.714265,1.071427


In [55]:
# Write the 20 collocates with the highest counts to spise_hi_counts.xlsx (relative path).
spise.collocation.sort_values(by="counts", ascending=False).head(20).to_excel("spise_hi_counts.xlsx")

In [60]:
# The 20 collocates with the smallest bdist; the Excel export at the end is commented out.
spise.collocation.sort_values(by="bdist", ascending=True).head(20)#.to_excel("spise_bdist.xlsx")

Unnamed: 0,counts,dist,bdist,rel,assoc
taremel,51,51,1.716981,0.00025,3882.554836
lunsj,184,324,1.956989,0.000901,103.282064
kirsebær,30,62,3.1875,0.000147,106.902615
middag,536,1737,3.302974,0.002623,62.697531
hvalkjøtt,27,64,3.586207,0.000132,219.310547
frokost,215,741,3.599078,0.001052,81.461197
lunch,33,88,3.657143,0.000162,82.951105
hatten,32,86,3.705882,0.000157,19.898464
mette,109,386,3.837838,0.000533,236.749796
slikkerier,63,217,3.953846,0.000308,932.070773


In [58]:
# The 20 collocates with the highest assoc; the Excel export at the end is commented out.
spise.collocation.sort_values(by="assoc", ascending=False).head(20)#.to_excel("spise_assoc.xlsx")

Unnamed: 0,counts,dist,bdist,rel,assoc
taremel,51,51,1.716981,0.00025,3882.554836
mineralsalter,21,337,16.391304,0.000103,1291.382668
slikkerier,63,217,3.953846,0.000308,932.070773
Dorethe,8,91,13.1,3.9e-05,924.87995
hjemmeleksene,8,11,5.1,3.9e-05,635.79868
avmagringskur,5,52,13.142857,2.4e-05,542.278463
dieten,5,69,15.571429,2.4e-05,502.678681
gaflene,6,84,15.5,2.9e-05,480.592673
råkost,26,117,5.607143,0.000127,468.433379
migrenen,4,50,15.0,2e-05,446.101573


In [59]:
# Tableware vocabulary in the after=20 table; words missing from the index are skipped by check().
spise.check("kniver gafler tallerken tallerkener bord glass askjett askjetter".split())

Unnamed: 0,counts,dist,bdist,rel,assoc
kniver,4,26,11.0,2e-05,6.803929
gafler,5,38,11.142857,2.4e-05,32.176826
tallerken,13,93,8.866667,6.4e-05,26.700361
tallerkener,7,55,10.555556,3.4e-05,24.245915
bord,38,343,9.575,0.000186,3.742591
glass,38,365,10.125,0.000186,6.464119


In [62]:
# Same tableware vocabulary in the after=40 table, for comparison with the narrower window.
spise40.check("kniver gafler tallerken tallerkener bord glass askjett askjetter".split())

Unnamed: 0,counts,dist,bdist,rel,assoc
kniver,8,153,23.3,2e-05,6.803929
gafler,8,137,21.7,2e-05,25.741461
tallerken,17,213,15.421053,4.2e-05,17.457928
tallerkener,10,150,19.166667,2.4e-05,17.318511
bord,61,1071,18.269841,0.000149,3.003922
glass,82,1758,21.880952,0.000201,6.974444


In [64]:
# The 20 collocates with the smallest bdist in the after=40 table.
spise40.collocation.sort_values(by="bdist").head(20)

Unnamed: 0,counts,dist,bdist,rel,assoc
lunsj,194,598,3.459184,0.000475,54.44761
lunch,33,88,4.8,8.1e-05,41.475553
mette,115,538,5.282051,0.000281,124.890947
middag,599,3627,6.168053,0.001466,35.033415
mett,138,835,6.535714,0.000338,92.743108
drikke,349,2427,7.14245,0.000854,36.996329
frokost,252,1837,7.547244,0.000617,47.740051
hatten,36,217,7.815789,8.8e-05,11.192886
spaghetti,19,90,8.095238,4.6e-05,92.169566
kirsebær,35,225,8.243243,8.6e-05,62.359859


In [28]:
# Collocates with more than 12 occurrences, ordered by ascending bdist.
# NOTE(review): `res` is not assigned at notebook level in any visible cell — this cell
# fails on a fresh kernel unless `res` is defined elsewhere; confirm its origin.
res[res.counts > 12].sort_values(by="bdist", ascending=True).head(20)

Unnamed: 0,counts,dist,bdist,avg_dist,rel,assoc
menneskeverd,13,33,8.866667,2.538462,5e-05,41.613334
vesteuropeisk,13,61,10.733333,4.692308,5e-05,157.979202
forutsetter,27,230,11.37931,8.518519,0.000104,6.652112
diktatur,185,2070,11.604278,11.189189,0.00071,221.363031
sosialisme,49,574,13.215686,11.714286,0.000188,100.350212
Fiolinkonsert,13,107,13.8,8.230769,5e-05,44.382145
forstand,42,513,13.931818,12.214286,0.000161,9.074619
kommunisme,26,295,14.107143,11.346154,0.0001,96.951973
Henry,19,223,15.380952,11.736842,7.3e-05,2.497724
rettssikkerhet,15,171,15.941176,11.4,5.8e-05,55.157106


In [29]:
# Collocates with more than 12 occurrences, ordered by descending assoc.
# NOTE(review): `res` is not assigned at notebook level in any visible cell — this cell
# fails on a fresh kernel unless `res` is defined elsewhere; confirm its origin.
res[res.counts > 12].sort_values(by="assoc", ascending=False).head(20)

Unnamed: 0,counts,dist,bdist,avg_dist,rel,assoc
folkedemokrati,21,289,16.913043,13.761905,8.1e-05,1040.113259
diktatur,185,2070,11.604278,11.189189,0.00071,221.363031
folkestyre,65,980,16.119403,15.076923,0.000249,185.568033
demokrati,495,11861,24.066398,23.961616,0.0019,182.15853
parlamentarisme,25,336,16.148148,13.44,9.6e-05,180.504548
vesteuropeisk,13,61,10.733333,4.692308,5e-05,157.979202
demokratiets,53,1193,23.509091,22.509434,0.000203,155.637146
Demokratiet,23,424,20.96,18.434783,8.8e-05,140.893449
styreform,26,604,25.142857,23.230769,0.0001,138.499689
prisloven,15,384,28.470588,25.6,5.8e-05,132.279997


In [None]:
# NOTE(review): bare attribute reference — the method is never called, so this only
# evaluates to the bound method. Looks like an unfinished leftover cell.
res.sort_values