In [1]:
# --- Configuration: which surface form to audit, and against which wiki dump ---
surface = 'groningen'
# wiki = 'simplewiki-20211120'
wiki = 'nlwiki-20220301'
modelfile = f'wiki/{wiki}/experiments/clean-q0.25.32b.vw'
datafile = f'wiki/{wiki}/experiments/clean-q0.25.dat'

import subprocess

# Run `minimel audit` and stream its stdout; stderr is inherited, so the
# tool's own log messages show up in the cell output.
args = ["minimel", "audit", modelfile, datafile, surface ]
feats = set()
with subprocess.Popen(args, stdout=subprocess.PIPE) as process:
    for raw in process.stdout:
        text = raw.decode('utf8').rstrip()
        # Only tab-prefixed lines are scanned (presumably the per-example
        # audit feature listings -- TODO confirm against the minimel docs).
        if not text.startswith('\t'):
            continue
        for field in text[1:].split('\t'):
            # Keep only entries whose name starts with the 'l^' prefix.
            if field.startswith('l^'):
                feats.add(field)
len(feats)

creating quadratic features for pairs: ls
only testing
using no cache
Reading datafile = none
num sources = 0
Num weight bits = 32
learning rate = 0.5
initial_t = 0
power_t = 0.5
Enabled reductions: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger
Input label = cs
Output pred = prob
average  since         example        example        current        current  current
loss     last          counter         weight          label        predict features
0.000000 0.000000            1            1.0          known            749     3750
0.000000 0.000000            2            2.0          known            749      900
0.250000 0.500000            4            4.0          known            752     1410
0.125000 0.000000            8            8.0          known            752     2040
0.125000 0.125000           16           16.0          known            749     6600
0.125000 0.125000           32           32.0          known            749     1170
0.125000 0.125000         

254730

In [2]:
import io
import pandas as pd

# Parse the collected audit entries as a delimited table: every entry is split
# on '*', ':', '^' or '=', and only fields 2, 4 and 7 are kept as
# (wid, feat, weight). Rows with a missing value in any of those are dropped.
# The separator is a raw string so that `\*` and `\^` reach the regex engine
# unchanged without triggering Python's invalid-escape-sequence warning.
df = pd.read_csv(
    io.StringIO('\n'.join(feats)),
    sep=r"\*|:|\^|=",
    header=None,
    engine="python",
    usecols=[2,4,7],
    names=['wid', 'feat', 'weight']
).dropna()

# Normalize weights: centre each weight on the mean weight of its feature
# (taken over all rows sharing that `feat`) and flip the sign.
# NOTE(review): the sign flip presumably turns model costs into scores where
# higher means "more indicative" -- confirm against the model definition.
df['weight'] = -(df['weight'] - df.groupby('feat')['weight'].transform('mean'))

In [32]:
# Distinct entity ids that occur in the parsed feature table.
select_ents = set(df['wid'].unique())

import sqlite3

# NOTE(review): the connection is intentionally left open here because cells
# outside this view may still use `con` -- confirm, then close it when done.
con = sqlite3.connect(f'wiki/{wiki}/index_{wiki}.db')

# Bind the id as a parameter instead of formatting it into the SQL text; this
# also avoids relying on SQLite treating a double-quoted token as a string.
title_query = 'select wikipedia_title from mapping where wikidata_id = ? limit 1'

# Map each entity id to the title of its first row in the `mapping` table.
ent_label = {}
for e in select_ents:
    qid = f'Q{e}'
    row = con.execute(title_query, (qid,)).fetchone()
    # Fall back to the Q-id itself when the index has no row for this entity,
    # so that later labelling does not fail on a missing key.
    ent_label[e] = row[0] if row is not None else qid

print(dict(sorted(ent_label.items())))

{749: 'Groningen_(stad)', 752: 'Groningen_(provincie)', 17937: 'Station_Groningen', 24711: 'FC_Groningen', 504552: 'Ommelanden_(Groningen)', 508935: 'Groningen_(Suriname)', 743622: 'Bisdom_Groningen-Leeuwarden', 850730: 'Rijksuniversiteit_Groningen', 892526: 'Groningen_(gemeente)', 2030842: 'Rechtbank_Groningen', 2039165: 'Stad_en_Lande', 2342188: 'Heerlijkheid_Groningen', 2537137: 'Kardinge_(ijsbaan)', 2664259: 'Donar_(basketbalclub)', 13638631: 'GHHC_Groningen'}


In [40]:
def topfeat(gr):
    """Return the ten rows of a group with the largest absolute weight.

    Parameters
    ----------
    gr : pandas.DataFrame
        Rows for one entity, with columns ``feat`` and ``weight`` and
        optionally the grouping column ``wid``.

    Returns
    -------
    pandas.DataFrame
        At most ten rows with columns ``feat`` and ``weight``, ordered by
        decreasing ``abs(weight)``, on a fresh 0..n-1 index.
    """
    # errors='ignore': depending on the pandas version, groupby.apply may or
    # may not pass the grouping column to the applied function.
    gr = gr.drop(columns='wid', errors='ignore').set_index('feat').dropna()
    # Reorder by position rather than by label: label-based `.loc` would
    # multiply rows whenever the same feature name occurs more than once.
    order = gr['weight'].abs().to_numpy().argsort()[::-1]
    return gr.iloc[order].head(10).reset_index()

# Top features per entity.
tops = df.groupby('wid').apply(topfeat)
# Pivot so that each entity contributes a 'feat' row and a 'weight' row, with
# one column per rank.
tops = (
    tops
    .swaplevel()
    .unstack()
    .swaplevel(axis=1)
    .sort_index(axis=1)
    .T
)
# Swap the entity ids in the outer row level for their Wikipedia titles.
entity_titles = [ent_label[wid] for wid in tops.index.levels[0]]
tops.index = tops.index.set_levels(entity_titles, level=0)

import seaborn as sns
cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)
# Colour only the 'weight' rows; the 'feat' rows hold text.
weight_rows = pd.IndexSlice[pd.IndexSlice[:, 'weight'], :]
tops.style.background_gradient(cmap=cmap, subset=weight_rows)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
wid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Groningen_(stad),feat,stad,groningen,nederlands,gasthuis,categorie,straat,minerva,pand,seizoen,speelronde
Groningen_(stad),weight,1.510383,1.036810,0.986174,0.941110,0.806381,0.803516,0.801374,0.765340,0.670075,0.640870
Groningen_(provincie),feat,waterschap,provincie,voormalig,geografie,hogeland,buurtschap,eemsdelta,categorie,westerkwartier,polder
Groningen_(provincie),weight,2.051362,2.033106,1.359907,1.191668,1.179173,1.103790,1.074791,0.965262,0.962080,0.843486
Station_Groningen,feat,hij,station,leeuwarden,centraal,treinstellen,tussen,treinen,zwolle,stations,dienstregeling
Station_Groningen,weight,-0.910364,0.607182,0.584886,0.568979,0.524219,0.508955,0.503167,0.468957,0.464625,0.429113
FC_Groningen,feat,categorie,seizoen,verkeersplein,voormalig,waterschap,stad,eredivisie,groningen,club,22
FC_Groningen,weight,-0.624343,0.541916,-0.483899,0.468281,-0.433318,-0.380059,0.373951,-0.317086,0.316307,-0.305664
Ommelanden_(Groningen),feat,groningen,categorie,seizoen,is,brouwer,stijl,oranje,streekproducten,nederlands,nederlandse
Ommelanden_(Groningen),weight,-0.422829,-0.388203,-0.356352,-0.316209,0.311297,0.277870,0.275813,-0.260262,-0.252546,-0.249970
