In [1]:
# Surface form to audit, and the Wikipedia dump whose model/data files are used.
# Alternatives that have been tried are kept commented out.
# surface = 'madrid'
surface = 'utrecht'
# wiki = 'simplewiki-20211120'
wiki = 'nlwiki-20220301'
# wiki = 'eswiki-20220301'
modelfile = f'wiki/{wiki}/experiments/clean-q0.25.32b.vw'
datafile = f'wiki/{wiki}/experiments/clean-q0.25.dat'

import subprocess

# Run `minimel audit` and collect every distinct tab-separated field that
# starts with 'l^' from the lines that begin with a tab.
args = ["minimel", "audit", modelfile, datafile, surface ]
feats = set()
with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=None) as process:
    for raw in process.stdout:
        text = raw.decode('utf8').rstrip()
        if not text.startswith('\t'):
            # Only tab-prefixed lines carry feature fields; skip the rest.
            continue
        for field in text[1:].split('\t'):
            if field.startswith('l^'):
                feats.add(field)
len(feats)

creating quadratic features for pairs: ls
only testing
using no cache
Reading datafile = none
num sources = 0
Num weight bits = 32
learning rate = 0.5
initial_t = 0
power_t = 0.5
Enabled reductions: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger
Input label = cs
Output pred = prob
average  since         example        example        current        current  current
loss     last          counter         weight          label        predict features
0.000000 0.000000            1            1.0          known            803      208
0.500000 1.000000            2            2.0          known            803     3172
0.250000 0.000000            4            4.0          known            776     1664
0.250000 0.250000            8            8.0          known            776     2418
0.375000 0.500000           16           16.0          known            776     1794
0.187500 0.000000           32           32.0          known            803     1898
0.109375 0.031250         

253045

In [2]:
import io
import pandas as pd

# Parse the collected feature strings into a (wid, feat, weight) table.
# Every string is split on '*', ':', '^' and '='; fields 2, 4 and 7 are kept.
# NOTE(review): this presumes strings shaped like
# `l^wid=<id>*s^<token>:<hash>:<value>:<weight>` -- confirm against the audit output.
df = pd.read_csv(
    io.StringIO('\n'.join(feats)),
    # Raw string: '\*' and '\^' are not valid escapes in a plain literal and
    # trigger a DeprecationWarning/SyntaxWarning; the regex itself is unchanged.
    sep=r"\*|:|\^|=",
    header=None,
    engine="python",
    usecols=[2,4,7],
    names=['wid', 'feat', 'weight']
).dropna()

# Normalize weights: centre each feature's weights on their mean across
# entities, then flip the sign.
# NOTE(review): the sign flip presumably turns cost-like weights into
# "higher is better" scores -- confirm.
df['weight'] = -(df['weight'] - df.groupby('feat')['weight'].transform('mean'))

In [3]:
# Entities (numeric Wikidata ids) that occur in the parsed feature table.
select_ents = set(df['wid'].unique())

import sqlite3

# NOTE(review): the connection is left open on purpose, in case later cells
# reuse `con`; close it explicitly once it is no longer needed.
con = sqlite3.connect(f'wiki/{wiki}/index_{wiki}.db')

# Map each entity id to its Wikipedia title.
ent_label = {}
for e in select_ents:
    # Bind the id as a parameter instead of formatting it into the SQL text:
    # the previous double-quoted "Q..." is an identifier in standard SQL and
    # only worked through SQLite's legacy string-literal fallback.
    row = con.execute(
        'select wikipedia_title from mapping where wikidata_id = ? limit 1',
        (f'Q{e}',),
    ).fetchone()
    # Fall back to the bare Q-id when the index has no row for this entity,
    # rather than failing with an opaque lookup error.
    ent_label[e] = row[0] if row is not None else f'Q{e}'

print(dict(sorted(ent_label.items())))

{776: 'Utrecht_(provincie)', 803: 'Utrecht_(stad)', 18108: 'Utrecht_(Zuid-Afrika)', 24680: 'FC_Utrecht', 221653: 'Universiteit_Utrecht', 261716: 'Aartsbisdom_Utrecht_(rooms-katholiek)', 575655: 'Station_Utrecht_Centraal', 707767: 'Sticht_Utrecht', 847384: 'Utrechts_Conservatorium', 2012748: 'Vechtsebanen', 2193594: 'Hr.Ms._Utrecht_(1901)', 2679365: 'Heerlijkheid_Utrecht', 85308316: 'BVC_Utrecht'}


In [4]:
def topfeat(gr, n=10):
    """Return the ``n`` features with the largest absolute weight in a group.

    Parameters
    ----------
    gr : pandas.DataFrame
        Rows for one entity, with a ``feat`` column and a numeric ``weight``
        column. A ``wid`` column is dropped if present.
    n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows with ``feat`` as the first column, ordered by
        descending ``abs(weight)`` and indexed ``0..k-1``. Rows with missing
        values are discarded; ties keep their original relative order.
    """
    # errors='ignore': newer pandas releases can hand `groupby.apply` a group
    # that no longer contains the grouping column.
    ranked = gr.drop(columns='wid', errors='ignore').set_index('feat').dropna()
    # Order by position rather than by label, so that a repeated `feat` label
    # cannot multiply rows the way a label-based `.loc` lookup would.
    order = (-ranked['weight'].abs()).to_numpy().argsort(kind='stable')
    return ranked.iloc[order].head(n).reset_index()

# Top features per entity, one frame per `wid` group stacked together.
per_entity = df.groupby('wid').apply(topfeat)

# Reshape so that each entity contributes a 'feat' row and a 'weight' row,
# with the rank positions as columns.
by_rank = per_entity.swaplevel().unstack()
by_entity = by_rank.swaplevel(axis=1).sort_index(axis=1)
tops = by_entity.T

# Replace the numeric entity ids in the outer index level by their titles.
entity_names = [ent_label[wid] for wid in tops.index.levels[0]]
tops.index = tops.index.set_levels(entity_names, level=0)

import seaborn as sns
cmap = sns.diverging_palette(h_neg=0, h_pos=230, s=90, l=60, as_cmap=True)

# Colour only the 'weight' rows; the 'feat' rows hold text.
weight_rows = pd.IndexSlice[pd.IndexSlice[:,'weight'],:]
tops.style.background_gradient(cmap=cmap, subset=weight_rows)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9
wid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Utrecht_(provincie),feat,provincie,geografie,baarn,waterschap,gemeentelijk,wakkerendijk,provincies,categorie,heuvelrug,monument
Utrecht_(provincie),weight,2.026356,1.091671,1.050475,1.034872,0.924649,0.920758,0.797283,0.759567,0.754640,0.730448
Utrecht_(stad),feat,utrecht,stad,provincie,schilderij,nederlands,binnenstad,museum,straat,oudegracht,evenement
Utrecht_(stad),weight,1.303295,1.053762,-1.021294,0.956520,0.950493,0.889732,0.883750,0.714924,0.701241,0.673608
Utrecht_(Zuid-Afrika),feat,categorie,nederlands,rotterdammers,zuid,type,republiek,is,of,januari,brug
Utrecht_(Zuid-Afrika),weight,-0.595000,-0.383369,-0.372808,0.362396,0.358189,0.343506,-0.339488,-0.330202,-0.313993,-0.305968
FC_Utrecht,feat,categorie,volksvertegenwoordiging,eibert,club,voormalig,fc,roelandszoon,seizoen,stad,contract
FC_Utrecht,weight,-0.498340,-0.460120,-0.438620,0.432645,0.400912,0.384222,-0.371156,0.355233,-0.321003,0.301699
Universiteit_Utrecht,feat,provincie,universiteit,universiteiten,hoogleraar,bisschop,plaats,gemeente,studenten,leiden,groningen
Universiteit_Utrecht,weight,-0.675694,0.647986,0.618577,0.566419,-0.516619,-0.468076,-0.412781,0.408960,0.408899,0.405986
