In [121]:
import tqdm
import pathlib
import re
import json

try:
    import dawg
except ImportError:
    import dawg_python as dawg

root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))

candidates = json.load(open('wiki/nlwiki-20220301/experiments/clean-q1.json'))
entities = set(int(c) for cs in candidates.values() for c in cs)

brackets = re.compile('\([^)]*\) ?')
phrase = re.compile('.{10,}?\. ')

descriptions = []
files = list(pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/').glob('*'))
for fname in tqdm.tqdm(files):
    prev = None
    for line in open(fname):
        page, _, text = line.split('\t', 2)
        if page != prev:
            text, _ = re.subn(brackets,'',text)
            m = re.match(phrase, text)
            if m:
                a,b = m.span()
                e = index.get(page)
                if e in entities:
                    descriptions.append((e,text[a:b]))
            prev = page

print(len(descriptions), 'descriptions')
with open('wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv', 'w') as fw:
    for e,d in descriptions:
        print(e,d, sep='\t', file=fw)

100%|██████████| 1000/1000 [00:11<00:00, 86.14it/s]


184022 descriptions


In [36]:
import duckdb

import pandas as pd
import json
import pathlib
import tqdm

try:
    import dawg
except ImportError:
    import dawg_python as dawg

root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))

def loads(x):
    try:
        return json.loads(x)
    except:
        return {}

root = pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/')
n = 0
texts, links = [], []
for fname in tqdm.tqdm(list(root.glob('*'))[:50]):
    df = pd.read_csv(fname, sep='\t', on_bad_lines='skip', names=['page', 'links', 'text'])
    df['page'] = df['page'].map(index.get).astype('Int64')
    df.index += n
    n += len(df)
    pairs = df.links.map(lambda x: list(loads(x).items())).explode()
    links.append( pd.DataFrame({'name':pairs.str[0], 'target':pairs.str[1]}) )
    df.drop(columns=['links'], inplace=True)
    texts.append( df )
texts = pd.concat( texts ).reset_index()
links = pd.concat( links ).reset_index()


100%|██████████| 50/50 [00:03<00:00, 14.29it/s]


In [118]:
duckdb.query(
f"""
select target, count(*) c from texts, links 
where texts.index=links.index and name == 'Utrecht'
group by target
order by c desc
limit 10
""").df()

Unnamed: 0,target,c
0,803,900
1,776,174
2,707767,24
3,261716,16
4,2677914,5
5,221653,5
6,575655,4
7,2679365,4
8,347488,3
9,85308316,2


In [122]:
import stopwordsiso as stopwords
import numpy as np

descfile = 'wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv'
e_desc = pd.read_csv(descfile, sep='\t', on_bad_lines='skip', header=None, index_col=0)[1].to_dict()

# ents = [11775750, 2595790] # Openbaar Ministerie
# ents = [29520, 148] # China
ents = [803, 776, 707767] # Utrecht

sample = duckdb.query(
f"""
select text, target from texts, links 
where texts.index=links.index and target in {str(ents)}
""").df().drop_duplicates()

from sklearn.feature_extraction.text import TfidfVectorizer
stop_words = list(stopwords.stopwords('nl'))

tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=10, max_df=.75, stop_words=stop_words)
X = tfidf.fit_transform(sample['text'])

feat_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

for e in ents:
    print(e, e_desc[e])
    wdocs = tfidf.inverse_transform(X[sample['target'] == e])
    wcount = pd.Series(wdocs).explode().value_counts()
    top_feats = pd.Series({
        w:np.log1p(c)*feat_weight[w]
        for w,c in wcount.to_dict().items()
    }).sort_values()[::-1][:10]
    print(top_feats)
    print()

803 Utrecht is een stad en gemeente in Nederland en de hoofdstad van de provincie Utrecht. 
werk                16.226999
den haag            16.168381
utrecht stad        16.106695
haag                16.104145
rotterdam           16.102599
universiteit        16.101863
hoogleraar          16.091765
stad utrecht        16.040620
museum              16.017313
nederlandse stad    15.991953
dtype: float64

776 Utrecht is met een landoppervlakte van 1.485 km² de op een na kleinste provincie van Nederland. 
provincie utrecht                15.735772
nederlandse provincie            15.461026
nederlandse provincie utrecht    15.311498
eemnes                           15.273796
provincie                        15.219166
eemnes provincie                 15.066386
eemnes provincie utrecht         15.066386
zuid holland                     14.828159
noord holland                    14.789730
holland utrecht                  14.663517
dtype: float64

707767 Het Sticht Utrecht was het territorium