# Getting Started

First, install the library with extras to train models:
```sh
pip install -e "git+https://github.com/bennokr/minimel.git#egg=minimel[train]"
```

In [2]:
wiki = 'iawiki-latest' # use Interlingua language Wikipedia version to test
root = 'wiki/' + wiki
!mkdir -p $root
!wikimapper download $wiki --dir $root
outdb = f'{root}/index_{wiki}.db'
!wikimapper create $wiki --dumpdir $root --target $outdb

2024-03-06 09:59:56,309 - wikimapper.download - INFO - [wiki/iawiki-latest/iawiki-latest-page.sql.gz] already exists, skipping downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-page.sql.gz]!
2024-03-06 09:59:56,309 - wikimapper.download - INFO - [wiki/iawiki-latest/iawiki-latest-page_props.sql.gz] already exists, skipping downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-page_props.sql.gz]!
2024-03-06 09:59:56,309 - wikimapper.download - INFO - [wiki/iawiki-latest/iawiki-latest-redirect.sql.gz] already exists, skipping downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-redirect.sql.gz]!
2024-03-06 09:59:56,588 - wikimapper.processor - INFO - Creating index for [iawiki-latest] in [wiki/iawiki-latest/index_iawiki-latest.db]
2024-03-06 09:59:56,591 - wikimapper.processor - INFO - Parsing pages dump
2024-03-06 09:59:56,983 - wikimapper.processor - INFO - Creating database index on 'wikipedia_title'
2024-03-06 09:59:57,035 - wikimapper

In [3]:
!minimel -v index $outdb

Loading mapping...: 100%|█████████████| 34465/34465 [00:00<00:00, 628081.33it/s]
INFO:root:Building IntDAWG trie...
INFO:root:Saving to wiki/iawiki-latest/index_iawiki-latest.dawg...


In [4]:
!wget -P $root https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-pages-articles.xml.bz2
!bunzip2 $root/iawiki-latest-pages-articles.xml.bz2

--2024-03-06 10:00:00--  https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.71, 2620:0:861:3:208:80:154:71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11275975 (11M) [application/octet-stream]
Saving to: ‘wiki/iawiki-latest/iawiki-latest-pages-articles.xml.bz2’


2024-03-06 10:00:04 (3.67 MB/s) - ‘wiki/iawiki-latest/iawiki-latest-pages-articles.xml.bz2’ saved [11275975/11275975]

bunzip2: Output file wiki/iawiki-latest/iawiki-latest-pages-articles.xml already exists.


In [34]:
dump = f'{root}/{wiki}-pages-articles.xml'
dawg = f'{root}/index_{wiki}.dawg'
!minimel -v get-paragraphs -n 100 $dump $dawg 

INFO:root:Finished in 12s################] | 100% Completed | 12.9s[2K
INFO:root:Wrote 100 partitions


In [6]:
!minimel -v get-disambig -n 100 -d disambiguation $dump $dawg

INFO:root:Using disambiguation template disambiguation, not None
INFO:root:Extracting disambiguation links...
INFO:root:Finished in 13s################] | 100% Completed | 13.5s[2K
INFO:root:Writing to wiki/iawiki-latest/disambig.json


In [7]:
paragraphlinks = f'{root}/{wiki}-paragraph-links/'
!minimel -v count $paragraphlinks

INFO:root:Counting links...
INFO:root:Finished in 2s#################] | 100% Completed |  2.5s[2K
INFO:root:Got 32490 counts.
INFO:root:Aggregating...
INFO:root:Finished in 2s#################] | 100% Completed |  2.2s[2K
INFO:root:Writing to wiki/iawiki-latest/count.min2.json


In [17]:
import shlex, urllib, json

# Get Wikidata IDs for disambiguation and list articles
lang = wiki.split('wiki')[0]
query = urllib.parse.quote("""
SELECT DISTINCT ?s WHERE {
  {?s wdt:P31 wd:Q4167410 .} # disambig
  UNION
  {?s wdt:P31 wd:Q13406463 .} # list
  UNION
  {?s wdt:P360 ?l . } # list of

  ?page schema:about ?s .
  ?page schema:inLanguage "%s" .
}
""" % lang)
url = shlex.quote(f"https://query.wikidata.org/sparql?format=json&query={query}")
badent = f'{root}/badent.json'
!wget -O $badent $url

bindings = json.load(open(badent)).get('results', []).get('bindings', [])
qids = [b.get('s', {}).get('value', '')[31:] for b in bindings]
badent = f'{root}/badent.txt'
with open(badent, 'w') as fw:
    print('\n'.join(qids), file=fw)

--2024-03-06 10:10:14--  https://query.wikidata.org/sparql?format=json&query=%0ASELECT%20DISTINCT%20%3Fs%20WHERE%20%7B%0A%20%20%7B%3Fs%20wdt%3AP31%20wd%3AQ4167410%20.%7D%20%23%20disambig%0A%20%20UNION%0A%20%20%7B%3Fs%20wdt%3AP31%20wd%3AQ13406463%20.%7D%20%23%20list%0A%20%20UNION%0A%20%20%7B%3Fs%20wdt%3AP360%20%3Fl%20.%20%7D%20%23%20list%20of%0A%0A%20%20%3Fpage%20schema%3Aabout%20%3Fs%20.%0A%20%20%3Fpage%20schema%3AinLanguage%20%22ia%22%20.%0A%7D%0A
Resolving query.wikidata.org (query.wikidata.org)... 185.15.59.224, 2a02:ec80:300:ed1a::1
Connecting to query.wikidata.org (query.wikidata.org)|185.15.59.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/sparql-results+json]
Saving to: ‘wiki/iawiki-latest/badent.json’

    [ <=>                                   ] 55,974      --.-K/s   in 0.001s  

2024-03-06 10:10:14 (52.4 MB/s) - ‘wiki/iawiki-latest/badent.json’ saved [55974]



In [18]:
disambigfile = f'{root}/disambig.json'
countfile = f'{root}/count.min2.json'
!minimel -v clean -b $badent $outdb $disambigfile $countfile

Counting entities...: 100%|███████████| 11447/11447 [00:00<00:00, 294410.67it/s]
INFO:root:Removing 130 bad entities
Loading labels...: 100%|██████████████| 34465/34465 [00:00<00:00, 196894.89it/s]
Filtering names...: 100%|██████████████| 11387/11387 [00:00<00:00, 69138.81it/s]
INFO:root:Filtering out 1 bad names
INFO:root:Keeping 11386 good names
INFO:root:Writing to wiki/iawiki-latest/clean.json


In [19]:
cleanfile = f'{root}/clean.json'
!minimel -v vectorize $paragraphlinks $cleanfile

INFO:root:Vectorizing training examples for 286 ambiguous names
INFO:root:Writing to wiki/iawiki-latest/vec.clean.dat.parts
INFO:root:Finished in 1s#################] | 100% Completed |  1.1s[2K
INFO:root:Wrote 34 partitions
INFO:root:Concatenating to wiki/iawiki-latest/vec.clean.dat
Concatenating: 100%|██████████████████████████| 34/34 [00:00<00:00, 4602.88it/s]


In [21]:
vecfile = f'{root}/vec.clean.dat'
!minimel -v train $vecfile

INFO:root:Writing to wiki/iawiki-latest/model.20b.vw
creating quadratic features for pairs: ls sf
final_regressor = wiki/iawiki-latest/model.20b.vw
creating cache_file = wiki/iawiki-latest/vec.clean.dat.cache
Reading datafile = wiki/iawiki-latest/vec.clean.dat
num sources = 1
Num weight bits = 20
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
Enabled learners: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger
Input label = CS
Output pred = SCALARS
average  since         example        example        current        current  current
loss     last          counter         weight          label        predict features
0.000000 0.000000            1            1.0        unknown              0     1330
0.000000 0.000000            2            2.0        unknown              0      344
0.000000 0.000000            4            4.0        unknown              0      364
0.250000 0.500000            8            8.0          known            183      712
0.18

In [41]:
modelfile = f'{root}/model.20b.vw'
!minimel -v run --evaluate -o /dev/null $dawg $cleanfile $modelfile $paragraphlinks/*

Predicting: 100%|███████████████████████| 59134/59134 [00:16<00:00, 3681.70it/s]
INFO:root:,,0
micro,precision,0.9081185405247082
micro,recall,0.9081185405247082
micro,fscore,0.9081185405247082
macro,precision,0.9232628882555252
macro,recall,0.9058898794831253
macro,fscore,0.911819107954825
,support,191192.0

