In [1]:
%matplotlib inline

# This script downloads and reformats the wikipedia link dataset found at: http://snap.stanford.edu/data/index.html

Aim: to give each page a domain-level label, in addition to category level. Done by querying the wikipedia api for each category, and choosing domain to confer on submembers.

In [2]:
import os, sys, gc
import pickle
import numpy as np
import wikipediaapi
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
wDir = '../../..'
dataDir = '{0}/WIKIdata'.format(wDir)
os.listdir(dataDir)

['pages_to_categories2020.p',
 'wiki-topcats.txt',
 'wiki-topcats-categories.txt',
 '.DS_Store',
 'categories_to_domains2020.p',
 'catLens2020.npy',
 'catLens2020all.npy',
 'matrices2020',
 'categories_to_names2020.p',
 'wiki-topcats-page-names.txt',
 'categories_to_categories2020.p',
 'domains_to_names2020.p',
 'domains2020.npy',
 'figs2020']

In [4]:

categoryFile = '{0}/wiki-topcats-categories.txt'.format(dataDir)

with open('{0}/categories_to_names2020.p'.format(dataDir), 'rb') as pfile:
    categoryNames = pickle.load(pfile)

with open('{0}/domains_to_names2020.p'.format(dataDir), 'rb') as pfile:
    domainNames = pickle.load(pfile)


with open('{0}/categories_to_domains2020.p'.format(dataDir), 'rb') as pfile:
    catDomMappings = pickle.load(pfile)

categories = np.int_(list(catDomMappings.keys()))
print('numCats: ', categories.shape)
r = 1791488 # number of nodes (see SNAP website or count as below)

numCats:  (2033,)


In [5]:
masterCatList = []

with open(categoryFile, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000 == 0:
            print('at category: {0}'.format(i))
        if i in categories:
            print(i)
            linkList = line.split(' ')[1:]
            linkList[0] = linkList[0].replace('\n', '')
            #print(linkList)
            linkList[-1] = linkList[-1].replace('\n', '')
            #print(linkList)
            linkArray = np.empty((len(linkList), 2))
            linkArray[:, 0] = i
            try:
                linkArray[:, 1] = np.int_(linkList)
                masterCatList.append(linkArray)
            except:
                continue
            #print(line)
            #print(linkList)
            #print(linkList[-1])
            #print(linkArray)
            #print(masterCatList)
            

f.close()

at category: 0
5
13
32
33
34
35
38
39
40
41
42
43
44
45
47
48
49
50
51
52
53
58
60
63
64
65
66
68
70
72
74
76
78
82
83
84
86
87
88
90
92
93
99
102
103
104
105
106
112
113
115
116
119
120
121
123
127
129
130
132
135
137
140
142
143
144
148
151
152
154
158
159
161
165
166
168
171
172
175
178
181
189
190
193
198
200
202
204
227
228
243
245
247
260
261
265
266
267
273
286
287
288
290
291
317
318
319
338
340
349
350
352
356
360
378
380
381
385
387
390
398
399
400
404
413
414
415
420
421
428
429
431
433
434
444
459
460
467
469
471
473
476
478
484
501
507
508
509
511
513
517
520
523
530
532
562
570
595
599
605
606
608
625
634
640
641
645
647
648
650
654
661
664
668
670
672
673
674
675
681
683
686
687
688
689
691
693
694
695
696
698
699
700
707
709
715
717
719
743
771
774
788
789
814
818
827
928
954
955
962
980
996
997
at category: 1000
1000
1003
1007
1012
1013
1015
1017
1026
1029
1047
1062
1072
1084
1096
1102
1105
1107
1109
1114
1157
1191
1194
1233
1282
1292
1293
1299
1302
1320
1329
1355
1358

14091
14095
14108
14116
14154
14155
14156
14158
14160
14161
14162
14163
14164
14165
14166
14188
14189
14193
14202
14205
14215
14223
14233
14238
14240
14255
14266
14268
14273
14280
14328
14329
14341
14345
14351
14352
14354
14362
14372
14383
14391
14392
14397
14429
14430
14431
14432
14433
14434
14439
14444
14445
14446
14447
14448
14449
14450
14454
14456
14466
14469
14480
14485
14489
14529
14530
14532
14554
14556
14557
14575
14597
14616
14635
14642
14711
14716
14728
14736
14770
14777
14778
14782
14795
14801
14804
14805
14808
14809
14811
14812
14819
14823
14830
14844
14847
14855
14929
14930
14931
14934
14935
14939
14940
14948
14976
14992
14994
14996
14997
14998
at category: 15000
15001
15002
15003
15010
15072
15118
15143
15146
15148
15162
15177
15217
15228
15235
15258
15267
15275
15279
15283
15286
15295
15323
15352
15423
15460
15471
15474
15485
15494
15543
15544
15548
15551
15607
15695
15745
15768
15825
15861
15862
15863
15865
15868
15869
15870
15871
15872
15874
15883
15890
15896
15904
159

In [6]:
#print(linkList)
#print(line)
#print(linkArray)
#print(masterCatList[0])
masterCatArray = np.vstack(masterCatList)
print(masterCatArray.shape)

pages_to_categories = {}
for i in np.arange(r):   #r:
    if i % 10000 == 0:
        print('at {0}'.format(i))
    location = np.where(masterCatArray[:, 1] == i)[0]
    if location.shape[0] > 0:
        #print('match at {0}. Locations: {1}'.format(i, location))
        pages_to_categories[i] = [np.int(masterCatArray[x, 0]) for x in location]

(925479, 2)
at 0
at 10000
at 20000
at 30000
at 40000
at 50000
at 60000
at 70000
at 80000
at 90000
at 100000
at 110000
at 120000
at 130000
at 140000
at 150000
at 160000
at 170000
at 180000
at 190000
at 200000
at 210000
at 220000
at 230000
at 240000
at 250000
at 260000
at 270000
at 280000
at 290000
at 300000
at 310000
at 320000
at 330000
at 340000
at 350000
at 360000
at 370000
at 380000
at 390000
at 400000
at 410000
at 420000
at 430000
at 440000
at 450000
at 460000
at 470000
at 480000
at 490000
at 500000
at 510000
at 520000
at 530000
at 540000
at 550000
at 560000
at 570000
at 580000
at 590000
at 600000
at 610000
at 620000
at 630000
at 640000
at 650000
at 660000
at 670000
at 680000
at 690000
at 700000
at 710000
at 720000
at 730000
at 740000
at 750000
at 760000
at 770000
at 780000
at 790000
at 800000
at 810000
at 820000
at 830000
at 840000
at 850000
at 860000
at 870000
at 880000
at 890000
at 900000
at 910000
at 920000
at 930000
at 940000
at 950000
at 960000
at 970000
at 980000
at 990000
at

In [7]:
print(pages_to_categories[57][0])
print(len(list(pages_to_categories.keys())))
print(len(set(pages_to_categories.keys())))
unpackedList = [item for sublist in list(pages_to_categories.values()) for item in sublist]
print(len(set(unpackedList)))

6356
764800
764800
2032


In [8]:
#print(pages_to_categories)
testChoice = np.random.choice(list(pages_to_categories.keys()))
print('random choice: {0}; category: {1}'.format(testChoice, pages_to_categories[testChoice]))

with open(categoryFile, 'r') as f:
    for i, line in enumerate(f):
        if ' {0} '.format(testChoice) in line:
            print('file name is: {0}; line: {1} '.format(line.split(';')[0], i))

            
print('mapped names are: ', [categoryNames[x] for x in pages_to_categories[testChoice]])
#print('domains are: {0}'.format([domainNames[x] for x in catDomMappings[pages_to_categories[testChoice]]]))

random choice: 642583; category: [6567, 6569]
file name is: Category:Procedural_programming_languages; line: 6567 
file name is: Category:Object-oriented_programming_languages; line: 6569 
mapped names are:  ['Procedural_programming_languages', 'Object-oriented_programming_languages']


In [9]:
with open('{0}/pages_to_categories2020.p'.format(dataDir), 'wb') as pfile:
    pickle.dump(pages_to_categories, pfile)