In [11]:
%matplotlib inline

# This script downloads and reformats the wikipedia link dataset found at: http://snap.stanford.edu/data/index.html

Aim: to give each page a domain-level label, in addition to category level. Done by querying the wikipedia api for each category, and choosing domain to confer on submembers.

In [12]:
import os, sys, gc
import numpy as np
import pickle
import wikipediaapi
import matplotlib.pyplot as plt
from collections import Counter

In [13]:
wDir = '../../..'
dataDir = '{0}/WIKIdata'.format(wDir)
os.listdir(dataDir)

['pages_to_categories2020.p',
 'wiki-topcats.txt',
 'wiki-topcats-categories.txt',
 '.DS_Store',
 'categories_to_domains2020.p',
 'catLens2020.npy',
 'catLens2020all.npy',
 'matrices2020',
 'categories_to_names2020.p',
 'wiki-topcats-page-names.txt',
 'categories_to_categories2020.p',
 'domains_to_names2020.p',
 'domains2020.npy',
 'figs2020']

In [14]:
linkFile = '{0}/wiki-topcats.txt'.format(dataDir)

categoryFile = '{0}/wiki-topcats-categories.txt'.format(dataDir)


with open('{0}/pages_to_categories2020.p'.format(dataDir), 'rb') as pfile:
    pagesCatsMappings = pickle.load(pfile)

with open('{0}/categories_to_names2020.p'.format(dataDir), 'rb') as pfile:
    categoryNames = pickle.load(pfile)

with open('{0}/domains_to_names2020.p'.format(dataDir), 'rb') as pfile:
    domainNames = pickle.load(pfile)


with open('{0}/categories_to_domains2020.p'.format(dataDir), 'rb') as pfile:
    catDomMappings = pickle.load(pfile)

categories = np.int_(list(catDomMappings.keys()))
example_pages = np.int_(list(pagesCatsMappings.keys()))
print(example_pages[:5])
print([(x, categoryNames[x]) for x in categories[:5]])
print('numCats: ', categories.shape)
r = 1791488 # number of nodes (see SNAP website or count as below)

[52 55 56 57 60]
[(5, 'Laboulbeniomycetes'), (13, 'Euphorbia'), (32, 'RNA'), (33, 'Anatomy'), (34, 'History_of_medicine')]
numCats:  (2033,)


In [15]:
linkDict = {}
for category in categories:
    linkDict[category] = []
validEdges = 0
validEndpoints = 0
#print(linkDict)

In [16]:
with open(linkFile, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print('at line: {0}'.format(i))
#         if i > 1000000:
#             print('breaking')
#             break
        edge = line.split(' ')
        edge[0] = np.int(edge[0])
        edge[-1] = np.int(edge[-1].replace('\n', ''))
        
        try:
            pagesCatsMappings[edge[0]]
            validEndpoints += 1
        except:
            continue
        try:
            pagesCatsMappings[edge[1]]
            validEndpoints += 1
        except:
            continue
        try:
            pagesCatsMappings[edge[0]]
            pagesCatsMappings[edge[1]]
            #print('both endpoints checkout')
            validEdges += 1
            targetCats = [x for x in pagesCatsMappings[edge[1]]]
            baseCats = [x for x in pagesCatsMappings[edge[0]]]
            for bCat in baseCats:
                for tCat in targetCats:
                    linkDict[bCat].append(tCat)
            
        except Exception as e:
            print(e)
            continue

at line: 0
at line: 1000000
at line: 2000000
at line: 3000000
at line: 4000000
at line: 5000000
at line: 6000000
at line: 7000000
at line: 8000000
at line: 9000000
at line: 10000000
at line: 11000000
at line: 12000000
at line: 13000000
at line: 14000000
at line: 15000000
at line: 16000000
at line: 17000000
at line: 18000000
at line: 19000000
at line: 20000000
at line: 21000000
at line: 22000000
at line: 23000000
at line: 24000000
at line: 25000000
at line: 26000000
at line: 27000000
at line: 28000000


In [17]:
print(validEndpoints)
print(validEdges)

print([(x, categoryNames[x]) for x in baseCats])
print([(x, categoryNames[x]) for x in targetCats])

print([linkDict[x] for x in list(linkDict.keys())[:3]])
#print(linkDict)



19757448
7113711
[(1047, 'Living_people')]
[(814, 'Association_football_terminology')]
[[7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5, 7975, 7975, 5, 5,

In [18]:

with open('{0}/categories_to_categories2020.p'.format(dataDir), 'wb') as pfile:
    pickle.dump(linkDict, pfile)        