In [1]:
import os, sys, gc
import pickle
import numpy as np

# This script downloads and reformats the wikipedia link dataset found at: http://snap.stanford.edu/data/index.html

Run second.

1. Runs through included categories and stores array of pages that are associated with each included category.
2. Stores mapping that reindexes old category indices (their line number in `wiki-topcats-categories.txt`) to our new ones from `generate_domains.ipynb`.
3. Runs through pages and finds where page index is included in a category. Outputs these as a mapping from page index to new category index.
4. Generates a test case.

### Input
`wiki-topcats-categories.txt`
This file assigns categories to pages. Each line is a single string in the following format: `Category:Buprestoidea; 301 302 303`, where the category name is given first (with the `Category:` prefix and a trailing `;`), then a space-separated list of the indices of all pages that belong to that category.

`categories_to_names2020.p`
Dictionary that maps each new category index to category names.

### Output
`pages_to_categories2020.p`
Dictionary that maps each page idx to included category idxs, reindexed as in generate_domains.ipynb


In [2]:
# Working directory of the notebook; the dataset lives in its WIKIdata subfolder.
wDir = '.'
dataDir = wDir + '/WIKIdata'

# Show what is available in the data directory (displayed as the cell output).
os.listdir(dataDir)

['wiki-topcats.txt',
 'wiki-topcats-categories.txt',
 '.DS_Store',
 'numDomainsPerCatRaw.npy',
 'categories_to_domains2020.p',
 'matrices2020',
 'categories_to_names2020.p',
 'wiki-topcats-page-names.txt',
 'domains_to_names2020.p',
 'domains2020.npy',
 'figs2020']

In [3]:
# Number of raw pages in the dataset (see the SNAP website, or count them).
r = 1791488

# Path of the raw SNAP file that lists, per category, the pages assigned to it.
categoryFile = dataDir + '/wiki-topcats-categories.txt'

# Load the dictionary that maps (filtered) category indices to category names.
# NOTE: pickle.load executes arbitrary code from the file; only load files you produced yourself.
with open(dataDir + '/categories_to_names2020.p', 'rb') as pfile:
    categoryNames = pickle.load(pfile)

# Sorted (re)indices of the kept categories; they must be exactly 0..N-1 with no gaps.
categoryIdxs = np.sort(np.int_(list(categoryNames)))
assert np.array_equal(categoryIdxs, np.arange(len(categoryIdxs)))

# Names of the kept categories, positioned so that categories[k] is the name of new index k.
categories = np.asarray([categoryNames[idx] for idx in categoryIdxs])

print('number of categories after filtering: ', categories.shape)
print('categories[:5]', categories[:5])

number of categories after filtering:  (2079,)
categories[:5] ['Laboulbeniomycetes' 'Euphorbia' 'RNA' 'Anatomy' 'History_of_medicine']


In [4]:
# Convert the information in wiki-topcats-categories.txt into a list of arrays,
# keeping only the lines that describe kept categories.

# One (n_pages, 2) int64 array per kept category:
#   column 0 = old category index (line number in the file), column 1 = page index.
masterCatList = []

# Reindexing table: old category index -> array of new (filtered) category indices.
oldToNewCategoryMappings = {}

# Build the name -> new-index lookup once, instead of scanning the `categories`
# array (twice) for every line of the file. Values are index arrays so that the
# stored mapping has the same form as np.where(categories == name)[0].
newIdxsByName = {}
for newIdx, keptName in enumerate(categories):
    newIdxsByName.setdefault(str(keptName), []).append(newIdx)
newIdxsByName = {keptName: np.asarray(idxs, dtype=np.intp)
                 for keptName, idxs in newIdxsByName.items()}

# The file contains one category per line, followed by the pages assigned to it.
with open(categoryFile, 'r') as f:
    for i, line in enumerate(f):  # i is the old category index

        # print every 1000 categories, for monitoring
        if i % 1000 == 0:
            print('at category: {0}'.format(i))

        # Everything before the first ';' is the (prefixed) category name,
        # everything after it is the whitespace-separated list of page indices.
        header, _, pageField = line.partition(';')
        category_name = header.replace('Category:', '')

        newIdxs = newIdxsByName.get(category_name)
        if newIdxs is None:  # not a category we have kept
            continue
        print(category_name)

        # store the mapping from old category idx to new (reindexed) category idx
        oldToNewCategoryMappings[i] = newIdxs

        # split() with no argument drops the trailing newline and tolerates
        # repeated/trailing blanks, so a stray space no longer discards the category.
        try:
            pageIdxs = np.array([int(token) for token in pageField.split()], dtype=np.int64)
        except (ValueError, OverflowError) as e:
            # best effort: report the malformed line and move on to the next category
            print(e)
            continue

        # A category without any pages contributes no rows (previously this
        # case crashed on linkList[-1]).
        if pageIdxs.size == 0:
            continue

        # Integer array (indices are not floats): old category index | page index.
        linkArray = np.empty((pageIdxs.size, 2), dtype=np.int64)
        linkArray[:, 0] = i
        linkArray[:, 1] = pageIdxs
        masterCatList.append(linkArray)


at category: 0
Laboulbeniomycetes
Euphorbia
RNA
Anatomy
History_of_medicine
Bacteriology
Metabolism
Microbiology
Membrane_biology
Proteins
Cell_biology
Enzymes
DNA
Molecular_biology
Biotechnology
Immune_system
Immunology
Human_cells
Laboratory_techniques
Biochemistry
Developmental_biology
Digestive_system
Pediatrics
Stereochemistry
Functional_groups
Organic_chemistry
Chemical_reactions
Chlorides
Physical_chemistry
Potassium_compounds
IARC_Group_2B_carcinogens
Food_additives
Sodium_compounds
Sequenced_genomes
Physiology
Carboxylic_acids
Medical_Subject_Headings
Alkenes
Nutrition
Food_science
Pharmacology
Lactams
Amines
Medical_emergencies
Autoimmune_diseases
Eli_Lilly_and_Company
Medical_signs
Ketones
Alcohols
Biomolecules
Integral_membrane_proteins
Signal_transduction
Protein_domains
Protein_families
Cytokines
Aldehydes
Organic_reactions
Diabetes
Carbamates
Gene_expression
Molecular_genetics
Animal_virology
Bacteria
Disability
Epidemiology
Chemical_engineering
Protein_structure
Piperid

Anglo-Catholicism
Modernist_architects
Architectural_history
Indigenous_languages_of_the_Americas
Languages_of_the_United_States
Semantics
English_language
Linguistic_morphology
Grammar
Linguistics
Sociolinguistics
Rhetoric
Linguists
Language_education
Flemish_people
English_given_names
Portuguese_explorers
Peruvian_football_clubs
Peru_international_footballers
Spanish_explorers
Spanish_colonization_of_the_Americas
Rivers_of_Queensland
Rivers_of_New_South_Wales
Dufek_Coast
Satellite_television
Iraq_War
History_of_Iraq
Fertile_Crescent
Mesopotamian_deities
International_relations_theory
International_relations
at category: 5000
University_of_California,_Berkeley
Toyota_concept_vehicles
Linfield_F.C._players
Circuses
Districts_of_Yemen
Physical_oceanography
Evolutionary_biology
Evolutionary_biologists
Game_theory
Anglo-Indian_people
South_China_AA_footballers
Men's_magazines
North_Norfolk
Ocean_liners
Cruise_ships
Oil_tankers
Vesper_bats
Mammals_of_the_United_States
Zoology
Megafauna
Fau

American_Jews
Jewish_actors
Hollywood_blacklist
Anthology_television_series
Medical_television_series
Lists_of_monarchs
Lists_of_prime_ministers
Member_states_of_the_United_Nations
Republics
Film_soundtracks
Covers_albums
Concept_albums
Double_albums
Rock_and_Roll_Hall_of_Fame_inductees
Virgin_Records_artists
Warner_Bros._Records_artists
Mercury_Records_artists
Elektra_Records_artists
Columbia_Records_artists
Rock_ballads
Pop_ballads
at category: 12000
Arista_Records_artists
Theme_music
Districts_of_Cambodia
Lists_of_people_by_nationality
Communes_of_Mauritania
Ancient_Egyptians
Communes_of_Mali
Lakes_of_Switzerland
Municipalities_of_the_canton_of_Fribourg
Castles_in_Switzerland
Municipalities_of_the_canton_of_Vaud
Cities_in_Switzerland
Municipalities_of_the_canton_of_Solothurn
Municipalities_of_the_canton_of_Zurich
Albanian_people
Christmas_television_specials
German_poets
Eurovision_Song_Contest_winners
High_schools_in_California
Greek_Revival_architecture
National_Register_of_Histor

In [5]:
# Find which kept categories each page is in and build a mapping from page
# index to new (reindexed) category indices.

# Turn the list of per-category arrays into one array of shape
# (num_of_cat:page_mappings, 2): column 0 is the (old) category index,
# column 1 is the page index.
masterCatArray = np.vstack(masterCatList)
print(masterCatArray.shape)

# dictionary that maps page indices to new category indices
pages_to_categories = {}
# declared for compatibility with the original cell; it is not populated here
pages_to_old_categories = {}

# Work on integer copies of the two columns (the array may hold floats).
oldCatCol = masterCatArray[:, 0].astype(np.int64)
pageCol = masterCatArray[:, 1].astype(np.int64)

# Sort the rows by page index once. The sort is stable, so rows belonging to
# the same page keep their original (file) order, exactly as a per-page
# np.where scan over the array would return them.
rowOrder = np.argsort(pageCol, kind='stable')
sortedPages = pageCol[rowOrder]

# For every page index i, rows rowOrder[firstRow[i]:lastRow[i]] mention page i.
# This replaces one full scan of masterCatArray per page (r scans in total).
allPages = np.arange(r)
firstRow = np.searchsorted(sortedPages, allPages, side='left')
lastRow = np.searchsorted(sortedPages, allPages, side='right')

for i in allPages:
    # for monitoring, print every 10,000 pages
    if i % 10000 == 0:
        print('at {0}'.format(i))

    lo, hi = firstRow[i], lastRow[i]
    if hi > lo:  # page belongs to at least one kept category
        location = rowOrder[lo:hi]
        # old category index -> new category index/indices, concatenated per page
        # (built-in int() replaces np.int, which was removed from NumPy)
        pages_to_categories[i] = np.hstack(
            [oldToNewCategoryMappings[int(oldCatCol[x])] for x in location])

(939575, 2)
at 0
at 10000
at 20000
at 30000
at 40000
at 50000
at 60000
at 70000
at 80000
at 90000
at 100000
at 110000
at 120000
at 130000
at 140000
at 150000
at 160000
at 170000
at 180000
at 190000
at 200000
at 210000
at 220000
at 230000
at 240000
at 250000
at 260000
at 270000
at 280000
at 290000
at 300000
at 310000
at 320000
at 330000
at 340000
at 350000
at 360000
at 370000
at 380000
at 390000
at 400000
at 410000
at 420000
at 430000
at 440000
at 450000
at 460000
at 470000
at 480000
at 490000
at 500000
at 510000
at 520000
at 530000
at 540000
at 550000
at 560000
at 570000
at 580000
at 590000
at 600000
at 610000
at 620000
at 630000
at 640000
at 650000
at 660000
at 670000
at 680000
at 690000
at 700000
at 710000
at 720000
at 730000
at 740000
at 750000
at 760000
at 770000
at 780000
at 790000
at 800000
at 810000
at 820000
at 830000
at 840000
at 850000
at 860000
at 870000
at 880000
at 890000
at 900000
at 910000
at 920000
at 930000
at 940000
at 950000
at 960000
at 970000
at 980000
at 990000
at

In [6]:
# Spot-check the page -> category mapping on one randomly chosen page.

# choose a random page index among the pages that received a category
testChoice = np.random.choice(list(pages_to_categories.keys()))
print('test page number is {}'.format(testChoice))

# look up and print its new category indices and names
newCatIdxs = pages_to_categories[testChoice]
print(newCatIdxs)
for x in newCatIdxs:
    print(x)
    print('new category is: ({}, {})'.format(x, categoryNames[x]))


# Find the raw category lines that list this page and print them
# (N.B. some of these will not have been kept).
# The page index is matched as a whole token: a substring test on
# ' <idx> ' misses the page when it is the last index on a line, because it is
# then followed by a newline instead of a space.
testToken = '{0}'.format(testChoice)
rawNamesWithPage = set()
with open(categoryFile, 'r') as f:
    for i, line in enumerate(f):
        header, _, pageField = line.partition(';')
        if testToken in pageField.split():
            print('category name is: {0}; line: {1} '.format(header, i))
            rawNamesWithPage.add(header.replace('Category:', ''))

# Every category assigned to the page must be one of the raw categories that list it.
missingNames = [categoryNames[x] for x in newCatIdxs
                if categoryNames[x] not in rawNamesWithPage]
assert not missingNames, 'categories assigned but not found in raw file: {0}'.format(missingNames)

test page number is 162372
[220 280]
220
new category is: (220, Living_people)
280
new category is: (280, Polish_writers)
category name is: Category:Living_people; line: 1047 
category name is: Category:Polish_writers; line: 1783 
category name is: Category:People_from_Warsaw; line: 13983 


In [7]:
# Persist the page index -> new category indices dictionary next to the other 2020 artefacts.
with open(dataDir + '/pages_to_categories2020.p', 'wb') as pfile:
    pickle.dump(pages_to_categories, pfile)