In [1]:
import pandas as pd
import re
import numpy as np
import json
import networkx as nx

## Load raw SQL data from Wikipedia Dump

In [None]:
# From enwiki-20171001-category.sql
# cat enwiki-20171001-category.sql | python tocsv.py > enwiki-20171001-category.csv
category = pd.read_csv('./enwiki-20171001-category.csv', header=None, sep=',', encoding='latin1')
category.columns = ['cat_id', 'cat_title', 'cat_page', 'cat_subcats', 'cat_files']
# From enwiki-20171001-page.sql
# cat enwiki-20171001-page.sql | python tocsv.py > enwiki-20171001-page.csv
page = pd.read_csv('./enwiki-20171001-page.csv', header=None, sep=',', usecols=[0, 1, 2, 5, 12, 13], encoding='latin1')
page.columns = ['page_id', 'page_namespace', 'page_title',
                'page_is_redirect', 'page_content_model', 'page_lang']
# From enwiki-20171001-categorylinks.sql
# cat enwiki-20171001-categorylinks.sql | python tocsv.py --ignore_column 2 --ignore_column 3 --ignore_column 4 --ignore_column 5 > enwiki-20171001-categorylinks.csv
categorylinks = pd.read_csv('./enwiki-20171001-categorylinks.csv', header=None, sep=',', encoding='latin1')
categorylinks.columns = ['cl_from', 'cl_to', 'cl_type']
# cat enwiki-20171001-templatelinks.sql | python tocsv.py > enwiki-20171001-templatelinks.csv
templatelinks = pd.read_csv('./enwiki-20171001-templatelinks.csv', header=None, sep=',', encoding='latin1')
templatelinks.columns = ['tl_from', 'tl_namespace', 'tl_title', 'tl_from_namespace']

In [None]:
templatelinks[templatelinks.tl_title.apply(lambda x: str(x).startswith('Tracking'))].tl_title.unique()

## Clean up categories

According to https://en.wikipedia.org/wiki/Wikipedia:Categorization, there are a few non-content categories with the following prefixes:

* `Wikipedia`
* `WikiProject`

Besides these, we also need to remove categories with template `Wikipedia_category` and `Maintenance_category` and its descendants according to https://en.wikipedia.org/wiki/Template:Maintenance_category. Note that `Template:Wikipedia_category` redirects to the same template page.

So here we will first find all templates inherit from these two category templates, and then remove all categories using these templates. 

Then we will check if any category with the abovementioned prefix still exists.

### Remove all categories that use `Wikipedia Category` and `Maintenance Category` as templates

In [4]:
def find_wikipedia_category_templates(templatelink_df, page_df):
    # Two basic templates
    selected_templates = set(['Wikipedia_category', 'Maintenance_category', 'Tracking_category', 'Trackingcat', 'Tracking_category/doc', 'Tracking_cat'])
    # Do BFS here to find all templates using these two root templates
    curr_parent_templates = selected_templates.copy()
    template_page_df = page_df[page_df.page_namespace == 10]  # limit namespace to template
    template_df = templatelink_df[templatelink_df.tl_from_namespace == 10]  # limit namespace to template
    print("all non-template elements are removed")
    while len(curr_parent_templates):
        # all template ids inherit from curr_parent_templates
        next_template_ids = template_df[template_df.tl_title.isin(curr_parent_templates)].tl_from.unique()
        next_templates = template_page_df[template_page_df.page_id.isin(next_template_ids)].page_title.unique()
        selected_templates |= curr_parent_templates
        # Remove seen templates
        template_df = template_df[~template_df.tl_title.isin(curr_parent_templates)]
        # update template
        curr_parent_templates = set(next_templates)
        print("find %d new non-content template" % len(curr_parent_templates))
    print("In total we find %d wikipedia templates." % len(selected_templates))
    cate_template_df = templatelink_df[templatelink_df.tl_from_namespace == 14]
    wiki_category = cate_template_df[cate_template_df.tl_title.isin(selected_templates)].tl_from
    print("In total we find %d wikipedia categories." % len(wiki_category))
    content_category = page_df[page_df.page_namespace==14]
    content_category = content_category[~content_category.page_id.isin(wiki_category)]
    return content_category[['page_id', 'page_title']]
    
content_categories = find_wikipedia_category_templates(templatelinks, page)

all non-template elements are removed
find 48 new non-content template
find 31 new non-content template
find 8 new non-content template
find 0 new non-content template
In total we find 61 wikipedia templates.
In total we find 52646 wikipedia categories.


### Remove other categories

* Year related categories `1990s_animated_films`, `Conflicts_in_1954`, `October_2017_events`, `1990_deaths`, ...
* Other remaining categories with name `Wikipedia` or `WikiProject`
* Other remaining stub categories

In [5]:
# Remove all categories that grouped by years
YEAR_REGEX=re.compile(r'^\d+(s|AD|BC)?\_(birth|deaths|establishments|disestablishments|books|by|events|in|km|works|conflicts|elections)')
YEAR_REGEX_2=re.compile(r'^\d{4}(s)?\_')  # 1990s_animated_films
YEAR_REGEX_3=re.compile(r'.*\_\d{4}$')  # Conflicts_in_1954
# October_2017_events
YEAR_REGEX_4=re.compile(r'^(January|February|March|April|May|June|July|August|September|October|November|December)\_\d{4}')
content_categories = content_categories[content_categories.page_title.apply(lambda x: YEAR_REGEX.match(x) is None and YEAR_REGEX_2.match(x) is None and YEAR_REGEX_3.match(x) is None and YEAR_REGEX_4.match(x) is None)]
print("After removing all year related categories, there are %d categories left." % len(content_categories))

# Remove all categories starts with Wikipedia
WIKI_REGEX=re.compile(r'^(wikipedia|wikiproject)(\_|\-)')
content_categories = content_categories[content_categories.page_title.apply(lambda x: WIKI_REGEX.match(x.lower()) is None and ('wikipedia' not in x.lower()) and ('wikiproject' not in x.lower()))]
print("After removing all Wikipedia related categories, there are %d categories left." % len(content_categories))

# Remove stub categories and container categories
content_categories = content_categories[content_categories.page_title.apply(lambda x: not x.lower().startswith('stub_categories') and not x.lower().startswith('container_categories') and ('stub' not in x.lower()))]
print("After removing all stub categories, there are %d categories left." % len(content_categories))

# Remove something_by_something cateogires, e.g. 
content_categories = content_categories[content_categories.page_title.apply(lambda x: '_by_' not in x.lower() )]# and '_in_' not in x.lower())]
print("After removing all *by*/*in* categories, there are %d categories left." % len(content_categories))

# Remove chronology
DIGIT=re.compile('(?<=[^\(])\d{4}')
DIGIT2=re.compile('^\d+(s)?$')
DIGIT3=re.compile('^\d+_BC$')
# def digit_validation(x):
#     digs = DIGIT.findall(str(x))
#     return len(digs) == 1 and (int(digs[0]) <= 2020) and (int(digs[0]) >= 1000)
content_categories = content_categories[content_categories.page_title.apply(lambda x: len(DIGIT.findall(x)) == 0 and len(DIGIT2.findall(x)) == 0 and (DIGIT3.match(x) is None) and x != 'Years' and x != 'Decades')]
print("After removing all 4-digit years, there are %d categories left." % len(content_categories))

# Remove template and archive links
content_categories = content_categories[content_categories.page_title.apply(lambda x: '_template_' not in str(x).lower() and 'webarchive' not in str(x).lower() and 'all_articles' not in str(x).lower() and 'external_links' not in str(x).lower() and 'cs1_errors' not in x.lower() and not x.startswith('CS1_') and not x.startswith('Articles_') and not x.startswith('All_') and 'disambiguation' not in x.lower())]
print("After removing template and archive categories, there are %d categories left." % len(content_categories))


After removing all year related categories, there are 1305682 categories left.
After removing all Wikipedia related categories, there are 1252749 categories left.
After removing all stub categories, there are 1235610 categories left.
After removing all *by*/*in* categories, there are 1140758 categories left.
After removing all 4-digit years, there are 1117524 categories left.
After removing template and archive categories, there are 1115479 categories left.


## Generate category network
After we cleaned up categories, now we construct the category network using the `categorylinks`. We will need to first extract category_to_category edges first, then convert page_id (`tl_from`) into titles, and then only keep the ones within our `categorylinks`.

In [6]:
cate_links = categorylinks[categorylinks.cl_type == 'subcat'] # only cate-to-cate links
cate_page = page[page.page_namespace == 14]  # only cates are source
# convert category ids into category names
cate_links = cate_links.merge(page, left_on='cl_from', right_on='page_id')[['page_title', 'cl_to']]
cate_links.columns = ['cl_from', 'cl_to']
content_category_set = set(content_categories.page_title.values)  # set of valid encyclopedia categories
# Get cate-cate links within valid encyclopedia categories
cate_links = cate_links[(cate_links.cl_from.isin(content_category_set)) & (cate_links.cl_to.isin(content_category_set))]

# Reindex all selected categories, output the new index into a file
content_category_id_pd = pd.DataFrame({'title':list(content_category_set), 'title_id':np.arange(len(content_category_set))})
content_category_id_pd.to_csv('./data/content_category_id.txt', sep=' ', index=False)

## Create `Networkx` category-to-category network

In [7]:
import networkx as nx

In [8]:
digraph = nx.DiGraph()
digraph.add_edges_from(cate_links.values)  # create graph using the filtered category-to-category links
revgraph = digraph.reverse()  # reversed graph, from large-topic -> small-topic

## Find nearest main topic ancestor for all categories

In [9]:
def find_nearest_main_topic_ancestor(g):
    parents = dict()
    for root_node in g.neighbors('Main_topic_classifications'):
        if root_node not in parents:
            parents[root_node] = dict()
        parents[root_node][root_node] = 0
        res = nx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length(g, root_node, cutoff=50)
        for k,v in res.items():
            if k not in parents:
                parents[k] = dict()
            parents[k][root_node] = v
    return parents

parents = find_nearest_main_topic_ancestor(revgraph)
del revgraph
# Convert result into a pandas dataframe
nearest_parent = {k:min(v, key=v.get) for k,v in parents.items()}
nearest_parent_pd = pd.DataFrame({'title':list(nearest_parent.keys()), 'label':list(nearest_parent.values())})

# Assign labels to the nodes in the graph
for node in digraph.nodes():
    try:
        lbl = nearest_parent[node]
    except KeyError:
        lbl = 'Unknown'
    digraph.nodes[node]['label'] = lbl

In [10]:
## Save NetworkX
nx.write_gpickle(digraph, './data/category_network.nx.pickle')

# Save the nearest main topic ancestor into a file
# Generate links between individual category and their nearest root category
# cate_to_root = nearest_parent_pd.merge(content_category_id_pd, left_on='label', right_on='title')[['title_x', 'title_id']]
# cate_to_root.columns = ['cl_from', 'cl_to_id']
# cate_to_root = cate_to_root.merge(content_category_id_pd, left_on='cl_from', right_on='title')[['title_id', 'cl_to_id']]
# cate_to_root.columns = ['cl_from_id', 'cl_to_id']
# cate_to_root.to_csv('./category_network.root.txt', sep=' ', index=False)

# output main topic colors
tableau_colors = ['31,119,180', '174,199,232', '255,127,14', '255,187,120', '44,160,44',
                  '152,223,138', '214,39,40', '255,152,150', '148,103,189', '197,176,213',
                  '140,86,75', '196,156,148', '227,119,194', '247,182,210', '127,127,127',
                  '199,199,199', '188,189,34', '219,219,141', '23,190,207', '158,218,229',
                  '65,68,81', '159,205,153']

parent_color = {k:v for k,v in zip(sorted(set([str(x) for x in nearest_parent.values()])), tableau_colors)}
parent_ids = {k:v for v,k in enumerate(parent_color.keys())}
# nearest_parent_id_pd['label_id'] = nearest_parent_id_pd.label.apply(lambda x: parent_ids[x])
# nearest_parent_id_pd[['title_id', 'label_id']].to_csv('./data/category_label.id.txt', sep=' ', index=False)

with open('./data/category_label.color.txt', 'w') as fout:
    for k,v in parent_color.items():
        r,g,b = [int(x) for x in v.split(',')]
        fout.write('%s %d %d %d\n' % (k, r,g,b))

## Find page and their categories

In [11]:
# Find all page -> ??? links
page_links = categorylinks[categorylinks.cl_type == 'page']

# Find all page -> category
selected_pages = page_links[page_links.cl_to.isin(content_category_set)].merge(page[page.page_namespace == 0][['page_id', 'page_title']], left_on='cl_from', right_on='page_id')[['page_title', 'cl_to']]
selected_pages.columns = ['cl_from', 'cl_to']
selected_pages=selected_pages.merge(content_category_id_pd, left_on='cl_to', right_on='title')[['cl_from', 'cl_to', 'title_id']]
selected_pages.columns=['cl_from', 'cl_to', 'cl_to_id']
# Add redirection pages
redirection = pd.read_csv('../enwiki-20171001-redirect.csv', header=None, sep=',', encoding='latin1', usecols=[0,1,2])
redirection.columns = ['rd_from', 'rd_namespace', 'rd_title']
redirect = redirection[redirection.rd_namespace == 0].merge(page, left_on='rd_from', right_on='page_id')[['page_title', 'rd_title']]
redirect = redirect.merge(selected_pages, left_on='rd_title', right_on='cl_from')[['page_title', 'cl_to', 'cl_to_id']]
redirect.columns = ['cl_from', 'cl_to', 'cl_to_id']
selected_pages = pd.concat([selected_pages, redirect])
selected_pages.drop_duplicates().to_csv('./data/page_category.txt', sep=' ', index=False)

In [12]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [1]:
import networkx as nx
import numpy as np
import pandas as pd

In [14]:
digraph = nx.read_gpickle('./data/category_network.nx.pickle').reverse(False)

labels = nx.get_node_attributes(digraph, 'label')
subgraphs = list()
for root_node in digraph.neighbors('Main_topic_classifications'):
    print("processing %s" % root_node)
    _subgraph = digraph.subgraph([x for (x,y) in labels.items() if y == labels[root_node]])
    print(_subgraph.number_of_nodes())
    # child -> parent
    _subtree = pd.DataFrame(np.asarray([[y,x] for (x,y) in nx.bfs_tree(_subgraph, root_node, False).edges()]), columns=['cl_from', 'cl_to'])
    # node -> depth
    _subdepth = pd.DataFrame(np.asarray([[x,y + 1] for (x,y) in nx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length(_subgraph, root_node, cutoff=100).items()]), columns=['title', 'depth'])
    
    _subedges = _subtree.merge(_subdepth, left_on='cl_from', right_on='title')[['cl_from', 'cl_to', 'depth']]
    _subedges.depth = _subedges.depth.astype(int)
    _subedges['label'] = root_node
    subgraphs.append(_subedges)
    subgraphs.append(pd.DataFrame([[root_node, 'Main_topic_classifications', 1, root_node]], columns=['cl_from', 'cl_to', 'depth', 'label']))
subtree = pd.concat(subgraphs)
category_id_pd = pd.read_csv('./data/content_category_id.txt', sep=' ')
subtree = subtree.merge(category_id_pd, left_on='cl_from', right_on='title')[['cl_from','title_id', 'cl_to', 'depth', 'label']]
subtree.columns = ['cl_from', 'cl_from_id', 'cl_to', 'depth', 'label']
subtree = subtree.merge(category_id_pd, left_on='cl_to', right_on='title')[['cl_from','cl_from_id', 'cl_to', 'title_id', 'depth', 'label']]
subtree.columns = ['cl_from', 'cl_from_id', 'cl_to', 'cl_to_id', 'depth', 'label']

# Save graph into files
for max_depth in range(1, subtree.depth.max() + 1):
    subtree[subtree.depth <= max_depth][['cl_from_id', 'cl_to_id', 'label', 'depth']].to_csv('./data/category_tree.d%d.txt'%max_depth, sep=' ', index=False)

processing Events
10090
processing Health
12194
processing Culture
166300
processing World
224798
processing Games
276
processing Nature
14428
processing Humanities
29103
processing Reference_works
751
processing Sports
51777
processing Life
15604
processing Science_and_technology
13825
processing History
66564
processing Philosophy
633
processing Politics
185603
processing Law
9387
processing Mathematics
2106
processing Matter
3595
processing Religion
10367
processing People
1637
processing Geography
49391
processing Arts
12835
processing Society
14358
