In [1]:
from collections import defaultdict, deque
import networkx as nx
import statistics
import numpy as np

In [2]:
categories = {}
with open('wiki-topcats-categories.txt', 'r') as f:    
    for i in f:
        i = i.strip('').lstrip('Category:').replace(';','').split()
        if len(i)-1 >=3500:
            key = i[0]
            value = list(map(int, i[1:]))
            categories.update({key:value})
        else:
            pass

Making the list of page names:

In [3]:
with open('wiki-topcats-page-names.txt', 'r') as f:
    pagenames = [' '.join(i.split()) for i in f]

In [4]:
pagenames[0:10]

['0 Chiasmal syndrome',
 '1 Kleroterion',
 '2 Pinakion',
 '3 LyndonHochschildSerre spectral sequence',
 "4 Zariski's main theorem",
 '5 FultonHansen connectedness theorem',
 "6 Cayley's ruled cubic surface",
 '7 Annulus theorem',
 "8 Bing's recognition theorem",
 '9 BochnerMartinelli formula']

In [5]:
with open('wiki-topcats-reduced.txt', 'r') as f:
    connections = [tuple(map(int, i.strip().split())) for i in f]

In [6]:
connections[0:10]

[(52, 401135),
 (52, 1069112),
 (52, 1163551),
 (62, 12162),
 (62, 167659),
 (62, 279122),
 (62, 1089199),
 (62, 1354553),
 (62, 1400636),
 (62, 1403619)]

In [7]:
nodes = list(set([node for connection in connections for node in connection]))
nodes

[1048576,
 1048577,
 1048578,
 1048579,
 1048582,
 1048583,
 1048584,
 1048585,
 1048586,
 1048587,
 1048588,
 1048589,
 1048590,
 1048592,
 1048596,
 1048601,
 1048603,
 1048610,
 1048618,
 1048620,
 1048621,
 1048624,
 52,
 1048628,
 1048629,
 1048631,
 1048632,
 1048633,
 1048636,
 1048637,
 62,
 1048638,
 64,
 1048639,
 66,
 1048642,
 1048641,
 1048643,
 1048644,
 1048647,
 1048648,
 1048649,
 74,
 1048650,
 1048645,
 1048653,
 1048654,
 1048651,
 1048656,
 1048658,
 1048659,
 1048660,
 1048661,
 1048662,
 1048663,
 1048670,
 95,
 1048672,
 96,
 1048674,
 1048675,
 1048677,
 103,
 104,
 105,
 1048680,
 107,
 108,
 1048683,
 1048686,
 112,
 113,
 1048689,
 1048688,
 117,
 1048695,
 122,
 1048700,
 126,
 1048703,
 1048706,
 133,
 134,
 1048709,
 136,
 137,
 1048713,
 1048710,
 1048715,
 1048720,
 1048721,
 1048722,
 1048723,
 1048725,
 153,
 154,
 155,
 156,
 1048731,
 158,
 1048735,
 160,
 159,
 1048730,
 163,
 1048733,
 1048734,
 166,
 167,
 1048743,
 1048745,
 1048741,
 1048747,
 

Constructing undirected graph dictionary

In [8]:
graph = defaultdict(set)
for i in connections:
    graph[i[0]].add(i[1])
    if i[1] not in graph:
        graph[i[1]] = set()    

In [9]:
graph

defaultdict(set,
            {52: {401135, 1069112, 1163551},
             401135: {60219,
              167532,
              400980,
              401018,
              401019,
              401053,
              401067,
              401137,
              401154,
              401171,
              401184,
              401227,
              401231,
              401295,
              401310,
              401315,
              401457,
              401474,
              401505,
              401609,
              401628,
              401975,
              401981,
              402265,
              402300,
              402715,
              402718,
              447882,
              595633,
              606279,
              630946,
              723911,
              724192,
              776478,
              809904,
              810461,
              824998,
              827334,
              946986,
              961942,
              1058269,
              1060341,
     

In [10]:
G = nx.Graph()

In [11]:
for key, value in graph.items():
    G.add_node(key)
    for attr in value:
        G.node[key][attr] = pagenames[attr] # insert page name as attribute for node

In [12]:
for connection in connections:
    G.add_edge(connection[0], connection[1])

In [13]:
nx.info(G)

'Name: \nType: Graph\nNumber of nodes: 461193\nNumber of edges: 2174451\nAverage degree:   9.4297'

In [14]:
G.node[1032]

{1061891: '1061891 Jodie Foster',
 788645: '788645 Steve Jurvetson',
 1060976: '1060976 Contact (film)',
 279122: '279122 United States',
 866495: '866495 Margaret Turnbull'}

In [15]:
for category, articles in categories.items():
    print(category,len(articles))

English_footballers 9237
The_Football_League_players 9467
Association_football_forwards 6959
Association_football_goalkeepers 3997
Association_football_midfielders 8270
Association_football_defenders 6668
Living_people 418223
Year_of_birth_unknown 3760
Harvard_University_alumni 6154
Major_League_Baseball_pitchers 6580
Members_of_the_United_Kingdom_Parliament_for_English_constituencies 6546
Indian_films 5913
Year_of_death_missing 7851
English_cricketers 3813
Year_of_birth_missing_(living_people) 34721
Rivers_of_Romania 7729
Main_Belt_asteroids 13704
Asteroids_named_for_people 5701
English-language_albums 4853
English_television_actors 3501
British_films 4551
English-language_films 22699
American_films 15302
Fellows_of_the_Royal_Society 3697
People_from_New_York_City 4888
American_Jews 3542
American_television_actors 11661
American_film_actors 13938
Debut_albums 8401
Black-and-white_films 12174
Year_of_birth_missing 7237
Place_of_birth_missing_(living_people) 6767
Article_Feedback_Pilot 

In [16]:
C0 = input()

Article_Feedback_Pilot


In [17]:
C0_list = categories.get(C0)
C0_list[0:10]

[4500, 4924, 5256, 5313, 5345, 5413, 5570, 5759, 5763, 5803]

In [50]:
def bfs(graph, start, goal):
    if start == goal:
        return 0

    queue = deque([start])

    # dict which holds parents, later helpful to retreive path.
    # Also useful to keep track of visited node
    parent = {}
    parent[start] = start

    while queue:
        currNode = queue.popleft()
        for neighbor in graph[currNode]:
            # goal found
            if neighbor == goal:
                parent[neighbor] = currNode
                path = [goal]
                while goal != start:
                    goal = parent[goal]
                    path.insert(0, goal)
                    shortest = len(path)-1
                return shortest
            # check if neighbor already seen
            if neighbor not in parent:
                parent[neighbor] = currNode
                queue.append(neighbor)
    return np.inf # no path found

In [44]:
C0_list[:3]

[4500, 4924, 5256]

In [45]:
type(statistics.median([4,21,2,1,35]))

int

In [51]:
s = time.time()
median_list = []
bfs_path = []
alist = categories.get('Article_Feedback_Pilot')
for article in C0_list[:10]:
    for article1 in alist[:10]:
        try:
            distance = bfs(G,article,article1)
        except:
            distance = np.inf
        bfs_path.append(distance)
median = statistics.median(bfs_path)
component = (category,median)
median_list.append(component)
print(time.time()-s)

41.562336444854736


In [None]:
bfs(G,article,article1)

In [39]:
median_list

[('Windows_games', 1)]

In [None]:
for category in categories.keys():
    if category is not C0:
        print(category)
        bfs_path = []
        alist = categories.get(category)
        for article in C0_list:
            for article1 in alist:   
                try:
                    distance = bfs(G,article,article1)
                except:
                    distance = 0
                bfs_path.append(distance)
        median = median(bfs_path)
        component = tuple(category,median)
        median_list.append(component)
    else:
        pass
       

In [None]:
a = bfs(G,19757,18840)
len(a)

In [None]:
a = np.inf
len(a)