In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from collections import OrderedDict
from tqdm import tqdm

In [2]:
df = pd.read_csv('wiki-topcats-reduced.txt', sep='\t', header = None)
df.columns = ['from', 'to']

In [3]:
#vertexes
fr = df.groupby('from').count()
to = df.groupby('to').count()
V = set(list(fr.index) + list(to.index))
print('Number of nodes =', len(V))

Number of nodes = 461193


In [4]:
#directed cause these two are different
print(df[df['from'] ==52], '\n')
print(df[df['from'] ==401135].head())

   from       to
0    52   401135
1    52  1069112
2    52  1163551 

          from      to
361754  401135   60219
361755  401135  167532
361756  401135  400980
361757  401135  401018
361758  401135  401019


In [5]:
print('Number of edges:',len(df.index))

Number of edges: 2645247


In [6]:
print('The average degree of the vertex is:', len(df.index)/len(V))

The average degree of the vertex is: 5.735661642739591


In [3]:
o = open('wiki-topcats-categories.txt', 'r')
categories = {}
for line in o :
    line = line.replace('\n','')
    l = line.split(' ')
    l[0] = l[0].replace('Category:', '').replace(';','')
    try:
        #print(len(l[1:]))
        if len(l[1:])>=3500:
            categories[l[0]] = list(map(int, l[1:]))
    except:
        pass


In [33]:
input_category = list(categories.keys())[7]
len(categories[input_category])

3760

In [5]:
DG = nx.from_pandas_edgelist(df, 'from', 'to', create_using=nx.DiGraph )

In [46]:
class Dijkstra:

    def dijkstra(self, dic, dist):
        node = list(dic.keys())[0]
        actual_dist = list(dic.values())[0]
        try:
            l = [list(self.graph[node]),1]
            while len(l[0]) > 0:
                dist = l[1]
                for i in l[0]:
                    m = []
                    if (i in self.visited):
                        if self.actual_node in self.visited[i]:
                            if (self.visited[i][self.actual_node] > dist):
                                self.visited[i][self.actual_node] = dist
                        else:
                            self.visited[i][self.actual_node] = dist

                    elif (self.unvisited[i] == -1):
                        del self.unvisited[i]
                        self.visited[i] = {self.actual_node:dist}

                    for j in list(self.graph[i]):
                        if (not(j in self.visited)):
                            m.append(j)
                        elif not(self.actual_node in self.visited[j]):
                            m.append(j)
                l = [m,dist+1]
        except:
            pass       
        
    
    def __init__(self, graph, categories, input_category):
        
        self.graph = graph
        self.nodes = categories
        self.initial = categories[input_category]
        self.unvisited = {}
        for i in self.graph.nodes:
            self.unvisited[i] = -1
        self.visited = {}
        self.l = []
        idx = 1
        for i in self.initial:
            idx+=1
            self.actual_node = i
            self.visited[i] = {self.actual_node: 0}
            try:
                del self.unvisited[i]
            except:
                pass
            self.dijkstra({self.actual_node:0},1)



In [47]:
dij = Dijkstra(DG, categories, input_category)

In [50]:
median = {}
set_of_visited = set(list(dij.visited.keys()))
for i in categories:
    if i != input_category:
        shortest_path=[]
        s = set(categories[i]).intersection(set_of_visited)
        if len(s)> 0:
            for j in s:
                shortest_path += (list(dij.visited[j].values()))
            median[i] = np.median(shortest_path)
        else:
            median[i] = 100**100
median[input_category] = -1

In [44]:
block_ranking = OrderedDict()
block_ranking = OrderedDict(sorted(median.items(), key=lambda x: x[1]))
block_ranking

OrderedDict([('Year_of_birth_unknown', -1),
             ('Year_of_death_missing', 2.0),
             ('Year_of_birth_missing', 2.0),
             ('English_footballers', 3.0),
             ('Association_football_forwards', 3.0),
             ('Association_football_goalkeepers', 3.0),
             ('Association_football_midfielders', 3.0),
             ('Association_football_defenders', 3.0),
             ('Indian_films', 3.0),
             ('English_cricketers', 3.0),
             ('Rivers_of_Romania', 3.0),
             ('British_films', 3.0),
             ('The_Football_League_players', 4.0),
             ('Article_Feedback_Pilot', 4.0),
             ('Fellows_of_the_Royal_Society', 5.0),
             ('Members_of_the_United_Kingdom_Parliament_for_English_constituencies',
              6.0),
             ('Black-and-white_films', 6.0),
             ('Harvard_University_alumni', 7.0),
             ('People_from_New_York_City', 7.0),
             ('American_military_personnel_of_World

In [16]:
# map each article to the categories it is present

articles = {}
for name in categories:
    for article in categories[name]:
        if article in articles:
            articles[article].append(name)
        else:
            articles[article] = [name]

In [17]:
# if an article belongs to multiple categories, choose one according to block_ranking

for article in articles:
    if len(articles[article]) > 1:
        minimum = ''
        for cat in articles[article]:
            if (minimum == '') or (block_ranking[minimum] > block_ranking[cat]):
                minimum = cat
        articles[article] = [minimum]

In [18]:
categories_after_ranking = {}

for i in articles:
    if articles[i][0] in categories_after_ranking:
        categories_after_ranking[articles[i][0]] +=  [i]
    else:
        categories_after_ranking[articles[i][0]] =  [i]


In [19]:
# VERSIONE MODIFICATA
# da quella di prima cambia come trovare gli archi adatti (jj)

idx = 0
jj = nx.to_dict_of_lists(DG)

for name in tqdm(block_ranking):
    print(name)
    if idx == 0:
        weight_dict = {}
        boh = DG.subgraph(categories_after_ranking[name])
        weight_dict[name] = {}
        for i in (boh.in_degree):
            for j in jj[i[0]]:
                    DG[i[0]][j]['weight'] = i[1]
            idx +=1
        for  i in boh.in_degree:
            weight_dict[name][i[0]] = i[1]
    else:
        try:
            boh = DG.subgraph(categories_after_ranking[name])
            weight_dict[name] = {}

            for i in boh.in_degree:
                cumsum = i[1]
                for j in jj[i[0]]:
                        try:
                            cumsum+=(list(DG.edges[j,i[0]].values()))[0]
                        except:
                            pass
                weight_dict[name][i[0]] = cumsum


            for i in boh.in_degree:
                for j in jj[i[0]]:
                        DG[i[0]][j]['weight'] = weight_dict[name][i[0]]
        except:
            pass

  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

Year_of_birth_unknown
Rivers_of_Romania


  6%|████▋                                                                              | 2/35 [00:00<00:04,  7.77it/s]

Article_Feedback_Pilot


  9%|███████                                                                            | 3/35 [00:01<00:11,  2.82it/s]

English_footballers


 11%|█████████▍                                                                         | 4/35 [00:01<00:10,  2.98it/s]

Association_football_goalkeepers


 14%|███████████▊                                                                       | 5/35 [00:01<00:08,  3.63it/s]

The_Football_League_players


 17%|██████████████▏                                                                    | 6/35 [00:01<00:06,  4.37it/s]

Main_Belt_asteroids


 20%|████████████████▌                                                                  | 7/35 [00:01<00:06,  4.44it/s]

Association_football_forwards


 23%|██████████████████▉                                                                | 8/35 [00:02<00:05,  5.18it/s]

Fellows_of_the_Royal_Society


 26%|█████████████████████▎                                                             | 9/35 [00:02<00:04,  5.33it/s]

Harvard_University_alumni


 29%|███████████████████████▍                                                          | 10/35 [00:02<00:05,  4.68it/s]

Association_football_midfielders


 31%|█████████████████████████▊                                                        | 11/35 [00:02<00:04,  5.10it/s]

Year_of_birth_missing


 34%|████████████████████████████                                                      | 12/35 [00:02<00:03,  5.93it/s]

American_military_personnel_of_World_War_II


 37%|██████████████████████████████▍                                                   | 13/35 [00:02<00:04,  5.44it/s]

American_Jews


 40%|████████████████████████████████▊                                                 | 14/35 [00:03<00:04,  4.89it/s]

Asteroids_named_for_people
Association_football_defenders
English_television_actors


 49%|███████████████████████████████████████▊                                          | 17/35 [00:03<00:03,  5.49it/s]

People_from_New_York_City


 51%|██████████████████████████████████████████▏                                       | 18/35 [00:03<00:04,  4.08it/s]

Year_of_death_missing
Members_of_the_United_Kingdom_Parliament_for_English_constituencies


 57%|██████████████████████████████████████████████▊                                   | 20/35 [00:04<00:03,  3.90it/s]

Living_people


 60%|█████████████████████████████████████████████████▏                                | 21/35 [00:18<01:01,  4.40s/it]

English_cricketers
Windows_games


 66%|█████████████████████████████████████████████████████▉                            | 23/35 [00:18<00:37,  3.11s/it]

American_film_actors


 69%|████████████████████████████████████████████████████████▏                         | 24/35 [00:19<00:25,  2.31s/it]

American_television_actors
Year_of_birth_missing_(living_people)
Place_of_birth_missing_(living_people)
English-language_films


 80%|█████████████████████████████████████████████████████████████████▌                | 28/35 [00:20<00:12,  1.74s/it]

British_films
American_films


 86%|██████████████████████████████████████████████████████████████████████▎           | 30/35 [00:21<00:06,  1.27s/it]

Indian_films


 89%|████████████████████████████████████████████████████████████████████████▋         | 31/35 [00:21<00:03,  1.01it/s]

Black-and-white_films


 91%|██████████████████████████████████████████████████████████████████████████▉       | 32/35 [00:21<00:02,  1.37it/s]

English-language_albums


 94%|█████████████████████████████████████████████████████████████████████████████▎    | 33/35 [00:21<00:01,  1.70it/s]

Debut_albums


 97%|███████████████████████████████████████████████████████████████████████████████▋  | 34/35 [00:22<00:00,  2.06it/s]

Major_League_Baseball_pitchers


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:22<00:00,  1.57it/s]


In [20]:
#rank per ogni nodo (come nello step3 dell'homework)
rank = []
for i in weight_dict:
    rank += list(OrderedDict(sorted(weight_dict[i].items(), key=lambda x: x[1], reverse=True)).keys())


In [None]:
# VERSIONE VISTA INSIEME

idx = 0
for name in tqdm(block_ranking):
    print(name)
    if idx == 0:
        weight_dict = {}
        boh = DG.subgraph(categories_after_ranking[name])
        weight_dict[name] = {}
        for i in boh.in_degree:
            print(idx)
            for j in DG.edges:
                if j[0] == i[0]:
                    DG[i[0]][j[1]]['weight'] = i[1]
            idx +=1
        for i in boh.in_degree: 
            weight_dict[name][i[0]] = i[1]
        

        try:
            boh = DG.subgraph(categories_after_ranking[name])
            weight_dict[name] = {}

            for i in boh.in_degree:
                cumsum = i[1]
                for j in DG.edges:
                    if j[1] == i[0]:
                        try:
                            cumsum+=(list(DG.edges[j[0],j[1]].values()))[0]
                        except:
                            pass
                weight_dict[name][i[0]] = cumsum


            for i in boh.in_degree:
                for j in DG.edges:
                    if j[0] == i[0]:
                        DG[i[0]][j[1]]['weight'] = weight_dict[name][i[0]]
        except:
            pass

weight_dict