In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from collections import OrderedDict
from tqdm import tqdm

In [2]:
df = pd.read_csv('wiki-topcats-reduced.txt', sep='\t', header = None)
df.columns = ['from', 'to']

In [3]:
#vertexes
fr = df.groupby('from').count()
to = df.groupby('to').count()
V = set(list(fr.index) + list(to.index))
print('Number of nodes =', len(V))

Number of nodes = 461193


In [4]:
#directed cause these two are different
print(df[df['from'] ==52], '\n')
print(df[df['from'] ==401135].head())

   from       to
0    52   401135
1    52  1069112
2    52  1163551 

          from      to
361754  401135   60219
361755  401135  167532
361756  401135  400980
361757  401135  401018
361758  401135  401019


In [5]:
print('Number of edges:',len(df.index))

Number of edges: 2645247


In [6]:
print('The average degree of the vertex is:', len(df.index)/len(V))

The average degree of the vertex is: 5.735661642739591


In [8]:
o = open('wiki-topcats-categories.txt', 'r')
categories = {}
for line in o :
    line = line.replace('\n','')
    l = line.split(' ')
    l[0] = l[0].replace('Category:', '').replace(';','')
    try:
        #print(len(l[1:]))
        if len(l[1:])>=3500:
            categories[l[0]] = list(map(int, l[1:]))
    except:
        pass


In [9]:
input_category = list(categories.keys())[7]

In [10]:
DG = nx.from_pandas_edgelist(df, 'from', 'to', create_using=nx.DiGraph )

In [11]:
class Dijkstra:

    def dijkstra(self, dic, dist):
        node = list(dic.keys())[0]
        actual_dist = list(dic.values())[0]
        l = []
        for i in list(self.graph[node]):
            try:
                if (i in self.visited):
                    if self.actual_node in self.visited[i]:
                        if (self.visited[i][1] > dist):
                            self.visited[i][1] = dist
                    else:
                        self.visited[i][self.actual_node] = dist
     
                if (self.unvisited[i] == -1):
                    del self.unvisited[i]
                    self.visited[i] = {self.actual_node:dist}
                l.append({i:dist})
            except:
                pass
                              
        for i in l:
            self.dijkstra(i, dist+1)
            
    
    
    def __init__(self, graph, categories, input_category):
        
        self.graph = graph
        self.nodes = categories
        self.initial = categories[input_category]
        self.unvisited = {}
        for i in graph.nodes:
            self.unvisited[i] = -1
        self.visited = {}
        
        for i in self.initial:
            try:
                self.actual_node = i
                self.visited[i] = {self.actual_node: 0}
                del self.unvisited[i]
                self.dijkstra({self.actual_node:0},1)
            except:
                pass
            


In [12]:
dij =Dijkstra(DG, categories, input_category)

In [13]:
median = {}
set_of_visited = set(list(dij.visited.keys()))
for i in categories:
    if i != input_category:
        shortest_path=[]
        s = set(categories[i]).intersection(set_of_visited)
        if len(s)> 0:
            for j in s:
                shortest_path += (list(dij.visited[j].values()))
            median[i] = np.median(shortest_path)
        else:
            median[i] = 100**100
median[input_category] = -1

In [14]:
block_ranking = OrderedDict()
block_ranking = OrderedDict(sorted(median.items(), key=lambda x: x[1]))

In [15]:
# map each article to the categories it is present

articles = {}
for name in categories:
    for article in categories[name]:
        if article in articles:
            articles[article].append(name)
        else:
            articles[article] = [name]

In [16]:
# if an article belongs to multiple categories, choose one according to block_ranking

for article in articles:
    if len(articles[article]) > 1:
        minimum = ''
        for cat in articles[article]:
            if (minimum == '') or (block_ranking[minimum] > block_ranking[cat]):
                minimum = cat
        articles[article] = [minimum]

In [17]:
categories_after_ranking = {}

for i in articles:
    if articles[i][0] in categories_after_ranking:
        categories_after_ranking[articles[i][0]] +=  [i]
    else:
        categories_after_ranking[articles[i][0]] =  [i]


In [18]:
# VERSIONE MODIFICATA
# da quella di prima cambia come trovare gli archi adatti (jj)

idx = 0
jj = nx.to_dict_of_lists(DG)

for name in tqdm(block_ranking):
    print(name)
    if idx == 0:
        weight_dict = {}
        boh = DG.subgraph(categories_after_ranking[name])
        weight_dict[name] = {}
        for i in (boh.in_degree):
            for j in jj[i[0]]:
                    DG[i[0]][j]['weight'] = i[1]
            idx +=1
        for  i in boh.in_degree:
            weight_dict[name][i[0]] = i[1]
    else:
        try:
            boh = DG.subgraph(categories_after_ranking[name])
            weight_dict[name] = {}

            for i in boh.in_degree:
                cumsum = i[1]
                for j in jj[i[0]]:
                        try:
                            cumsum+=(list(DG.edges[j,i[0]].values()))[0]
                        except:
                            pass
                weight_dict[name][i[0]] = cumsum


            for i in boh.in_degree:
                for j in jj[i[0]]:
                        DG[i[0]][j]['weight'] = weight_dict[name][i[0]]
        except:
            pass



  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

Year_of_birth_unknown
Year_of_birth_missing


  6%|████▋                                                                              | 2/35 [00:00<00:02, 12.85it/s]

Article_Feedback_Pilot


  9%|███████                                                                            | 3/35 [00:00<00:09,  3.28it/s]

English_cricketers


 11%|█████████▍                                                                         | 4/35 [00:01<00:07,  4.05it/s]

Year_of_death_missing
Fellows_of_the_Royal_Society


 17%|██████████████▏                                                                    | 6/35 [00:01<00:05,  4.94it/s]

Black-and-white_films


 20%|████████████████▌                                                                  | 7/35 [00:01<00:07,  3.61it/s]

Main_Belt_asteroids


 23%|██████████████████▉                                                                | 8/35 [00:01<00:06,  3.90it/s]

Asteroids_named_for_people
American_military_personnel_of_World_War_II


 29%|███████████████████████▍                                                          | 10/35 [00:02<00:05,  4.64it/s]

English_television_actors


 31%|█████████████████████████▊                                                        | 11/35 [00:02<00:06,  3.71it/s]

American_film_actors


 34%|████████████████████████████                                                      | 12/35 [00:04<00:14,  1.62it/s]

British_films


 37%|██████████████████████████████▍                                                   | 13/35 [00:04<00:10,  2.02it/s]

American_Jews


 40%|████████████████████████████████▊                                                 | 14/35 [00:04<00:09,  2.32it/s]

American_films


 43%|███████████████████████████████████▏                                              | 15/35 [00:05<00:09,  2.00it/s]

People_from_New_York_City


 46%|█████████████████████████████████████▍                                            | 16/35 [00:05<00:07,  2.47it/s]

American_television_actors


 49%|███████████████████████████████████████▊                                          | 17/35 [00:05<00:05,  3.14it/s]

Harvard_University_alumni


 51%|██████████████████████████████████████████▏                                       | 18/35 [00:05<00:04,  3.61it/s]

Rivers_of_Romania


 54%|████████████████████████████████████████████▌                                     | 19/35 [00:05<00:04,  3.84it/s]

English-language_films


 57%|██████████████████████████████████████████████▊                                   | 20/35 [00:06<00:05,  2.62it/s]

The_Football_League_players


 60%|█████████████████████████████████████████████████▏                                | 21/35 [00:06<00:04,  2.89it/s]

English_footballers
English-language_albums


 66%|█████████████████████████████████████████████████████▉                            | 23/35 [00:07<00:03,  3.42it/s]

Living_people


 69%|████████████████████████████████████████████████████████▏                         | 24/35 [00:17<00:37,  3.38s/it]

Debut_albums


 71%|██████████████████████████████████████████████████████████▌                       | 25/35 [00:17<00:24,  2.44s/it]

Members_of_the_United_Kingdom_Parliament_for_English_constituencies


 74%|████████████████████████████████████████████████████████████▉                     | 26/35 [00:18<00:15,  1.77s/it]

Year_of_birth_missing_(living_people)
Place_of_birth_missing_(living_people)
Association_football_goalkeepers
Association_football_forwards
Windows_games


 89%|████████████████████████████████████████████████████████████████████████▋         | 31/35 [00:18<00:04,  1.25s/it]

Association_football_midfielders
Association_football_defenders
Indian_films


 97%|███████████████████████████████████████████████████████████████████████████████▋  | 34/35 [00:18<00:00,  1.11it/s]

Major_League_Baseball_pitchers


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:18<00:00,  1.87it/s]


In [19]:
#rank per ogni nodo (come nello step3 dell'homework)
rank = []
for i in weight_dict:
    rank += list(OrderedDict(sorted(weight_dict[i].items(), key=lambda x: x[1], reverse=True)).keys())


[62684,
 170163,
 1656777,
 1656780,
 169696,
 1656794,
 170578,
 1342864,
 1343014,
 1656778,
 666855,
 1342960,
 1779656,
 174582,
 1109348,
 1109485,
 159606,
 159730,
 159920,
 1766063,
 1203095,
 1203235,
 64632,
 1203496,
 1122762,
 1344701,
 34422,
 1443739,
 174427,
 174439,
 166284,
 666857,
 159750,
 168001,
 168145,
 168251,
 168258,
 1765824,
 1765831,
 1765837,
 185120,
 62695,
 1340874,
 170158,
 1203101,
 170969,
 170970,
 170971,
 170972,
 170973,
 1342803,
 1343206,
 958480,
 172050,
 1319048,
 360595,
 1122603,
 1122605,
 156307,
 173221,
 1344730,
 1344821,
 173671,
 1656450,
 1656452,
 1656453,
 1656455,
 1443741,
 1656779,
 1656793,
 1779795,
 1779800,
 174428,
 1345946,
 1345947,
 1190355,
 60061,
 748777,
 175366,
 1109359,
 159614,
 167906,
 159736,
 159749,
 159753,
 159754,
 159766,
 167966,
 168100,
 159914,
 159934,
 176367,
 168194,
 1765823,
 1765832,
 1765845,
 1765848,
 201333,
 1684172,
 324414,
 1766449,
 1766721,
 186174,
 186176,
 456791,
 1144930,
 

In [None]:
# VERSIONE VISTA INSIEME

idx = 0
for name in tqdm(block_ranking):
    print(name)
    if idx == 0:
        weight_dict = {}
        boh = DG.subgraph(categories_after_ranking[name])
        weight_dict[name] = {}
        for i in boh.in_degree:
            print(idx)
            for j in DG.edges:
                if j[0] == i[0]:
                    DG[i[0]][j[1]]['weight'] = i[1]
            idx +=1
        for i in boh.in_degree: 
            weight_dict[name][i[0]] = i[1]
        

        try:
            boh = DG.subgraph(categories_after_ranking[name])
            weight_dict[name] = {}

            for i in boh.in_degree:
                cumsum = i[1]
                for j in DG.edges:
                    if j[1] == i[0]:
                        try:
                            cumsum+=(list(DG.edges[j[0],j[1]].values()))[0]
                        except:
                            pass
                weight_dict[name][i[0]] = cumsum


            for i in boh.in_degree:
                for j in DG.edges:
                    if j[0] == i[0]:
                        DG[i[0]][j[1]]['weight'] = weight_dict[name][i[0]]
        except:
            pass

weight_dict