This notebook reduces the size of our network for visualisation purposes.

In [1]:
import os
import re

import numpy as np
import pandas as pd

from helper import display_df_with_bokeh

In [2]:
# Folder (relative to the notebook) that holds the raw input datasets;
# it is used further down to build the path of the cit-HepTh.txt edge list.
DATASETS_FOLDER = "datasets"

In [3]:
# Load the node table (one row per node, including its modularity class and
# the other network statistics) from the CSV stored under gephi/citations.
gephi_nodes_path = "gephi/citations/nodes_with_all_network_stats.csv"
with open(gephi_nodes_path, 'r') as nodes_file:
    gephi_df = pd.read_csv(nodes_file, header=0)

In [4]:
# Count the non-null entries of every column per modularity class and list the
# classes from the most populated to the least populated (ranked by 'Id' count).
modularity_df = (
    gephi_df
    .groupby('modularity_class')
    .count()
    .sort_values('Id', ascending=False)
)

# Keep only the classes that contain at least 10 nodes (the bound is inclusive)
modularity_mt_nodes_df = modularity_df[modularity_df['Id'] >= 10]

display_df_with_bokeh(modularity_mt_nodes_df, include_index=True)

In [5]:
# modularity_mt_nodes_df was built with the inclusive filter `Id >= 10`, so the
# message says "at least 10" (communities of exactly 10 members are included).
# Only the 'Id' column is summed instead of summing every column and picking one.
print("Total nodes that form part of communities with at least 10 members: ", modularity_mt_nodes_df['Id'].sum())


Total nodes that form part of communities with more than 10 members:  27379


In [6]:
# Number of nodes that sit in the communities excluded by the size threshold
small_communities_df = modularity_df[modularity_df['Id'] < 10]
print("Total nodes that form part of communities with less than 10 members: ", small_communities_df['Id'].sum())


Total nodes that form part of communities with less than 10 members:  363


In [7]:
# Per-class counts for the communities that have fewer than 10 nodes
modularity_df.loc[modularity_df['Id'] < 10]

Unnamed: 0_level_0,Id,Label,timeset,indegree,outdegree,Degree,weighted indegree,weighted outdegree,Weighted Degree,Authority,Hub,componentnumber,strongcompnum,pageranks,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,clustering,eigencentrality
modularity_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
102,8,8,0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
11,7,7,0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
24,6,6,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
109,6,6,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
90,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
36,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
95,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
153,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
72,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
124,4,4,0,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4


In [None]:
# Inspect every node that was assigned to modularity class 2
gephi_df.loc[gephi_df['modularity_class'].eq(2)]

In [None]:
# Total number of nodes across the retained (large enough) modularity classes
total_nodes = modularity_mt_nodes_df['Id'].sum()
total_nodes

In [None]:
# Will hold the highest-degree nodes picked from each modularity class.
# It stays None until the per-class selection loop below has been run.
nodes_per_modularity_class_df = None

In [None]:
# Pick the highest-degree nodes of every retained modularity class.
#
# For a class with `n` nodes out of `total_nodes` retained nodes, the quota is
# round((n / total_nodes) * n) + 1, so every class keeps at least one node.
# NOTE(review): the quota grows with the square of the class size - confirm
# that this is the intended down-sampling rule.
#
# The per-class frames are collected in a list and concatenated once:
#   * DataFrame.append was removed in pandas 2.0, so the old accumulation
#     pattern no longer runs on current pandas;
#   * rebuilding the result from scratch makes this cell idempotent, whereas
#     re-running the old loop appended duplicates to the existing frame.
selected_per_class = []

for modularity, total_nodes_in_class in modularity_mt_nodes_df['Id'].items():

    proportion_of_network = (total_nodes_in_class / total_nodes)
    subset_of_nodes = int(round(proportion_of_network * total_nodes_in_class)) + 1

    print(f"Modularity class: {modularity} - Total nodes in class: {total_nodes_in_class} - Proportion of final network: {proportion_of_network} - New total {subset_of_nodes}")

    # Rank the nodes of this class by degree and keep only the top `subset_of_nodes`
    selected_per_class.append(
        gephi_df[gephi_df.modularity_class == modularity]
        .sort_values('Degree', ascending=False)
        .head(subset_of_nodes)
    )

if selected_per_class:
    nodes_per_modularity_class_df = pd.concat(selected_per_class, ignore_index=True)
else:
    # No class passed the size threshold: keep an empty frame with the same
    # columns so the following cells can still select 'Id'/'modularity_class'.
    nodes_per_modularity_class_df = gephi_df.iloc[0:0]

In [None]:
# Show the selected nodes and report how many were kept in total
display_df_with_bokeh(nodes_per_modularity_class_df)
selected_node_count = len(nodes_per_modularity_class_df)
print("Most important nodes of our network: ", selected_node_count)

In [None]:
# Read the tab-separated cit-HepTh edge list. The first three lines are
# skipped, so the fourth line is consumed as the header row.
edge_list_path = f"{DATASETS_FOLDER}/cit-HepTh.txt"
with open(edge_list_path, 'r') as edge_file:
    hepth_df = pd.read_csv(edge_file, sep='\t', skiprows=[0, 1, 2])

# Replace the header read from the file with explicit source/target names
hepth_df.columns = ['FromNodeId', 'ToNodeId']

In [None]:
# Preview the full edge list (columns FromNodeId / ToNodeId) before filtering
display_df_with_bokeh(hepth_df)

In [None]:
# Keep only the node identifier and its modularity class; these two columns
# are all the edge-list joins need.
nodes_with_modularity_df = nodes_per_modularity_class_df.loc[:, ['Id', 'modularity_class']]
display_df_with_bokeh(nodes_with_modularity_df)

In [None]:
# Attach the modularity class of both endpoints to every edge of the original
# network. Both joins are outer joins, so an endpoint that is not among the
# selected nodes ends up with a missing (NaN) class instead of being dropped here.

# Step 1: class of the citing node (FromNodeId) -> 'modularity_from'
edges_with_source_class = (
    hepth_df
    .merge(nodes_with_modularity_df, how="outer", left_on='FromNodeId', right_on='Id')
    .rename(columns={'modularity_class': 'modularity_from'})
    .drop(columns=['Id'])
)

# Step 2: class of the cited node (ToNodeId) -> 'modularity_to'
hepth_df_with_modularity = (
    edges_with_source_class
    .merge(nodes_with_modularity_df, how="outer", left_on='ToNodeId', right_on='Id')
    .rename(columns={'modularity_class': 'modularity_to'})
    .drop(columns=['Id'])
)

In [None]:
# Discard every row with a missing value, i.e. every edge whose source or
# target was not found in nodes_with_modularity_df, then cast the remaining
# columns to integers.
hepth_df_with_modularity = hepth_df_with_modularity.dropna().astype(int)

In [None]:
# Show the reduced edge list and print its (rows, columns) shape
display_df_with_bokeh(hepth_df_with_modularity)
print(hepth_df_with_modularity.shape)

In [None]:
# Persist the reduced edge list (edges between the selected nodes only).
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default - confirm the consumer of this file expects that column.
reduced_network_path = "gephi/reduced_size_network.csv"
hepth_df_with_modularity.to_csv(reduced_network_path)