In [1]:
import os
import re

import numpy as np
import pandas as pd

In [2]:
# Folder (relative to the notebook's working directory) that holds the raw input datasets
DATASETS_FOLDER = "datasets"

In [3]:
# Load the node table exported from Gephi; the first line of the file is the header row.
# Later cells read its Id, modularity_class and Degree columns.
gephi_nodes_path = "gephi/nodes_with_modularity_and_degree.csv"
with open(gephi_nodes_path, mode='r') as nodes_file:
    gephi_df = pd.read_csv(nodes_file, header=0)

In [4]:
# Per-class counts of non-null values in every column; the 'Id' count is used
# as the class size. Classes are ordered from largest to smallest.
class_counts = gephi_df.groupby('modularity_class').count()
class_counts = class_counts.sort_values('Id', ascending=False)

# Keep only the classes that have at least 10 nodes
modularity_df = class_counts[class_counts['Id'] >= 10]
modularity_df.head()

Unnamed: 0_level_0,Id,Label,timeset,indegree,outdegree,Degree
modularity_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,3378,3378,0,3378,3378,3378
34,2370,2370,0,2370,2370,2370
37,2337,2337,0,2337,2337,2337
54,1899,1899,0,1899,1899,1899
62,1668,1668,0,1668,1668,1668


In [5]:
# Number of nodes across all retained modularity classes
total_nodes = modularity_df['Id'].sum()

In [6]:
# Placeholder for the DataFrame that will hold the highest-degree ("most important")
# nodes of each modularity class; it is populated by the loop in the next cell.
nodes_per_modularity_class_df = None

In [7]:
# Select the highest-degree nodes of every retained modularity class.
#
# A class that holds a fraction p of the retained nodes contributes
# round(p * class_size) + 1 nodes, so larger classes contribute more nodes
# and every class keeps at least one.
top_nodes_frames = []
for modularity, total_nodes_in_class in modularity_df['Id'].items():
    subset_of_nodes = int(round((total_nodes_in_class / total_nodes) * total_nodes_in_class)) + 1

    top_nodes_in_class = (
        gephi_df[gephi_df.modularity_class == modularity]
        .sort_values('Degree', ascending=False)
        .head(subset_of_nodes)
    )
    top_nodes_frames.append(top_nodes_in_class)

# Build the result with a single concat: DataFrame.append was removed in
# pandas 2.0, and rebuilding the frame from scratch also makes this cell safe
# to re-run (the old accumulate-in-place version duplicated rows on a re-run).
if top_nodes_frames:
    nodes_per_modularity_class_df = pd.concat(top_nodes_frames, ignore_index=True)
else:
    # No class passed the size filter: keep the schema, but with zero rows
    nodes_per_modularity_class_df = gephi_df.iloc[0:0].copy()

In [8]:
# Preview the first selected nodes (highest Degree within their modularity class)
nodes_per_modularity_class_df.head()

Unnamed: 0,Id,Label,timeset,modularity_class,indegree,outdegree,Degree
0,9610043,9610043,,2,1199,19,1218
1,9510017,9510017,,2,1155,10,1165
2,9503124,9503124,,2,1114,10,1124
3,9510135,9510135,,2,775,15,790
4,9410167,9410167,,2,748,25,773


In [9]:
# Read the tab-separated edge list. The first three lines of the file are
# skipped, so the fourth line is consumed as the header row.
hepth_path = f"{DATASETS_FOLDER}/cit-HepTh.txt"
with open(hepth_path, mode='r') as edges_file:
    hepth_df = pd.read_csv(edges_file, sep='\t', skiprows=range(3))

# Replace the header taken from the file with clean column names
hepth_df.columns = ['FromNodeId', 'ToNodeId']

In [10]:
# Preview the first rows of the raw edge list
hepth_df.head()

Unnamed: 0,FromNodeId,ToNodeId
0,1001,9304045
1,1001,9308122
2,1001,9309097
3,1001,9311042
4,1001,9401139


In [11]:
# Keep only the node id and its modularity class; this is the lookup table
# joined against both endpoints of every edge in the next step.
lookup_columns = ['Id', 'modularity_class']
nodes_with_modularity_df = nodes_per_modularity_class_df.loc[:, lookup_columns]
nodes_with_modularity_df.head()

Unnamed: 0,Id,modularity_class
0,9610043,2
1,9510017,2
2,9503124,2
3,9510135,2
4,9410167,2


In [12]:
# Annotate every edge with the modularity class of both of its endpoints,
# using only the selected (most important) nodes as the lookup table.
#
# Both joins are outer joins, so rows without a match on either side are kept
# with NaN in the missing columns; the following cell removes those rows.

# 1) Class of the FromNodeId endpoint
edges_with_from = hepth_df.merge(
    nodes_with_modularity_df,
    how="outer",
    left_on='FromNodeId',
    right_on='Id',
)
edges_with_from = (
    edges_with_from
    .rename(columns={'modularity_class': 'modularity_from'})
    .drop(columns=['Id'])  # join key from the lookup table is no longer needed
)

# 2) Class of the ToNodeId endpoint
edges_with_both = edges_with_from.merge(
    nodes_with_modularity_df,
    how="outer",
    left_on='ToNodeId',
    right_on='Id',
)
hepth_df_with_modularity = (
    edges_with_both
    .rename(columns={'modularity_class': 'modularity_to'})
    .drop(columns=['Id'])
)

In [13]:
# Drop every row that contains a NaN, i.e. every edge with an endpoint that
# was not found in nodes_with_modularity_df (plus the unmatched lookup rows
# added by the outer joins), then cast all columns to int.
hepth_df_with_modularity = hepth_df_with_modularity.dropna().astype(int)

In [14]:
# Preview the reduced edge list with the modularity class of both endpoints
hepth_df_with_modularity.head()

Unnamed: 0,FromNodeId,ToNodeId,modularity_from,modularity_to
17,9505105,9308122,54,54
18,9506112,9308122,54,54
19,9508155,9308122,54,54
20,9601108,9308122,54,54
22,9609239,9308122,37,54


In [15]:
# Save the reduced network (edges whose two endpoints are both among the most
# important nodes) as CSV.
# NOTE(review): to_csv is called without index=False, so the DataFrame's row
# index is written as an extra, unnamed first column — confirm this is wanted
# by whatever consumes the file.
hepth_df_with_modularity.to_csv("gephi/reduced_size_network.csv")