In [15]:
import networkx as nx
import matplotlib.pyplot as plt
import json
import time
import datetime
from math import log10

In [2]:
def read_json_file(filename):
    with open(filename) as f:
        js_graph = json.load(f) #, default={'sender': 'source'})
        _attrs = dict(source='sender', target='receiver', name='guid',
              key='guid', link='links')
    #return nx.readwrite.node_link_graph(js_graph, {'link': 'links', 'source': 'sender', 'target': 'receiver', 'key': 'guid'})
    return nx.readwrite.node_link_graph(js_graph, directed=True, multigraph=False, attrs={'link': 'links', 'source': 'sender', 'target': 'receiver', 'key': 'guid', 'name': 'guid'} )

G = read_json_file("data.json")
print(nx.info(G))

Name: 
Type: DiGraph
Number of nodes: 6489
Number of edges: 52318
Average in degree:   8.0626
Average out degree:   8.0626


In [3]:
print ("SCC: ", nx.number_strongly_connected_components(G))
print ("WCC: ", nx.number_weakly_connected_components(G))

SCC:  6007
WCC:  107


In [4]:
G_ud = G.to_undirected()
print("is_connected", nx.is_connected(G_ud))
print("is_strongly_connected", nx.is_strongly_connected(G))
print("number_connected_components", nx.number_connected_components(G_ud))

G_mc = list(G_ud.subgraph(c).copy() for c in nx.connected_components(G_ud))
# Connected components are sorted in descending order of their size
G_components = list(nx.connected_component_subgraphs(G_ud))
_components = (G_ud.subgraph(c) for c in nx.connected_components(G_ud))

# todo: do for-loop to run algorithm for all components
G_mc = G_components[0]

print("is_connected", nx.is_connected(G_mc)) 
print("number_connected_components", nx.number_connected_components(G_mc))
print(nx.info(G_mc))

is_connected False
is_strongly_connected False
number_connected_components 107
is_connected True
number_connected_components 1
Name: 
Type: Graph
Number of nodes: 5330
Number of edges: 44939
Average degree:  16.8627


In [16]:

c = 10  
take_count = c * log10(nx.number_of_nodes(G_mc))
print("take_count",  int(take_count))

take_count 5330
take_count 3.7267272090265724
take_count 37


In [17]:
# Betweenness centrality approximation
c = 10
take_count = int(c * log10(nx.number_of_nodes(G_mc)))
print("take_count",take_count)

current_time = time.time()
bet_cent_edges = nx.edge_betweenness_centrality(G_mc, k=take_count, weight='lastTs') 
 
print("bet_cent_edges took", time.time()-current_time)

print(len(bet_cent_edges))
print(list(bet_cent_edges)[0])
print(bet_cent_edges[list(bet_cent_edges)[0]])

take_count 37
bet_cent_edges took 13.23160195350647
44939
('63ddc39455ee46d37a4730a96fcc05d6', '2415908ec83ea1597814d4bead9aacec')
1.408273678273541e-07


In [18]:
edges_max_goal = 8000
G_copy = G_mc.copy()
print(nx.info(G_copy))
removed_edges = []

def remove_edges(items):
    sorted_bet_cent_edges = sorted(items,reverse=False, key=lambda x: x[1])
    
    print("count :", len(list(sorted_bet_cent_edges)))

    for bet_cent in sorted_bet_cent_edges: 
        if G_copy.degree(bet_cent[0]) > 2 and G_copy.degree(bet_cent[1]) > 2:
            G_copy.remove_edge(*bet_cent)  
            removed_edges.append(bet_cent)
        
        if (G_copy.number_of_edges() < edges_max_goal):
            print("done :",G_copy.number_of_edges())
            break
 

print("G_copy edges before:", G_copy.number_of_edges()) 
remove_edges(bet_cent_edges)
print("G_copy edges after:", G_copy.number_of_edges())  
print("removed_edges:", len(removed_edges))
 

Name: 
Type: Graph
Number of nodes: 5330
Number of edges: 44939
Average degree:  16.8627
G_copy edges before: 44939
count : 44939
G_copy edges after: 9244
removed_edges: 35695


In [None]:
def postprocess(items):
    removed = sorted(items, reverse=True) 
    _components = list(nx.connected_component_subgraphs(G_copy))
    i = 0
     
    for edge in removed: 
        if nx.number_connected_components(G_copy) == 1:
            print("break")
            break
            
        for c in _components:
            if c.has_node(edge[0]) and c.has_node(edge[1]):
                break
            elif c.has_node(edge[0]) or c.has_node(edge[1]): 
                G_copy.add_edge(*edge)
                _components = list(nx.connected_component_subgraphs(G_copy))
    
    
postprocess(removed_edges)
print("done")

In [None]:
print("number_connected_components", nx.number_connected_components(G_mc))
print("number_connected_components", nx.number_connected_components(G_copy))

In [None]:
# save reduced graph into a file
_attrs = dict(source='sender', target='receiver', name='guid',
              key='guid', link='links')
s2 = nx.readwrite.node_link_data(G_copy, attrs={'link': 'links', 'source': 'sender', 'target': 'receiver', 'key': 'guid', 'name': 'guid'})

with open('data_processed.json', 'w') as outfile:
    json.dump(s2, outfile)

