In [1]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm
from collections import Counter
from pickle import dump

import sys
sys.path.append('../src')
from utils import w_label_prop
from quality_functions import eval_functions
from parse_data import parse_twitter

N_NODES = 90291

def read_graph(path="../datasets/fb_dg_gabr", n_nodes=None):
    """Build an undirected graph from an adjacency-list text file.

    Line number ``v`` of the file (0-based) is split on whitespace and every
    token is parsed as an integer ``u``; an edge ``(v, u)`` is added for each.
    Blank lines contribute no edges.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    n_nodes : int or None
        Nodes ``0 .. n_nodes - 1`` are created up front, so nodes without any
        edge still appear in the graph. When ``None`` (the default) the
        module-level ``N_NODES`` is read at call time, as before.

    Returns
    -------
    networkx.Graph
        The populated graph.
    """
    if n_nodes is None:
        # Looked up at call time on purpose: the notebook rebinds N_NODES.
        n_nodes = N_NODES

    graph = nx.Graph()
    graph.add_nodes_from(range(n_nodes))

    # The context manager closes the file even if a line fails to parse.
    with open(path) as adjacency_file:
        for v, line in enumerate(adjacency_file):
            for u in map(int, line.split()):
                graph.add_edge(v, u)
    return graph

def get_comm_sizes_samples(graph, n_samples=2, verbose=True):
    """Call ``w_label_prop`` repeatedly and collect community sizes per run.

    Each iteration calls ``w_label_prop(graph)``, which is expected to return
    a ``(labels, deltas)`` pair. The size of a community is the number of
    times its label occurs in ``labels``.
    NOTE(review): ``w_label_prop`` is presumably a (weighted) label
    propagation routine from ``utils`` -- confirm against that module.

    Parameters
    ----------
    graph
        Passed unchanged to ``w_label_prop``.
    n_samples : int
        Number of independent runs.
    verbose : bool
        When true, for the first five runs print the number of communities
        and the (up to) five largest community sizes.

    Returns
    -------
    (list, list)
        ``ans[i]`` is the list of community sizes from run ``i``;
        ``deltas[i]`` is the second value returned by ``w_label_prop`` in
        run ``i``.
    """
    ans = []
    deltas = []
    for i in tqdm(range(n_samples)):
        labels, cur_deltas = w_label_prop(graph)
        deltas.append(cur_deltas)
        comm_sizes = list(Counter(labels).values())
        ans.append(comm_sizes)
        if verbose and i < 5:
            print(len(comm_sizes))
            # Slicing clamps to the list length, so no explicit min() is
            # needed (the original called len() with two arguments here and
            # raised TypeError).
            top_sizes = sorted(comm_sizes, reverse=True)[:5]
            print(top_sizes)
    return ans, deltas

# Suffixes of the derived graph files; the next cell loads each one from
# '../tmp_files/tw_dg_<suffix>'.
graph_types = ['eps', 'knn', 'inf']

# Binary output stream for the pickled (labels, graph) pairs. It is left open
# on purpose: the next cell writes to it with dump() and then closes it.
f = open('../tmp_files/twitter_cd', "wb")

In [2]:
from time import time

# Second value returned by w_label_prop for each graph, in processing order.
all_deltas = []

# try/finally guarantees the pickle stream opened in the previous cell is
# flushed and closed even if a run fails or the cell is interrupted.
try:
    for fname in ['conn'] + graph_types:
        t_begin = time()
        print(fname.upper())
        if fname == 'conn':
            graph = parse_twitter()
        else:
            graph = read_graph('../tmp_files/tw_dg_%s' % fname)
        # Rebinds the module-level N_NODES, which read_graph() consults on
        # the following iterations ('conn' is loaded first via parse_twitter).
        N_NODES = graph.number_of_nodes()
        print("N_nodes:", graph.number_of_nodes())
        print("N_edges:", graph.number_of_edges())
        print("Ratio:", graph.number_of_edges() / graph.number_of_nodes())
        labels, deltas = w_label_prop(graph)
        all_deltas.append(deltas)
        # One pickle record per graph, appended to the shared stream `f`.
        dump((labels, graph), f)
        # Drop the references before the next graph is loaded.
        del labels
        del graph
        print('time', time() - t_begin)
finally:
    f.close()

# Output pasted from an earlier run (was a no-op string literal in the cell):
#   KNN
#   N_nodes: 548553
#   N_edges: 2542747
#   Ratio: 4.63537160493152
#   time 451.4027636051178

CONN
N_nodes: 112416
N_edges: 308927
Ratio: 2.7480696697978937
time 52.80955743789673
EPS
N_nodes: 112416
N_edges: 835865
Ratio: 7.43546292342727
time 69.53001308441162
KNN
N_nodes: 112416
N_edges: 588110
Ratio: 5.231550668943923
time 81.71420645713806
INF
N_nodes: 112416
N_edges: 216021
Ratio: 1.9216214773697695
time 42.08502268791199


In [3]:
from pickle import load

from networkx.algorithms.components import connected_components

def comp_sizes(graph):
    """Return the size of every connected component of ``graph``.

    The sizes are listed in the order ``connected_components`` yields the
    components; each size is ``len()`` of the yielded component.
    """
    return [len(component) for component in connected_components(graph)]

def top_k(array, k=10):
    """Return the ``k`` largest items of ``array`` in descending order.

    If ``array`` holds fewer than ``k`` items, all of them are returned.
    The input is not modified.
    """
    descending = sorted(array)
    # Reverse the ascending sort (rather than sorting with reverse=True) so
    # that equal items keep the same relative order as before.
    descending.reverse()
    limit = min(k, len(array))
    return descending[:limit]


# Labels printed for the records, in the order the previous cell wrote them.
all_graph_types = ['conn', 'eps', 'knn', 'inf']
cur_ind = 0

# NOTE: pickle.load can execute arbitrary code; only read files produced by
# this notebook.
with open("../tmp_files/twitter_cd", "rb") as f_in:
    with open("../tmp_files/twitter_cd.pickle", "wb") as f_out:
        while True:
            t_begin = time()
            try:
                labels, graph = load(f_in)
            except EOFError as ex:
                # End of the pickle stream: every record has been consumed.
                # Only this exception ends the loop; any other error (or a
                # keyboard interrupt) now propagates instead of being
                # printed and silently stopping the loop.
                print(ex)
                break
            print(all_graph_types[cur_ind].upper())
            cur_ind += 1
            # Community size = number of occurrences of each label.
            comm_sizes = list(Counter(labels).values())
            cmp_sizes = comp_sizes(graph)
            print("comm", top_k(comm_sizes))
            print("comp", top_k(cmp_sizes))
            dump((graph, comm_sizes, cmp_sizes), f_out)
            # Drop the references before the next record is loaded.
            del graph
            del comm_sizes, cmp_sizes
            print(time() - t_begin)

CONN
comm [11440, 7486, 2615, 1624, 1327, 1267, 879, 764, 658, 621]
comp [87349, 17, 16, 14, 12, 12, 11, 11, 11, 11]
3.86384916305542
EPS
comm [449, 420, 414, 334, 330, 279, 277, 276, 258, 225]
comp [24985, 166, 142, 140, 105, 72, 67, 64, 52, 50]
6.4585583209991455
KNN
comm [746, 669, 410, 359, 308, 258, 250, 242, 222, 215]
comp [90215, 76, 1, 1, 1, 1, 1, 1, 1, 1]
5.354591369628906
INF
comm [328, 237, 211, 209, 198, 197, 173, 161, 154, 152]
comp [77442, 95, 85, 80, 76, 76, 75, 69, 67, 67]
2.883502960205078
Ran out of input
