In [None]:
import time

import pyspark
import networkx as nx

from les_mis import LES_MIS_GRAPH

# Reuse the active SparkContext if there is one, otherwise start a new one.
sc = pyspark.SparkContext.getOrCreate()
# Distribute the edge list as an RDD. Later cells index each record as x[0], x[1], x[2] --
# presumably (source, target, weight); confirm against les_mis.
graph = sc.parallelize(LES_MIS_GRAPH)

In [None]:
# Sanity check: how the edge RDD was partitioned and what a single record looks like.
# f-string interpolation already applies str() formatting, so no explicit str() call is needed.
print(f"Number of partitions: {graph.getNumPartitions()}")
print(f"First element in graph: {graph.first()}")

In [None]:
# Display the first edge record as the cell output.
graph.first()

In [None]:
# Count the edge records on the executors; collect() would ship every record
# to the driver only for it to be measured and thrown away.
graph.count()

In [None]:
# Every distinct node that appears at either end of an edge, gathered on the driver.
all_nodes = (
    graph
    .flatMap(lambda edge: (edge[0], edge[1]))
    .distinct()
    .collect()
)
len(all_nodes)

In [None]:
# Start with every node in its own singleton cluster, keyed "cluster:<position in all_nodes>".
cluster_map = {
    f"cluster:{index}": [member]
    for index, member in enumerate(all_nodes)
}
cluster_map

In [None]:
# Input clusters (C): one (node, cluster_id) pair per singleton cluster,
# distributed over 32 partitions.
clusters = sc.parallelize(
    [(members[0], cluster_id) for cluster_id, members in cluster_map.items()],
    32,
)

# Re-key each pair by its node so it can be joined against the adjacency data.
in_clusters = clusters.map(lambda x: (x[0], x))

# Input graph (G): emit every edge once per endpoint as (node, (other_node, x[2])),
# where x[2] is presumably the edge weight, then order the pairs by node.
# sortByKey() already sorts on the pair's key; its first positional parameter is the
# `ascending` flag, so passing a key-extracting lambda there only worked because a
# function object is truthy.
encoded_edges = graph.map(lambda x: (x[0], (x[1], x[2]))).union(
    graph.map(lambda x: (x[1], (x[0], x[2])))
).sortByKey()

# Count on the executors instead of collecting every pair to the driver.
encoded_edges.count()

In [None]:

def to_list(a):
    """Wrap a single value in a new one-element list."""
    singleton = [a]
    return singleton

def append(a, b):
    """Add ``b`` as one new element at the end of list ``a`` (in place) and return ``a``."""
    a += [b]
    return a

def extend(a, b):
    """Add every element of ``b`` to the end of list ``a`` (in place) and return ``a``."""
    a += b
    return a

# Fold the per-edge (node, value) pairs into one list of values per node, ordered by node.
# NOTE: this statement rebinds encoded_edges from itself, so running it a second time
# without rebuilding encoded_edges first would wrap the lists in another list level.
encoded_edges = (
    encoded_edges
    .combineByKey(createCombiner=to_list, mergeValue=append, mergeCombiners=extend)
    .sortBy(keyfunc=lambda kv: kv[0])
)

In [None]:
# Display one (node, cluster_id) pair from the cluster RDD as the cell output.
clusters.first()

In [None]:
# Bring the per-node records to the driver and preview the first four.
collected = encoded_edges.collect()
collected[:4]

In [None]:
# Join each node's cluster record with its entry in encoded_edges; both RDDs are keyed by node.
# TODO: this step is slow -- find a faster way to do it.
zipped = in_clusters.join(encoded_edges)
zipped.collect()

In [None]:
# Re-key each joined record by its cluster id:
#   (node, ((node, cluster_id), neighbors)) -> (cluster_id, (node, neighbors))
mod_zipped = zipped.map(
    lambda rec: (rec[1][0][1], (rec[0], rec[1][1]))
).collect()
mod_zipped[:2]

In [None]:
# Replace each joined value with only its first component (the record contributed by
# in_clusters), keeping the node key. flatMapValues iterates the returned 1-tuple, so
# exactly one output pair is produced per input pair.
first_agg = zipped.flatMapValues(lambda x: (x[0], )).collect()
# collect() returns a plain Python list, which has no .first() method; index it instead.
first_agg[0]

In [None]:
# Hand-written sample record shaped as (cluster_id, (node, [(neighbor, number), ...]));
# the numbers are presumably edge weights.
temp_element = (
    'cluster:2',
    (
        'Champmathieu',
        [
            ('Valjean', 3),
            ('Judge', 3),
            ('Bamatabois', 2),
            ('Brevet', 2),
            ('Chenildieu', 2),
            ('Cochepaille', 2),
        ],
    ),
)

# Show just the neighbor list of the sample record.
temp_element[1][1]

In [None]:
# Target shape after the "Aggregate by C(luster)" operation described on page 693 of the
# paper: (cluster_id, [(node, neighbor_tuple, parallel_number_tuple), ...]).
alternative = (
    "cluster:2",
    [
        (
            "Champmathieu",
            ('Valjean', 'Judge', 'Bamatabois', 'Brevet', 'Chenildieu', 'Cochepaille'),
            (3, 3, 2, 2, 2, 2),
        ),
        # ... other elements in the cluster
    ],
)

In [None]:
# Half of the summed third field (presumably the edge weight) over all edge records.
# NOTE(review): confirm the division by 2 matches the paper's definition of volume.
total_volume = graph.map(lambda edge: edge[2]).sum() / 2

# Subtract the summed numbers recorded for the first node of the `alternative` sample.
cut_with_v = total_volume - sum(alternative[1][0][2])
cut_with_v

In [None]:
# `clusters` is an RDD, which does not support subscripting, so fetch its first
# record with .first() instead of [0].
print(f"First element of cluster: {clusters.first()}")
print(f"First encoded graph edge: {encoded_edges.first()}")

In [None]:
class Node:
    """A graph vertex: an identifier plus a collection of neighbor entries."""

    def __init__(self, id, neighbors):
        """Store the identifier and the neighbor collection as given (no copy is made)."""
        self.id, self.neighbors = id, neighbors

    def get_volume(self):
        """Return the number of entries in ``neighbors`` (duplicates are counted)."""
        neighbor_count = len(self.neighbors)
        return neighbor_count

    def get_cut(self, other):
        """Return how many distinct neighbor entries this node shares with ``other``."""
        mine = set(self.neighbors)
        theirs = set(other.neighbors)
        return len(mine & theirs)