In [None]:
# To make GraphFrames available, activate your virtual environment (with pyspark already installed) and launch PySpark with the package flag:
# pyspark --packages graphframes:graphframes:0.6.0-spark2.3-s_2.11

import pyspark

from les_mis import LES_MIS_GRAPH

# Reuse the active SparkContext if one exists; otherwise create one.
sc = pyspark.SparkContext.getOrCreate()
# Distribute the imported edge collection over 32 partitions.
# Later cells index each record as [0], [1], [2] — presumably (source, target, weight); confirm against les_mis.
graph = sc.parallelize(LES_MIS_GRAPH, numSlices=32)

In [None]:
# Gather every distinct edge endpoint on the driver.
# NOTE(review): ids below follow the order returned by distinct().collect(); confirm that order is stable across runs.
nodes = (
    graph
    .flatMap(lambda edge: (edge[0], edge[1]))
    .distinct()
    .collect()
)
# Two-way lookup between a node and its position in `nodes`.
id_to_node = dict(enumerate(nodes))
node_to_id = {name: idx for idx, name in id_to_node.items()}
len(nodes)

In [None]:
from itertools import islice


def get_node_attributes(graph: "pyspark.rdd.RDD", node_to_id: dict[str, int]) -> dict[int, tuple[list[int], list]]:
    """
    Build a driver-side lookup from node id to that node's adjacency data.

    All edges are collected from ``graph``, both endpoints are translated to ids through
    ``node_to_id``, and each edge is recorded on both of its endpoints.

    Args:
        graph: RDD-like object whose ``collect()`` returns records indexed as
            ``[0]`` = first endpoint, ``[1]`` = second endpoint, ``[2]`` = edge weight.
        node_to_id: mapping from a node (as it appears in the edges) to its integer id.

    Returns:
        A dict mapping every id in ``node_to_id`` to a pair ``(neighbor_ids, weights)`` of
        parallel lists; nodes that occur in no edge map to ``([], [])``.

    Raises:
        KeyError: if an edge endpoint is missing from ``node_to_id``.
    """
    # Seed from the mapping that was passed in rather than from a notebook-level global,
    # so the function depends only on its arguments.
    node_attributes = {node_id: ([], []) for node_id in node_to_id.values()}

    for edge in graph.collect():
        source_id = node_to_id[edge[0]]
        target_id = node_to_id[edge[1]]
        weight = edge[2]
        # Record the edge on both endpoints.
        for node_id, neighbor_id in ((source_id, target_id), (target_id, source_id)):
            neighbor_ids, weights = node_attributes[node_id]
            neighbor_ids.append(neighbor_id)
            weights.append(weight)

    return node_attributes

# Materialise the node-id -> (neighbor ids, weights) lookup on the driver.
node_id_to_attrs = get_node_attributes(graph=graph, node_to_id=node_to_id)
# Preview only the first three entries instead of dumping the whole dictionary.
list(islice(node_id_to_attrs.items(), 3))

In [None]:
# Initial clustering: every distinct node gets its own cluster label.
# The record is (node id, "cluster:<n>"), where <n> is the position assigned by zipWithIndex.
input_c = (
    graph
    .flatMap(lambda edge: (edge[0], edge[1]))
    .distinct()
    .zipWithIndex()
    .map(lambda indexed: (node_to_id[indexed[0]], f"cluster:{indexed[1]}"))
)
input_c.take(3)

In [None]:
# Prepare Input G part 1
# Emit every edge once per endpoint, so each node is keyed to all of its incident edges,
# then translate names to ids and attach the neighbor's adjacency data.
# Record shape: (node_id, ((neighbor_id, neighbor_attrs), weight)).
input_g = (
    graph.map(lambda edge: (edge[0], (edge[1], edge[2])))
    .union(graph.map(lambda edge: (edge[1], (edge[0], edge[2]))))
    .map(lambda kv: (
        node_to_id[kv[0]],
        ((node_to_id[kv[1][0]], node_id_to_attrs[node_to_id[kv[1][0]]]), kv[1][1]),
    ))
    # sortByKey's first positional parameter is `ascending`, not a key function; the default
    # (ascending order on the integer node id) is what is wanted here.
    .sortByKey()
)
input_g.take(1)

In [None]:
# Collapse the per-edge records into one record per node:
# (node_id, ([first component of each value], [second component of each value])).
# NOTE: this cell rebinds input_g from its own previous value, so re-running it
# applies the grouping to the already grouped RDD.
input_g = (
    input_g
    .groupByKey()
    .mapValues(lambda grouped: (
        [pair[0] for pair in grouped],
        [pair[1] for pair in grouped],
    ))
)
input_g.take(1)

In [None]:
# zip doesn't seem to work with repartition, so the attempt below is left disabled:
# input_g = input_g.repartition(32)
# input_c = input_c.repartition(32)
# Instead, rebuild both RDDs from driver-side lists with the same partition count and sort
# them by node id. sortByKey() is called without arguments because its first positional
# parameter is `ascending`, not a key function.
# NOTE(review): the later zip presumably relies on both RDDs ending up with identical
# partition boundaries — confirm this holds for the data set in use.
input_c = sc.parallelize(input_c.collect(), 32).sortByKey()
input_g = sc.parallelize(input_g.collect(), 32).sortByKey()

In [None]:
# Pair each node's cluster label with that node's adjacency data.
# A join on the node id is used instead of RDD.zip: zip pairs records purely by position and
# requires both RDDs to have the same number of partitions and elements per partition,
# whereas join matches on the key itself.
# Join output is (node_id, (cluster_label, adjacency)); it is reshaped to
# (cluster_label, [(node_id, adjacency)]).
first_zip = input_c.join(input_g).map(lambda kv: (kv[1][0], [(kv[0], kv[1][1])]))
first_zip.take(1)

In [None]:
# Map the vertices such that the existing cluster takes a new node...
def _expand_members(members):
    """Turn each (node_id, adjacency) member into one triple per entry of adjacency[0]."""
    triples = []
    for member in members:
        node_id = member[0]
        adjacency = member[1]
        for neighbor in adjacency[0]:
            # NOTE(review): every triple carries the whole adjacency[1] list, not the single
            # entry matching `neighbor` — confirm this is intended.
            triples.append((node_id, neighbor, adjacency[1]))
    return triples


first_agg = first_zip.mapValues(_expand_members)
first_agg.take(1)

In [None]:
# Paper definitions from page 690 of "Distributed Graph Clustering Using Modularity and Map Equation":
# deg(v) - The weighted degree of a node v, i.e. the sum of the weights of all outgoing edges (v, u) of v.
# vol(C) - The volume of a set of nodes C, i.e. the sum of the weighted degrees of all nodes in C.
# cut(v, C) - The sum of the weights of all edges (v, u) where u is in C.

# cluster 0 expected outputs:
# 
# vol(C \ v) = 0
# cut(v, C \ v) = sum([3, 3, 3, 3, 4, 4, 4, 2, 9]) + sum([1, 2])
# TODO: fill in the remaining expected cut values.

In [None]:
class Node:
    def __init__(self, cluster_id: str, id: str, neighbors: list[tuple[str, int]]):
        self.id = id
        self.cluster_id = cluster_id
        self.neighbors = [neighbor for neighbor, _ in neighbors]
        self.weights = [weight for _, weight in neighbors]
    
    def get_volume(self):
        return sum(self.weights)

    def get_degree(self):
        return len(self.neighbors)

    def get_neighbours(self):
        return self.neighbors

    def get_cut(self, other):
        return len(set(self.neighbors).intersection(set(other.neighbors)))