# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list
from pyspark.sql.types import StructType, StructField, IntegerType

# Obtain the Spark session used for every DataFrame operation in this script.
spark = SparkSession.builder.appName("GraphDF").getOrCreate()

import snap

# Build a random undirected graph, then drop nodes that have no edges.
G = snap.GenRndGnm(snap.PUNGraph, 10000000, 7000000)
snap.DelDegKNodes(G, -1, 0)

# Edge list as a two-column DataFrame; both endpoint columns are integers.
schema = StructType(
    [StructField(column, IntegerType()) for column in ("src", "dst")]
)
edges_df = spark.createDataFrame(
    [(edge.GetSrcNId(), edge.GetDstNId()) for edge in G.Edges()],
    schema=schema,
)

# Uncomment to inspect the first rows of the edge list.
# edges_df.show()

# Function for the reducer
def reducer(key, values):
    """Apply the CCF-iterate reduce step to one key and its grouped values.

    Args:
        key: The id the values were grouped under.
        values: Iterable of ids paired with ``key``; it is consumed once.

    Returns:
        A ``(pairs, new_pair_count)`` tuple. ``pairs`` is empty when no value
        is smaller than ``key``. Otherwise it starts with ``(key, smallest)``
        and continues with ``(value, smallest)`` for every value that differs
        from ``smallest``, in input order; ``new_pair_count`` is the number
        of those additional pairs.
    """
    neighbours = list(values)
    # Seed the minimum search with the key itself, as the smallest id wins.
    smallest = min([key, *neighbours])
    if not smallest < key:
        # The key is already the smallest id it knows about: nothing to emit.
        return [], 0
    propagated = [(node, smallest) for node in neighbours if smallest != node]
    return [(key, smallest), *propagated], len(propagated)


# Timing and loop state. count_new_pairs starts above zero so that the first
# iteration always runs.
import time

start_time = time.time()
count_new_pairs = 1
iterations = 0

# Reuse the incoming column names/types for every rebuilt edge DataFrame so
# Spark is handed the schema instead of inferring it from the data.
edge_schema = edges_df.schema
# Reducer RDD persisted by the previous iteration, released once superseded.
previous_reduced = None

# Repeat CCF iterate + CCF dedup until an iteration reports no new pair.
while count_new_pairs > 0:
    # CCF iterate
    # Map: emit every edge in both directions as (key, value).
    result_map = edges_df.select(
        col("src").alias("key"), col("dst").alias("value")
    ).union(
        edges_df.select(col("dst").alias("key"), col("src").alias("value"))
    )
    # Shuffle and Sort: gather all values that share a key.
    result_shufflesort = result_map.groupBy("key").agg(
        collect_list("value").alias("values")
    )
    # Reduce: one (pairs_to_emit, new_pair_count) tuple per key. This RDD is
    # consumed twice below (emitted pairs and the counter), so persist it
    # rather than evaluating the reduce stage once per consumer.
    result_reducer = result_shufflesort.rdd.map(
        lambda x: reducer(x.key, x.values)
    ).persist()
    result_reducer2 = spark.createDataFrame(
        result_reducer.flatMap(lambda x: x[0]), schema=edge_schema
    )
    # Updating the counting. sum() yields 0 for an empty RDD, whereas
    # reduce() raises ValueError when there is nothing to combine.
    count_new_pairs = result_reducer.map(lambda x: x[1]).sum()
    # The action above has evaluated this iteration's reducer output, so the
    # previous iteration's persisted copy can be released.
    if previous_reduced is not None:
        previous_reduced.unpersist()
    previous_reduced = result_reducer
    # CCF dedup
    edges_df = result_reducer2.dropDuplicates()
    iterations += 1

# NOTE: sort() is a lazy transformation and no action is run on sorted_df
# here, so the sort itself is not part of the measured time.
sorted_df = edges_df.sort(col("dst"))
end_time = time.time()

elapsed_time = end_time - start_time

# sorted_df.show()
print("Iterations:", iterations)
print("Elapsed time: ", elapsed_time)

# stop the SparkSession object
spark.stop()

