In [25]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import explode, collect_list, size, col, row_number, sort_array, udf

In [26]:
# Create (or reuse, via getOrCreate) a Spark session that runs against the
# "local" master with Hive support enabled.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [27]:
# Location of the parquet dataset that the next cell reads the graph from.
graphPath = "/data/graphDFSample"

In [28]:
# Invert the graph: emit one row per element of each `friends` array, then
# group by that friend and gather every `user` that listed it, together with
# the number of such users.
# NOTE(review): assumes the parquet rows carry a `user` column and an array
# column `friends` -- confirm against the dataset schema.
reversedGraph = (
    sparkSession.read.parquet(graphPath)
    .withColumn("friend", explode(col("friends")))
    .groupBy(col("friend"))
    .agg(collect_list(col("user")).alias("users"))
    .withColumn("users_size", size(col("users")))
)

reversedGraph.show(3)

+------+--------------------+----------+
|friend|               users|users_size|
+------+--------------------+----------+
|   148|[65051219, 146311...|         4|
|  5518|          [58573511]|         1|
|  9900|          [36844066]|         1|
+------+--------------------+----------+
only showing top 3 rows



In [29]:
# Replace the collected `users` array with a sorted copy (sort_array sorts
# ascending by default) exposed as `users_sorted`; `friend` and `users_size`
# are carried through unchanged.
reversedGraph = reversedGraph.select(
    col("friend"),
    sort_array(col("users")).alias("users_sorted"),
    col("users_size"),
)
reversedGraph.show(3)

+------+--------------------+----------+
|friend|        users_sorted|users_size|
+------+--------------------+----------+
|   148|[3195315, 1463110...|         4|
|  5518|          [58573511]|         1|
|  9900|          [36844066]|         1|
+------+--------------------+----------+
only showing top 3 rows



In [30]:
# Keep only friends that were collected for at least two users: a single-user
# list cannot produce any pair in the later pairing step.
# `users_size` is an integer column (it comes from size()), so compare it with
# an integer literal instead of relying on implicit string-to-number coercion.
reversedGraph = reversedGraph.where(col("users_size") > 1)
reversedGraph.show(3)

+------+--------------------+----------+
|friend|        users_sorted|users_size|
+------+--------------------+----------+
|   148|[3195315, 1463110...|         4|
| 36538|[5506394, 6170161...|        32|
| 41751|[41811068, 60873111]|         2|
+------+--------------------+----------+
only showing top 3 rows



In [39]:
def serializer(input_array):
    """Return every pair of elements of ``input_array`` taken in positional order.

    For an input ``[a, b, c]`` the result is ``[(a, b), (a, c), (b, c)]``:
    each element is paired with every element that comes after it, so the
    output order follows the input order.

    Args:
        input_array: An iterable of values (a list when called through the
            Spark UDF). ``None`` is accepted and treated as empty so that a
            null array does not raise inside a UDF.

    Returns:
        A list of 2-tuples. Empty when the input is ``None`` or has fewer
        than two elements.
    """
    # Imported locally so the function stays self-contained when it is
    # shipped to Spark executors as a UDF.
    from itertools import combinations

    if input_array is None:
        return []
    # combinations() yields tuples in lexicographic order of input positions,
    # which matches the nested-index loop this replaces.
    return list(combinations(input_array, 2))

In [47]:
# Spark UDF wrapper around `serializer`. The declared return type is an array
# of structs with two non-nullable integer fields named "1" and "2", one
# struct per pair that `serializer` returns.
# `serializer` is passed directly (no pass-through lambda needed) and the
# struct fields are given as a list, which is what StructType documents.
# NOTE(review): IntegerType is 32-bit -- confirm that user ids always fit.
serializer_udf = udf(
    serializer,
    ArrayType(
        StructType([
            StructField("1", IntegerType(), False),
            StructField("2", IntegerType(), False),
        ])
    ),
)

In [50]:
# Expand each sorted user list into its list of user pairs via the UDF.
# The result keeps the `users_sorted` column name, so after this cell the
# column holds pairs rather than plain ids; `friend` and `users_size` are
# carried through unchanged.
reversedGraph = reversedGraph.select(
    col("friend"),
    serializer_udf(col("users_sorted")).alias("users_sorted"),
    col("users_size"),
)
reversedGraph.show(3)

+------+--------------------+----------+
|friend|        users_sorted|users_size|
+------+--------------------+----------+
|   148|[[3195315,1463110...|         4|
| 36538|[[5506394,6170161...|        32|
| 41751|[[41811068,608731...|         2|
+------+--------------------+----------+
only showing top 3 rows

