In [48]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import explode, collect_list, size, col, row_number, sort_array, udf, count

In [2]:
# Get (or reuse) a Hive-enabled Spark session that runs with the "local" master.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Location of the Parquet dataset holding the graph sample that is loaded below.
graphPath = "/data/graphDFSample"

In [33]:
# Invert the graph: emit one row per (user, friend) edge, then regroup by
# friend so that each row lists every user who has that friend, together with
# the length of that list.
reversedGraph = (
    sparkSession.read.parquet(graphPath)
    .withColumn("friend", explode(col("friends")))
    .groupBy("friend")
    .agg(collect_list(col("user")).alias("users"))
    .withColumn("users_size", size(col("users")))
)

reversedGraph.show(3)

+------+--------------------+----------+
|friend|               users|users_size|
+------+--------------------+----------+
|   148|[65051219, 146311...|         4|
|  5518|          [58573511]|         1|
|  9900|          [36844066]|         1|
+------+--------------------+----------+
only showing top 3 rows



In [34]:
# Inspect the column types of the inverted graph.
reversedGraph.printSchema()

root
 |-- friend: integer (nullable = true)
 |-- users: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- users_size: integer (nullable = false)



In [35]:
# Sort each user list in ascending order (sort_array's default) and expose it
# as `users_sorted`; `friend` and `users_size` are carried over unchanged.
# NOTE: this rebinds `reversedGraph` to a frame that no longer has a `users`
# column, so the cell cannot be re-run without first re-running the cell that
# builds `reversedGraph`.
reversedGraph = reversedGraph.select(
    col("friend"),
    sort_array(col("users")).alias("users_sorted"),
    col("users_size"),
)
reversedGraph.show(3)

+------+--------------------+----------+
|friend|        users_sorted|users_size|
+------+--------------------+----------+
|   148|[3195315, 1463110...|         4|
|  5518|          [58573511]|         1|
|  9900|          [36844066]|         1|
+------+--------------------+----------+
only showing top 3 rows



In [36]:
# Schema check after sorting: `users` has been replaced by `users_sorted`.
reversedGraph.printSchema()

root
 |-- friend: integer (nullable = true)
 |-- users_sorted: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- users_size: integer (nullable = false)



In [37]:
# Keep only friends that appear in the lists of more than one user; a list with
# a single user cannot produce any pair.
# `users_size` is an integer column, so compare it against the integer literal
# 1 rather than the string '1', which only worked through Spark's implicit cast.
reversedGraph = reversedGraph.where(col("users_size") > 1)
reversedGraph.show(3)

+------+--------------------+----------+
|friend|        users_sorted|users_size|
+------+--------------------+----------+
|   148|[3195315, 1463110...|         4|
| 36538|[5506394, 6170161...|        32|
| 41751|[41811068, 60873111]|         2|
+------+--------------------+----------+
only showing top 3 rows



In [38]:
# Schema check after filtering; the columns are the same as before the filter.
reversedGraph.printSchema()

root
 |-- friend: integer (nullable = true)
 |-- users_sorted: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- users_size: integer (nullable = false)



In [39]:
def serializer(input_array):
    """Return every unordered pair of elements from ``input_array``.

    Pairs are emitted as 2-tuples ``(input_array[i], input_array[j])`` with
    ``i < j``, ordered by ``i`` and then by ``j``. For an input sorted in
    ascending order the first element of each pair is therefore never larger
    than the second.

    Args:
        input_array: Sequence of values to pair up, or ``None``.

    Returns:
        A list of 2-tuples; empty when the input has fewer than two elements.
        ``None`` is returned for ``None`` input, so a null array value yields a
        null result instead of raising ``TypeError`` inside the UDF.
    """
    # Local import keeps this cell self-contained: the notebook's import cell
    # does not bring in itertools.
    from itertools import combinations

    if input_array is None:
        return None
    # combinations() yields index-ordered pairs, matching the nested loop
    # (outer index i, inner index j > i) it replaces.
    return list(combinations(input_array, 2))

In [40]:
# Spark UDF around `serializer`: takes an array column and returns an array of
# (int, int) structs, one struct per pair.
# The struct fields are named "1" and "2"; later cells refer to these names
# (count("2"), partitionBy("1")), so they must not be renamed here.
serializer_udf = udf(
    serializer,  # passed directly: the lambda wrapper only forwarded its argument
    ArrayType(
        StructType([  # list of fields, as the StructType API documents
            StructField("1", IntegerType(), True),
            StructField("2", IntegerType(), True),
        ])
    ),
)

In [41]:
# Replace every row by the list of user pairs built from its sorted user list.
# Only the resulting `users` column is kept; `friend` and `users_size` are
# dropped by this select.
reversedGraph = reversedGraph.select(
    serializer_udf(col("users_sorted")).alias("users")
)
reversedGraph.show(3)

+--------------------+
|               users|
+--------------------+
|[[3195315,1463110...|
|[[5506394,6170161...|
|[[41811068,608731...|
+--------------------+
only showing top 3 rows



In [42]:
# Schema check: `users` is now an array of structs with fields "1" and "2".
reversedGraph.printSchema()

root
 |-- users: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- 1: integer (nullable = true)
 |    |    |-- 2: integer (nullable = true)



In [43]:
# Fetch a single row to inspect the generated pairs in full (show() truncates them).
reversedGraph.rdd.take(1)

[Row(users=[Row(1=3195315, 2=14631101), Row(1=3195315, 2=14957568), Row(1=3195315, 2=65051219), Row(1=14631101, 2=14957568), Row(1=14631101, 2=65051219), Row(1=14957568, 2=65051219)])]

In [58]:
# Number of rows, i.e. one list of pairs per friend that passed the size filter.
reversedGraph.count()

3018835

In [45]:
# Flatten the per-friend pair lists into one row per pair. The exploded column
# is left with explode's default name, `col`.
reversedGraph_2 = reversedGraph.select(explode(col("users")))
reversedGraph_2.show(3)

+------------------+
|               col|
+------------------+
|[3195315,14631101]|
|[3195315,14957568]|
|[3195315,65051219]|
+------------------+
only showing top 3 rows



In [53]:
# Schema check: each row now holds one pair as a struct column named `col`.
reversedGraph_2.printSchema()

root
 |-- col: struct (nullable = true)
 |    |-- 1: integer (nullable = true)
 |    |-- 2: integer (nullable = true)



In [59]:
# Total number of generated pairs (one row per pair after the explode).
reversedGraph_2.count()

178746446

In [61]:
# Expand the fields of the `col` struct into top-level columns "1" and "2".
reversedGraph_3 = reversedGraph_2.select('col.*')

In [62]:
# Preview the flattened pairs.
reversedGraph_3.show(3)

+-------+--------+
|      1|       2|
+-------+--------+
|3195315|14631101|
|3195315|14957568|
|3195315|65051219|
+-------+--------+
only showing top 3 rows



In [None]:
# For every row, count the non-null values of column "2" among all rows that
# share the same value in column "1".
# NOTE(review): only `cnt` is selected, so the output does not show which
# value of "1" each count belongs to — confirm this is intended.
same_first_user = Window.partitionBy("1")
reversedGraph_3.select(count("2").over(same_first_user).alias("cnt")).show(3)