# Big Data Assignment 5: Unique Triangles in GraphFrames

You may download the file from: https://github.com/mahmoudparsian/big-data-mapreduce-course/blob/master/data/edges.txt

In [1]:
# Display the active SparkSession. `spark` is not created in this notebook;
# presumably it is provided by the pyspark launcher -- TODO confirm.
spark

In [2]:
# Report the version string of the running Spark session.
spark.version

'3.3.2'

In [3]:
# `pip install graphframes` does not seem to work on its own.
# Instead, launch PySpark from a terminal with the GraphFrames package attached:
#   $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12

In [4]:
# Make sure this runs: it confirms the GraphFrames package is available in this session.
# Import only the name the notebook uses rather than a wildcard, so it is clear
# where `GraphFrame` comes from and the namespace is not polluted.
from graphframes import GraphFrame

In [5]:
# When this is run as a script from a terminal instead of a notebook,
# the terminal input is:
#   --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12   AidanOlanderAssign5.py   edges.txt

In [6]:
# Script variant (disabled in the notebook): take the input path from the command line.
# input_path = sys.argv[1]
# print("input_path: {}".format(input_path))

In [7]:
# Location of the edge-list file that the rest of the notebook reads.
input_path = "edges.txt"

Create first DataFrame from edges.txt

In [8]:
# Read the edge list as a header-less CSV and let Spark infer the column types.
csv_reader = spark.read.format("csv").options(header="false", inferSchema="true")
df_start = csv_reader.load(input_path)

In [9]:
# Preview the raw edge list (first 20 rows, long values truncated -- the show() defaults).
df_start.show(n=20, truncate=True)

+---+---+------+
|_c0|_c1|   _c2|
+---+---+------+
| 10| 20|friend|
| 20| 30|follow|
| 30| 10|friend|
| 10| 50|follow|
| 50| 70|friend|
| 70| 80|follow|
| 80| 50|friend|
| 50|  1|friend|
|  1|  2|follow|
|  2|  3|friend|
|  3|  1|follow|
|  4|  1|follow|
|  4|  5|follow|
|  5|  6|friend|
|  6|  4|friend|
|  7|  4|follow|
|  8|  4|follow|
|  9| 10|friend|
| 10| 11|friend|
| 11|  9|follow|
+---+---+------+
only showing top 20 rows



Register the DataFrame as a temporary view so it can be queried with SQL

In [10]:
# Expose the raw edge DataFrame to Spark SQL under the name "input_table";
# re-running this cell simply replaces the existing view.
df_start.createOrReplaceTempView("input_table")

In [11]:
# Give the auto-generated CSV columns the names used for edges:
# source vertex, destination vertex, and the relation label.
column_names = {"_c0": "src", "_c1": "dst", "_c2": "relation"}
start_edges = df_start
for old_name, new_name in column_names.items():
    start_edges = start_edges.withColumnRenamed(old_name, new_name)

Make the edges DataFrame undirected by adding a reversed copy of every edge

In [12]:
# Forward direction: every edge exactly as it appears in the input.
edges1 = start_edges.selectExpr("src", "dst")
# Reverse direction: swap the endpoints and rename the columns explicitly, so the
# union below lines up by name as well as by position.
edges2 = start_edges.selectExpr("dst AS src", "src AS dst")
# union() keeps duplicate rows. If the input lists an edge twice, or already lists
# it in both directions, the same (src, dst) pair would appear more than once and
# the motif search would report the same triangle repeatedly. distinct() removes
# those repeats; it is a no-op when the input has none.
edges = edges1.union(edges2).distinct()

In [13]:
# Number of edges as listed in the input file (one row per directed edge).
start_edges.count()

32

In [14]:
# Number of rows after adding the reversed copies of the edges.
edges.count()

64

Create the vertices DF by getting distinct ids from UNION of src and dst columns

In [15]:
# SQL that collects every vertex id appearing at either end of an edge.
# The literal is written as adjacent string pieces; the resulting SQL text is
# character-for-character the same as a single-line query.
vert_query = (
    "SELECT DISTINCT * FROM ("
    "SELECT _c0 FROM input_table "
    "UNION "
    "SELECT _c1 FROM input_table) as tmp"
)

In [16]:
# Run the vertex query against the "input_table" view to get one row per vertex id.
verts = spark.sql(vert_query)

In [17]:
# The query result still carries the raw column name "_c0"; rename it to "id".
# Re-running this cell is harmless: once "_c0" is gone the rename does nothing.
verts = verts.withColumnRenamed("_c0", "id")

In [18]:
# Number of distinct vertices found across both edge endpoints.
verts.count()

22

Make sure both DFs are in the correct format for GraphFrame:
 - Vertices with "id" column
 - Edges with "src" and "dst" columns

In [19]:
# Print the vertex schema first, then the edge schema, to confirm the column names.
for frame in (verts, edges):
    frame.printSchema()

root
 |-- id: integer (nullable = true)

root
 |-- src: integer (nullable = true)
 |-- dst: integer (nullable = true)



Create the GraphFrame using parameter DFs of vertices and edges

In [20]:
# Build the graph: `v` is the vertex DataFrame, `e` is the edge DataFrame.
triGraphs = GraphFrame(v=verts, e=edges)



Create pattern for motif finding, in this case a triangle, and search GraphFrame for it

In [21]:
# Motif describing a directed 3-cycle: a -> b, b -> c, and c -> a.
# The empty brackets leave the edges unnamed, so only the vertices a, b, c are bound.
find_pattern = "(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)"

In [22]:
# Search the graph for every match of the triangle motif; the result has columns a, b, c.
triangles = triGraphs.find(find_pattern)



The result includes every triangle several times (once per ordering of its vertices), so use a filter to keep only one ordering of each triangle

In [23]:
# Keep only the ordering with a > b > c. Of all the orderings in which the same
# three vertices are reported, exactly one is strictly descending, so each
# triangle survives once.
a_after_b = triangles["a"] > triangles["b"]
b_after_c = triangles["b"] > triangles["c"]
unique_triangles = triangles.filter(a_after_b & b_after_c)

Lastly, format each row so that it also ends with the starting vertex (a repeated at the end), closing the loop as requested in the assignment format

In [24]:
# Expose the de-duplicated triangles to Spark SQL under the name "end_table".
unique_triangles.createOrReplaceTempView("end_table")

In [25]:
# Repeat `a` as a fourth column so each row reads a -> b -> c -> a.
# NOTE(review): the result has two columns both named `a`; confirm nothing
# downstream needs to select them by name.
final_triangles = spark.sql("SELECT a, b, c, a FROM end_table")

In [26]:
# Display the unique triangles in the closed-loop layout.
final_triangles.show()

+----+----+----+----+
|   a|   b|   c|   a|
+----+----+----+----+
|{13}| {3}| {1}|{13}|
|{13}| {4}| {1}|{13}|
| {6}| {5}| {4}| {6}|
| {3}| {2}| {1}| {3}|
|{80}|{70}|{50}|{80}|
|{11}|{10}| {9}|{11}|
|{30}|{20}|{10}|{30}|
+----+----+----+----+



Check the math to be sure!

In [27]:
# Sanity check: with mirrored (undirected) edges the motif search reports each
# triangle once per ordering of its three vertices, i.e. 3! = 6 times.
# Every count() launches a Spark job, so compute each count once and reuse it.
total_count = triangles.count()
unique_count = final_triangles.count()

print("6 possible verstions of each triangle \n\
so 6 * final_triangles.count() should equal triangles.count()")
print("Total triangles: {}".format(total_count))
print("Unique triangles: {}".format(unique_count))
print("6 times {} = {}".format(unique_count, unique_count*6))

# Enforce the check instead of relying on a visual comparison of the printed numbers.
assert total_count == unique_count * 6, (
    "expected triangles.count() == 6 * final_triangles.count(), got {} and {}".format(
        total_count, unique_count
    )
)

6 possible verstions of each triangle 
so 6 * final_triangles.count() should equal triangles.count()
Total triangles: 42
Unique triangles: 7
6 times 7 = 42
