# Demo of Spark graphFrames

Installation : https://towardsdatascience.com/graphframes-in-jupyter-a-practical-guide-9b3b346cebc5

Thanks to https://medium.com/towards-artificial-intelligence/relationship-analysis-inspark-using-graphx-57ffcce8773f

and https://github.com/parlad/Spark-GraphX-Relationship_Analysis/blob/master/README.md

In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName('Graph') \
    .getOrCreate()
sc = spark.sparkContext

In [2]:
#!pip install graphframes

In [3]:
!hdfs dfs -ls /demo/

Found 12 items
-rw-r--r--   1 root supergroup    2772143 2020-11-29 07:57 /demo/diamonds.csv
-rw-r--r--   1 root supergroup     585225 2020-11-29 07:57 /demo/fuel_econ.csv
drwxr-xr-x   - root supergroup          0 2020-11-29 08:08 /demo/graphframes
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/house-prize
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/lda
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/noaa
-rw-r--r--   1 root supergroup        101 2020-11-29 07:57 /demo/person.csv
-rw-r--r--   1 root supergroup      40454 2020-11-29 07:57 /demo/pokemon.csv
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/recommendation
-rw-r--r--   1 root supergroup        212 2020-11-29 07:57 /demo/relationship.csv
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/titanic
drwxr-xr-x   - root supergroup          0 2020-11-29 07:57 /demo/txt


In [4]:
from graphframes import *
personsDf = spark.read.csv('/demo/graphframes/person.csv',header=True, inferSchema=True)
#personsDf = spark.read.csv('/demo/graphframes/person.csv',header=True, inferSchema=True)
personsDf.createOrReplaceTempView("persons")
relationshipDf = spark.read.csv('/demo/graphframes/relationship.csv',header=True, inferSchema=True)
relationshipDf.createOrReplaceTempView("relationship")
graph = GraphFrame(personsDf, relationshipDf)

In [5]:
spark.sql("select * from persons").show()

+---+--------+---+---------+
| id|    name|age|     type|
+---+--------+---+---------+
|  1|  Andrew| 45|   person|
|  2|  Sierra| 43|   person|
|  3|     Bob| 12|   person|
|  4|   Emily| 10|   person|
|  5| William| 35|   person|
|  6|  Rachel| 32|   person|
|  7|    Toto| 42|   person|
|  8|    Titi| 42|   person|
|  9|  Europe|  0|continent|
| 10|Amerique|  0|continent|
| 11|  France|  0|     pays|
| 12|     USA|  0|     pays|
+---+--------+---+---------+



In [6]:
spark.sql("select * from relationship").show()

+---+---+--------+
|src|dst|relation|
+---+---+--------+
|  1|  2| Husband|
|  1|  3|  Father|
|  1|  4|  Father|
|  1|  5|  Friend|
|  1|  6|  Friend|
|  2|  1|    Wife|
|  2|  3|  Mother|
|  2|  4|  Mother|
|  2|  6|  Friend|
|  3|  1|     Son|
|  3|  2|     Son|
|  4|  1|Daughter|
|  4|  2|Daughter|
|  5|  1|  Friend|
|  6|  1|  Friend|
|  6|  2|  Friend|
|  7|  8|  Friend|
|  8|  7|  Friend|
|  7| 12|     nee|
|  7| 11|  habite|
+---+---+--------+
only showing top 20 rows



In [7]:
#graph.inDegrees().show();
graph.degrees.show();

+---+------+
| id|degree|
+---+------+
| 12|     2|
|  1|    10|
|  6|     4|
|  3|     4|
|  5|     2|
|  9|     1|
|  4|     4|
|  8|     2|
|  7|     4|
| 10|     1|
| 11|     2|
|  2|     8|
+---+------+



In [8]:
#Here you are going to find all the edges connected to Andrew.
graph.degrees.filter("id = 1").show()

+---+------+
| id|degree|
+---+------+
|  1|    10|
+---+------+



# Graph request

https://graphframes.github.io/graphframes/docs/_site/user-guide.html#motif-finding

In [9]:
graph.find("(a)-[e]->(b);(a)-[e2]->(b)").show()

+--------------------+----------------+--------------------+----------------+
|                   a|               e|                   b|              e2|
+--------------------+----------------+--------------------+----------------+
|[1, Andrew, 45, p...| [1, 2, Husband]|[2, Sierra, 43, p...| [1, 2, Husband]|
|[1, Andrew, 45, p...|  [1, 3, Father]|[3, Bob, 12, person]|  [1, 3, Father]|
|[1, Andrew, 45, p...|  [1, 4, Father]|[4, Emily, 10, pe...|  [1, 4, Father]|
|[1, Andrew, 45, p...|  [1, 5, Friend]|[5, William, 35, ...|  [1, 5, Friend]|
|[1, Andrew, 45, p...|  [1, 6, Friend]|[6, Rachel, 32, p...|  [1, 6, Friend]|
|[2, Sierra, 43, p...|    [2, 1, Wife]|[1, Andrew, 45, p...|    [2, 1, Wife]|
|[2, Sierra, 43, p...|  [2, 3, Mother]|[3, Bob, 12, person]|  [2, 3, Mother]|
|[2, Sierra, 43, p...|  [2, 4, Mother]|[4, Emily, 10, pe...|  [2, 4, Mother]|
|[2, Sierra, 43, p...|  [2, 6, Friend]|[6, Rachel, 32, p...|  [2, 6, Friend]|
|[3, Bob, 12, person]|     [3, 1, Son]|[1, Andrew, 45, p...|    

In [10]:
graph.find("(a)-[e]->(b);(b)-[e2]->(c);(c)-[e3]->(a)").filter("e2.relation == e3.relation").show()

+--------------------+---------------+--------------------+--------------+--------------------+--------------+
|                   a|              e|                   b|            e2|                   c|            e3|
+--------------------+---------------+--------------------+--------------+--------------------+--------------+
|[1, Andrew, 45, p...|[1, 2, Husband]|[2, Sierra, 43, p...|[2, 6, Friend]|[6, Rachel, 32, p...|[6, 1, Friend]|
|[2, Sierra, 43, p...|   [2, 1, Wife]|[1, Andrew, 45, p...|[1, 6, Friend]|[6, Rachel, 32, p...|[6, 2, Friend]|
+--------------------+---------------+--------------------+--------------+--------------------+--------------+



## Connected components algorithm 

The connected components algorithm finds isolated clusters or isolated sub-graphs. These clusters are sets of connected vertices in a graph where each vertex is reachable from any other vertex in the same set.

In [11]:
spark.sparkContext.setCheckpointDir('/tmp')
connections = graph.connectedComponents()
connections.select("id", "component").orderBy("component").show()

+---+---------+
| id|component|
+---+---------+
|  1|        1|
|  3|        1|
|  4|        1|
|  2|        1|
|  5|        1|
|  6|        1|
|  7|        7|
|  8|        7|
| 12|        7|
|  9|        7|
| 10|        7|
| 11|        7|
+---+---------+



# Triangle count

In [12]:
personsTriangleCountDf = graph.triangleCount();
personsTriangleCountDf.show()

+-----+---+--------+---+---------+
|count| id|    name|age|     type|
+-----+---+--------+---+---------+
|    0| 12|     USA|  0|     pays|
|    3|  1|  Andrew| 45|   person|
|    1|  6|  Rachel| 32|   person|
|    1|  3|     Bob| 12|   person|
|    0|  5| William| 35|   person|
|    0|  9|  Europe|  0|continent|
|    1|  4|   Emily| 10|   person|
|    0|  8|    Titi| 42|   person|
|    0|  7|    Toto| 42|   person|
|    0| 10|Amerique|  0|continent|
|    0| 11|  France|  0|     pays|
|    3|  2|  Sierra| 43|   person|
+-----+---+--------+---+---------+



In [13]:
personsTriangleCountDf.createOrReplaceTempView("personsTriangleCount")
maxCountDf = spark.sql("select max(count) as max_count from personsTriangleCount")
maxCountDf.createOrReplaceTempView("personsMaxTriangleCount")
spark.sql("select * from personsTriangleCount P JOIN (select * from personsMaxTriangleCount) M ON (M.max_count = P.count) ").show()

+-----+---+------+---+------+---------+
|count| id|  name|age|  type|max_count|
+-----+---+------+---+------+---------+
|    3|  1|Andrew| 45|person|        3|
|    3|  2|Sierra| 43|person|        3|
+-----+---+------+---+------+---------+



## What is the Shortest Path ?

[Breadth-First Algorithm](https://en.wikipedia.org/wiki/Breadth-first_search) : bfs

In [14]:
graph.bfs(fromExpr="Name='Bob'",toExpr="Name='William'").show()

+--------------------+-----------+--------------------+--------------+--------------------+
|                from|         e0|                  v1|            e1|                  to|
+--------------------+-----------+--------------------+--------------+--------------------+
|[3, Bob, 12, person]|[3, 1, Son]|[1, Andrew, 45, p...|[1, 5, Friend]|[5, William, 35, ...|
+--------------------+-----------+--------------------+--------------+--------------------+



In [15]:
graph.bfs(
   fromExpr = "name = 'Bob'",
   toExpr = "name = 'William'",
   ).show()

+--------------------+-----------+--------------------+--------------+--------------------+
|                from|         e0|                  v1|            e1|                  to|
+--------------------+-----------+--------------------+--------------+--------------------+
|[3, Bob, 12, person]|[3, 1, Son]|[1, Andrew, 45, p...|[1, 5, Friend]|[5, William, 35, ...|
+--------------------+-----------+--------------------+--------------+--------------------+



# Famous Google PageRank

In [16]:
pageRank = graph.pageRank(resetProbability=0.20, maxIter=10)
pageRank.vertices.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- pagerank: double (nullable = true)



In [17]:
pageRank.edges.printSchema()

root
 |-- src: integer (nullable = true)
 |-- dst: integer (nullable = true)
 |-- relation: string (nullable = true)
 |-- weight: double (nullable = true)



In [18]:
pageRank.edges.orderBy("weight", ascending=False).show()

+---+---+-----------+------------------+
|src|dst|   relation|            weight|
+---+---+-----------+------------------+
| 11|  9|est_contenu|               1.0|
| 12| 10|est_contenu|               1.0|
|  5|  1|     Friend|               1.0|
|  8|  7|     Friend|               1.0|
|  3|  1|        Son|               0.5|
|  6|  2|     Friend|               0.5|
|  3|  2|        Son|               0.5|
|  4|  1|   Daughter|               0.5|
|  6|  1|     Friend|               0.5|
|  4|  2|   Daughter|               0.5|
|  7| 12|        nee|0.3333333333333333|
|  7|  8|     Friend|0.3333333333333333|
|  7| 11|     habite|0.3333333333333333|
|  2|  3|     Mother|              0.25|
|  2|  1|       Wife|              0.25|
|  2|  4|     Mother|              0.25|
|  2|  6|     Friend|              0.25|
|  1|  2|    Husband|               0.2|
|  1|  3|     Father|               0.2|
|  1|  4|     Father|               0.2|
+---+---+-----------+------------------+
only showing top

As you can see from the table William’s relationship with Andrew gets the maximum weight since it is unique. No one other than Andrew is a friend to William.

# Create Datasets

In [19]:
%%writefile person.csv
id,name,age,type
1,Andrew,45,person
2,Sierra,43,person
3,Bob,12,person
4,Emily,10,person
5,William,35,person
6,Rachel,32,person
7,Toto,42,person
8,Titi,42,person
9,Europe,0,continent
10,Amerique,0,continent
11,France,0,pays
12,USA,0,pays

Overwriting person.csv


In [20]:
%%writefile relationship.csv
src,dst,relation
1,2,Husband
1,3,Father
1,4,Father
1,5,Friend
1,6,Friend
2,1,Wife
2,3,Mother
2,4,Mother
2,6,Friend
3,1,Son
3,2,Son
4,1,Daughter
4,2,Daughter
5,1,Friend
6,1,Friend
6,2,Friend
7,8,Friend
8,7,Friend
7,12,nee
7,11,habite
11,9,est_contenu
12,10,est_contenu

Overwriting relationship.csv


In [21]:
!hdfs dfs -mkdir -p /demo/graphframes
!hdfs dfs -copyFromLocal -f person.csv /demo/graphframes/person.csv
!hdfs dfs -copyFromLocal -f relationship.csv /demo/graphframes/relationship.csv

In [22]:
!hdfs dfs -ls /demo/graphframes

Found 2 items
-rw-r--r--   1 root supergroup        238 2020-11-29 08:12 /demo/graphframes/person.csv
-rw-r--r--   1 root supergroup        268 2020-11-29 08:12 /demo/graphframes/relationship.csv


# WIP : Plot the graph

Attempt to plot the graph, but did not work :-(

In [23]:
#!pip install python-igraph
#!pip install xcffib
#!pip install cairocffi

In [None]:
import igraph as ig
ig.plot(ig.Graph.Full(3), vertex_label=['a', 'b', 'c'], bbox=(100, 100))

In [None]:
import cairocffi
from igraph import *

In [None]:
width, height = 100, 50


surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)

cr = cairo.Context(surface)
cr.set_source_rgb(0.7, 0.9, 0.0)
cr.rotate(-45)
cr.rectangle(0, 0, 40, 40)
cr.fill()

# The context will output in the cell
cr

In [None]:
# https://igraph.org/python/doc/tutorial/tutorial.html
g = Graph()
g.add_vertices(3)
g.add_edges([(0,1), (1,2)])
print(g)

In [None]:
g._repr_svg_()
#layout = g.layout("kk")
#p = plot(g, layout = layout)

In [None]:
igraph.plot(gr).show()

In [None]:

ig = Graph.TupleList(graph.edges.collect(), directed=True)
plot(ig)