# GraphFrames

In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import SQLContext
from graphframes import *


In [7]:
# Start (or reuse) a Spark session named "asheesh", then wrap its SparkContext
# in a SQLContext; later cells use `sqlContext` to build and read DataFrames.
# NOTE(review): SQLContext looks deprecated in recent Spark releases in favour
# of SparkSession — confirm against the Spark version this notebook targets.
spark = (
    SparkSession.builder
    .appName("asheesh")
    .getOrCreate()
)
sqlContext = SQLContext(spark.sparkContext)

In [8]:
# Vertex DataFrame: one row per user, keyed by the unique "id" column.
v = sqlContext.createDataFrame(
    data=[
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    schema=["id", "name", "age"],
)
v.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
+---+-------+---+



In [9]:
# Edge DataFrame: one row per directed relationship, from "src" to "dst".
e = sqlContext.createDataFrame(
    data=[
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    schema=["src", "dst", "relationship"],
)
e.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
+---+---+------------+



In [10]:
# Create a GraphFrame from the vertex DataFrame `v` and the edge DataFrame `e`.
g = GraphFrame(v, e)

In [11]:
# Uncomment to list the attributes and methods available on the GraphFrame:
# dir(g)

In [12]:
# Query: get the in-degree (number of incoming edges) of each vertex.
g.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



In [13]:
# Query: how many edges in the graph carry the "follow" relationship?
g.edges.where("relationship = 'follow'").count()

2

In [14]:
# Youngest user's age: aggregate over all vertices at once, without grouping.
g.vertices.agg(F.min("age")).show()

+--------+
|min(age)|
+--------+
|      30|
+--------+



# Motif finding

Motif finding refers to searching for structural patterns in a graph.

In [15]:
# Rebind `g` to the "friends" example graph shipped in graphframes.examples;
# the cells that follow operate on this graph.
from graphframes.examples import Graphs
g = Graphs(sqlContext).friends()  # Get example graph

In [16]:
# Search for pairs of vertices with edges in both directions between them.
# In the pattern, (a) and (b) name vertices while [e] and [e2] name the edges
# a->b and b->a; each of these names becomes a column of the result.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

+----------------+--------------+----------------+--------------+
|               a|             e|               b|            e2|
+----------------+--------------+----------------+--------------+
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|
+----------------+--------------+----------------+--------------+



In [17]:
# Narrow the motif matches further with a predicate on a matched vertex's fields.
motifs.where("b.age > 30").show()

+----------------+--------------+------------+--------------+
|               a|             e|           b|            e2|
+----------------+--------------+------------+--------------+
|[c, Charlie, 30]|[c, b, follow]|[b, Bob, 36]|[b, c, follow]|
+----------------+--------------+------------+--------------+



# Subgraphs

In [18]:
# Build a subgraph in three steps:
#   1. keep only users older than 30,
#   2. keep only relationships of type "friend",
#   3. drop isolated vertices (users no longer contained in any relationship).
g1 = (
    g.filterVertices("age > 30")
    .filterEdges("relationship = 'friend'")
    .dropIsolatedVertices()
)
g1.vertices.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  b|  Bob| 36|
|  a|Alice| 34|
+---+-----+---+



# Graph Algorithms

# Breadth-first search (BFS)

In [19]:
# Breadth-first search starting at the user named "Esther" and ending at
# users whose age is below 32.
paths = g.bfs(fromExpr="name = 'Esther'", toExpr="age < 32")
paths.show()


+---------------+--------------+--------------+
|           from|            e0|            to|
+---------------+--------------+--------------+
|[e, Esther, 32]|[e, d, friend]|[d, David, 29]|
+---------------+--------------+--------------+



In [20]:
# Same search, restricted to edges that are not "friend" relationships and
# capped at a maximum path length of 3.
g.bfs(
    fromExpr="name = 'Esther'",
    toExpr="age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
).show()


                                                                                

+---------------+--------------+--------------+--------------+----------------+
|           from|            e0|            v1|            e1|              to|
+---------------+--------------+--------------+--------------+----------------+
|[e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
+---------------+--------------+--------------+--------------+----------------+



# Label Propagation Algorithm (LPA)

LPA is a standard community detection algorithm for graphs.

In [22]:
# Run label propagation with maxIter=5, then show the label assigned to each vertex.
result = g.labelPropagation(maxIter=5)
result.select("id", "label").show()

                                                                                

+---+-------------+
| id|        label|
+---+-------------+
|  b|1047972020224|
|  e| 412316860416|
|  a|1382979469312|
|  f| 670014898176|
|  d| 670014898176|
|  c|1382979469312|
+---+-------------+



# PageRank

In [None]:
# Run PageRank until convergence to tolerance "tol".
# `results` is read by the following cells to display vertex ranks and edge weights.
results = g.pageRank(resetProbability=0.15, tol=0.01)

In [None]:
# Display the PageRank score of every vertex.
# truncate=False prints each value in full; with the default settings show()
# may truncate the displayed pagerank, e.g. dropping the "E" notation.
results.vertices.select("id", "pagerank").show(truncate=False)

In [None]:
# Display the `weight` column of every edge in the PageRank result graph.
results.edges.select("src", "dst", "weight").show()

In [None]:
# Run PageRank for a fixed number of iterations (maxIter=10) rather than to a tolerance.
results2 = g.pageRank(resetProbability=0.15, maxIter=10)

In [None]:
# Run PageRank personalized for vertex "a" (passed as sourceId).
results3 = g.pageRank(resetProbability=0.15, maxIter=10, sourceId="a")

In [None]:
# Run personalized PageRank for each of the vertices "a", "b", "c" and "d" in parallel.
results4 = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["a", "b", "c", "d"], maxIter=10)

# Shortest paths

In [None]:
# Compute shortest-path distances from every vertex to the landmarks "a" and "d".
# Use a dedicated name rather than rebinding `results`: the PageRank cells read
# `results.vertices` / `results.edges`, while this call's result is used as a
# plain DataFrame, so reusing the name would break those cells on a re-run.
shortest_paths = g.shortestPaths(landmarks=["a", "d"])
shortest_paths.select("id", "distances").show()

# Triangle count

In [None]:
# Compute the triangle count and show the "count" value for each vertex id.
# Use a dedicated name rather than rebinding `results`: the PageRank cells read
# `results.vertices` / `results.edges`, while this call's result is used as a
# plain DataFrame, so reusing the name would break those cells on a re-run.
triangle_counts = g.triangleCount()
triangle_counts.select("id", "count").show()

# Saving and loading GraphFrames

In [None]:
# Save vertices and edges as Parquet to some location.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises an error when the target path already exists.
# NOTE: this replaces any data already stored at these paths.
g.vertices.write.mode("overwrite").parquet("hdfs://myLocation/vertices")
g.edges.write.mode("overwrite").parquet("hdfs://myLocation/edges")


In [None]:
# Read both Parquet datasets back in: vertices first, then edges.
sameV, sameE = (
    sqlContext.read.parquet(location)
    for location in ("hdfs://myLocation/vertices", "hdfs://myLocation/edges")
)

In [None]:
# Create an identical GraphFrame from the reloaded vertex and edge DataFrames.
sameG = GraphFrame(sameV, sameE)