In [6]:
import sys

from graphframes import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [None]:
# Reuse the active SparkContext if one exists; otherwise create one.
sc = SparkContext.getOrCreate()
# SQLContext lives in pyspark.sql and is imported explicitly in the imports
# cell, so this cell no longer depends on a shell start-up script having
# injected the name. sqlContext is used below to build the DataFrames.
sqlContext = SQLContext(sc)


In [None]:
# Vertex DataFrame with columns (id, name, age); it becomes the vertex set
# of the GraphFrame constructed further down.
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
vertices = sqlContext.createDataFrame(vertex_rows, ["id", "name", "age"])


In [None]:
# Edge DataFrame with columns (src, dst, relationship); src/dst hold the
# vertex ids defined in the vertex DataFrame.
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
edges = sqlContext.createDataFrame(edge_rows, ["src", "dst", "relationship"])


In [None]:
# Assemble the graph from the vertex and edge DataFrames.
g = GraphFrame(vertices, edges)
# Call form of print: identical output on Python 2 for a single argument,
# and valid syntax on Python 3 (the bare `print g` statement is not).
print(g)


In [None]:
# Print the graph's vertex DataFrame.
g.vertices.show()

In [None]:
# Print the graph's edge DataFrame.
g.edges.show()

In [None]:
# Print the graph's inDegrees DataFrame (per the GraphFrames API: the
# in-degree of each vertex).
g.inDegrees.show()

In [None]:
# Print the graph's outDegrees DataFrame (per the GraphFrames API: the
# out-degree of each vertex).
g.outDegrees.show()

In [None]:
# Print the graph's degrees DataFrame (per the GraphFrames API: the total
# degree of each vertex).
g.degrees.show()

In [None]:
# Queries can run directly on the vertex DataFrame: an empty groupBy()
# treats the whole table as one group, so min("age") yields a single row.
all_vertices_grouped = g.vertices.groupBy()
youngest = all_vertices_grouped.min("age")
youngest.show()

In [None]:
# Queries can run directly on the edge DataFrame: count the edges whose
# relationship column equals 'follow'.
numFollows = g.edges.filter("relationship = 'follow'").count()
# str.format + print(...) produces the same text as the old two-argument
# print statement on Python 2 and is valid syntax on Python 3.
print("The number of follow edges is {}".format(numFollows))

In [None]:
# Complex relationships via motif finding: match pairs of vertices that
# have an edge in each direction between them.
bidirectional_pattern = "(a)-[e]->(b); (b)-[e2]->(a)"
motifs = g.find(bidirectional_pattern)
motifs.show()

In [None]:
# The motif result is an ordinary DataFrame, so it can be filtered further:
# keep matches where either endpoint's age exceeds 30.
age_condition = "b.age > 30 or a.age > 30"
filtered = motifs.filter(age_condition)
filtered.show()

In [None]:
# Constructing sub-graphs, step 1: match single edges, keep only 'follow'
# edges, then keep those where vertex a's age is below vertex b's.
follow_matches = g.find("(a)-[e]->(b)").filter("e.relationship = 'follow'")
paths = follow_matches.filter("a.age < b.age")

In [None]:
# Step 2: project the matched edge struct back to flat edge columns.
selected_edge_columns = ["e.src", "e.dst", "e.relationship"]
e2 = paths.select(*selected_edge_columns)

In [None]:
# Step 3: build the sub-graph from g's vertex DataFrame and the edge
# DataFrame e2.
g2 = GraphFrame(g.vertices, e2)

In [None]:
# Print the sub-graph's vertex DataFrame. g2 was constructed from g.vertices,
# so this presumably lists the same vertices as g; the sub-graph differs in
# its edges (g2.edges).
g2.vertices.show()

In [None]:
# Graph algorithms: breadth-first search from the vertex named 'Esther'
# to vertices with age < 32.
# NOTE: this rebinds `paths`, which earlier held the motif result that the
# e2 cell selects from; re-running that cell after this one would read the
# BFS result instead.
paths = g.bfs(fromExpr="name = 'Esther'", toExpr="age < 32")
paths.show()

In [None]:
# Same search as the plain BFS, but restricted to edges whose relationship
# is not 'friend' and with maxPathLength set to 3.
filteredPaths = g.bfs(
    "name = 'Esther'",
    "age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
)

In [None]:
# Print the result of the edge-filtered BFS.
filteredPaths.show()

In [None]:
# Community detection: run label propagation with maxIter set to 5.
lpa_max_iterations = 5
result = g.labelPropagation(maxIter=lpa_max_iterations)
result.show()

In [None]:
# PageRank with resetProbability 0.15 and tolerance 0.01. The returned
# object exposes .vertices and .edges, both printed below.
results = g.pageRank(tol=0.01, resetProbability=0.15)

results.vertices.show()
results.edges.show()

In [None]:
# Shortest paths computed against the landmark vertices "a" and "d".
# NOTE: this rebinds `results`, which previously held the PageRank output.
landmark_ids = ["a", "d"]
results = g.shortestPaths(landmarks=landmark_ids)
results.show()