In [1]:
from graphframes import *

In [2]:
# Create two DataFrames to demonstrate vertices and edges. 1. Vertices: city and state

In [3]:
# Vertex DataFrame: one row per city, keyed by a single-letter "id" that the
# edge rows refer to through their "src"/"dst" columns.
# City names corrected: Worcester (MA), Hartford (CT), Stamford (CT).
vertices = sqlContext.createDataFrame([
  ("a", "Boston", "MA"),
  ("b", "New Haven", "CT"),
  ("c", "New York", "NY"),
  ("d", "Portland", "ME"),
  ("e", "Worcester", "MA"),
  ("f", "Providence", "RI"),
  ("g", "Hartford", "CT"),
  ("h", "Stamford", "CT")], ["id", "city", "state"])

In [4]:
# 2. Edges: drive time in minutes (driveMins) between cities

In [5]:
# Edge DataFrame: each row is a directed (src, dst) pair of vertex ids with
# the drive time for that leg stored in "driveMins".
edges = sqlContext.createDataFrame(
    [
        ("a", "b", 132),
        ("b", "c", 120),
        ("c", "b", 118),
        ("f", "c", 181),
        ("e", "f", 47),
        ("e", "d", 138),
        ("d", "a", 107),
        ("a", "e", 54),
        ("a", "g", 101),
        ("g", "h", 77),
        ("h", "c", 43),
    ],
    schema=["src", "dst", "driveMins"])

In [6]:
# Print the schema of both input DataFrames, vertices first, then edges.
for frame in (vertices, edges):
    frame.printSchema()

In [7]:
# Render the vertex DataFrame. `display` is not defined in this notebook;
# presumably it is the notebook environment's built-in renderer.
display(vertices)

In [8]:
# Render the edge DataFrame (src, dst, driveMins).
display(edges)

In [9]:
#create graph

In [10]:
# Build the graph from the vertex and edge DataFrames.
g = GraphFrame(vertices, edges)
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare `print g` statement.
print(g)

In [11]:
# Render the vertices as exposed by the GraphFrame.
display(g.vertices)

In [12]:
# Render the edges as exposed by the GraphFrame.
display(g.edges)

In [13]:
# The incoming degree of the vertices

In [14]:
# Render the graph's inDegrees result.
display(g.inDegrees)

In [15]:
# The outgoing degree of the vertices

In [16]:
# Render the graph's outDegrees result.
display(g.outDegrees)

In [17]:
# The degree of the vertices:

In [18]:
# Render the graph's degrees result.
display(g.degrees)

In [19]:
# count the number of cities in CT

In [20]:
# Count the vertices whose state column equals 'CT'.
cts = g.vertices.filter("state = 'CT'").count()
# Formatted single-argument print works on both Python 2 and 3. The two
# spaces before the value reproduce the output of the original
# `print "... is ", cts` statement exactly.
print("The number of ct cities is  %s" % cts)

In [21]:
# get the min driveMins

In [22]:
# Single-row DataFrame holding the smallest driveMins over all edges.
# DataFrame.agg without grouping is shorthand for groupBy().agg(...).
shortestTime = g.edges.agg({"driveMins": "min"})
display(shortestTime)

In [23]:
# Count the edges that take less than one hour to drive.
# The threshold is 60 so that it matches the variable name and the printed
# message; the filter previously used 100.
numLessOneHour = g.edges.filter("driveMins < 60").count()
# Formatted single-argument print works on both Python 2 and 3.
print("The number of driveMins less than 60 mins is %s" % numLessOneHour)

In [24]:
# Motif search: vertex pairs (a, b) connected by an edge in each direction
# (edge e from a to b, and edge e2 from b back to a).
round_trip_pattern = "(a)-[e]->(b); (b)-[e2]->(a)"
motifs = g.find(round_trip_pattern)
display(motifs)

In [25]:
# Keep only the motif matches whose `b` vertex has state 'NY'.
# DataFrame.where is an alias of DataFrame.filter.
filtered = motifs.where("b.state = 'NY'")
display(filtered)

In [26]:
# Build a subgraph containing only the edges whose two endpoints are both in
# Massachusetts.
#
# In the motif "(a)-[e]->(f)", `a` and `f` are vertices (id, city, state) and
# `e` is an edge (src, dst, driveMins). The state column therefore has to be
# read from the vertices; the edge struct has no `state` field, so filtering
# on `e.state` cannot be resolved.
paths = (
    g.find("(a)-[e]->(f)")
    .filter("a.state = 'MA'")
    .filter("a.state = f.state")
)

# Flatten the matched edge struct back into plain src/dst/driveMins columns.
e2 = paths.select("e.src", "e.dst", "e.driveMins")

# Subgraph: all original vertices, but only the selected edges.
g2 = GraphFrame(g.vertices, e2)

In [27]:
# BFS get the path from Boston to CT cities

In [28]:
# Breadth-first search from the vertex whose city is Boston to any vertex
# whose state is CT. Note: this rebinds `paths` from the earlier motif cell.
paths = g.bfs(fromExpr="city = 'Boston'", toExpr="state = 'CT'")
display(paths)

In [29]:
# filter paths from Boston to CT cities based on driveMins != 0

In [30]:
# Same Boston -> CT search, restricted to edges with a non-zero driveMins and
# to paths of at most 3 edges. Every edge defined in this notebook has
# driveMins > 0, so the edge filter does not exclude anything here.
filteredPaths = g.bfs(
    fromExpr="city = 'Boston'",
    toExpr="state = 'CT'",
    edgeFilter="driveMins != 0",
    maxPathLength=3)
display(filteredPaths)

In [31]:
# Strongly connected components

In [32]:
# Run strongly connected components with maxIter=10, then show only the
# vertex id and the component it was assigned to.
result = g.stronglyConnectedComponents(maxIter=10)
component_by_vertex = result.select("id", "component")
display(component_by_vertex)

In [33]:
# Label propagation with maxIter=5.
# Note: this rebinds `result`, replacing the strongly-connected-components
# output from the previous cell.
result = g.labelPropagation(maxIter=5)
display(result)

In [34]:
# PageRank

In [35]:
# Run PageRank with resetProbability=0.15 and tol=0.01 (no maxIter given),
# then show the vertices of the returned result.
results = g.pageRank(resetProbability=0.15, tol=0.01)
display(results.vertices)

In [36]:
# Show the edges of the PageRank result computed in the previous cell.
display(results.edges)

In [37]:
# PageRank with maxIter=10 instead of tol. The return value is not bound to
# a name; it is only echoed as the cell's output.
g.pageRank(resetProbability=0.15, maxIter=10)

In [38]:
# PageRank with sourceId="a" (the id of the Boston vertex) and maxIter=10.
# The return value is not bound to a name; it is only echoed as the cell's
# output.
g.pageRank(resetProbability=0.15, maxIter=10, sourceId="a")

In [39]:
# Shortest paths

In [40]:
# Shortest paths from every vertex to the landmark vertices "a" and "d".
# Note: this rebinds `results`, replacing the earlier PageRank output.
# NOTE(review): driveMins is not passed in here, so the distances are
# presumably hop counts rather than drive times; confirm against the
# GraphFrames documentation.
results = g.shortestPaths(landmarks=["a", "d"])
display(results)