# Writing GraphFrames to Azure Cosmos DB Gremlin API

Code adapted by Caio Moreno <BR>

Original code: https://github.com/syedhassaanahmed/databricks-notebooks/blob/master/graph_write_cosmosdb.py <BR>

This notebook is based on the `GraphFrames` example [specified here](https://graphframes.github.io/user-guide.html#tab_python_0). It requires the [graphframes](https://spark-packages.org/package/graphframes/graphframes) and [azure-cosmosdb-spark (uber jar)](https://github.com/Azure/azure-cosmosdb-spark#using-databricks-notebooks) libraries to be uploaded and attached to the cluster. The **Scala version** of this notebook can be [found here](https://github.com/syedhassaanahmed/databricks-notebooks/blob/master/graphWriteCosmosDB.scala).

In [2]:
from pyspark.sql.functions import lit

# Vertex DataFrame: one row per sensor, plus a constant "entity" column
# holding the string "sensor" on every row.
v = (
  sqlContext.createDataFrame(
    [
      ("1", "Construction Site 1", 1, "London"),
      ("2", "Construction Site 2", 3, "Madrid"),
      ("3", "Construction Site 3", 3, "Dublin"),
      ("4", "Construction Site 4", 0, "Sao Paulo"),
      ("5", "Construction Site 5", 10, "Rome"),
      ("6", "Construction Site 6", 12, "Rio de Janeiro"),
      ("7", "Construction Site 7", 14, "Hong Kong"),
    ],
    schema=["id", "sensorname", "sensorage", "sensorcitylocation"],
  )
  .withColumn("entity", lit("sensor"))
)

In [3]:
# Edge DataFrame: each row links a source vertex id ("src") to a destination
# vertex id ("dst") with a relationship name.
e = sqlContext.createDataFrame(
  [
    ("1", "2", "connected"),
    ("1", "3", "connected"),
    ("1", "4", "connected"),
    ("1", "5", "connected"),
    ("1", "6", "connected"),
    ("1", "7", "connected"),
    ("2", "4", "connected"),
    ("2", "6", "connected"),
    ("2", "7", "connected"),
  ],
  schema=["src", "dst", "relationship"],
)

In [4]:
from graphframes import GraphFrame
# Assemble the graph from the vertex DataFrame `v` and the edge DataFrame `e`.
g = GraphFrame(v, e)

In [5]:
# Show the graph's vertices as a table in the notebook output.
display(g.vertices)

id,sensorname,sensorage,sensorcitylocation,entity
1,Construction Site 1,1,London,sensor
2,Construction Site 2,3,Madrid,sensor
3,Construction Site 3,3,Dublin,sensor
4,Construction Site 4,0,Sao Paulo,sensor
5,Construction Site 5,10,Rome,sensor
6,Construction Site 6,12,Rio de Janeiro,sensor
7,Construction Site 7,14,Hong Kong,sensor


In [6]:
# Show the graph's edges as a table in the notebook output.
display(g.edges)

src,dst,relationship
1,2,connected
1,3,connected
1,4,connected
1,5,connected
1,6,connected
1,7,connected
2,4,connected
2,6,connected
2,7,connected


## Convert Vertices and Edges to Cosmos DB internal format
The Cosmos DB Gremlin API internally keeps a JSON document representation of edges and vertices, [as explained here](https://github.com/LuisBosquez/azure-cosmos-db-graph-working-guides/blob/master/graph-backend-json.md). Also, the `id` in Cosmos DB is [part of the resource URI](https://github.com/Azure/azure-cosmosdb-dotnet/issues/35#issuecomment-121009258) and hence must be URL-encoded.

In [8]:
from pyspark.sql.types import StringType
from urllib.parse import quote

def urlencode(value):
  """Percent-encode ``value`` for use as a Cosmos DB document ``id``.

  Every reserved character is escaped, including ``/`` (``safe=""``).

  Args:
    value: The string to encode, or ``None``.

  Returns:
    The percent-encoded string, or ``None`` when ``value`` is ``None``.
  """
  # This function is wrapped as a Spark UDF, where null cells arrive as None.
  # quote() raises TypeError on None, so pass nulls through instead of
  # failing the whole job.
  if value is None:
    return None

  return quote(value, safe="")

# Wrap urlencode as a UDF with a string return type so it can be applied to DataFrame columns.
# NOTE(review): `udf` is not imported in the visible cells — presumably provided by the
# notebook runtime; confirm, or import it from pyspark.sql.functions.
udf_urlencode = udf(urlencode, StringType())

In [9]:
def to_cosmosdb_vertices(dfVertices, labelColumn, partitionKey = ""):
  """Reshape a vertex DataFrame into the document layout written to Cosmos DB.

  The ``id`` column is URL-encoded and ``labelColumn`` is renamed to
  ``label``. ``id``, the label and (when given) ``partitionKey`` are kept as
  plain columns. Every other column becomes a single-element array holding a
  struct with a freshly generated ``id`` (``uuid()``) and the original value
  under ``_value``; a null value stays null instead of being wrapped.

  Args:
    dfVertices: DataFrame of vertices; must contain an ``id`` column.
    labelColumn: Name of the column that holds the vertex label.
    partitionKey: Optional name of the partition key column. When empty, no
      column is treated as a partition key.

  Returns:
    A DataFrame with ``id``, ``label``, the optional partition key column and
    one wrapped column per remaining property.
  """
  dfVertices = dfVertices.withColumn("id", udf_urlencode("id"))

  columns = ["id", labelColumn]

  if partitionKey:
    columns.append(partitionKey)

  # Resolve the property columns before extending `columns`, so the membership
  # test only sees the key columns collected above.
  propertyColumns = [x for x in dfVertices.columns if x not in columns]

  # Property names come straight from the DataFrame schema, so they are
  # backtick-quoted (with embedded backticks doubled) before being spliced into
  # the SQL expression. Unquoted, a name containing a space, a dot or a
  # reserved word would fail to parse or resolve to something else.
  propertyExpr = 'nvl2({q}, array(named_struct("id", uuid(), "_value", {q})), NULL) AS {q}'
  columns.extend(propertyExpr.format(q="`" + x.replace("`", "``") + "`") \
                 for x in propertyColumns)

  return dfVertices.selectExpr(*columns).withColumnRenamed(labelColumn, "label")

In [10]:
# Convert the graph's vertices, using the "entity" column as the vertex label,
# and show the result in the notebook output.
cosmosDbVertices = to_cosmosdb_vertices(g.vertices, labelColumn="entity")
display(cosmosDbVertices)

id,label,sensorname,sensorage,sensorcitylocation
1,sensor,"List(List(631a3106-0b05-40e4-9b09-0b6189ec654e, Construction Site 1))","List(List(d4907eb3-1d0c-460c-90f0-b8ae9562355c, 1))","List(List(521be634-1062-4eb9-b15d-a1d154faa4be, London))"
2,sensor,"List(List(49a7d2c7-749b-41f5-a881-e5c0c1b86b7d, Construction Site 2))","List(List(4a2d534e-9d82-47cf-910d-b53854bcb8f6, 3))","List(List(3f2de4ee-dcc2-4d51-91bc-fac03312aede, Madrid))"
3,sensor,"List(List(bb46969f-06ff-4db7-8979-f8f9244cd3e2, Construction Site 3))","List(List(ef0c2bcb-69bb-4b10-8b2a-c19261200380, 3))","List(List(c3dd930a-3d2c-4f44-a158-8177ea85c947, Dublin))"
4,sensor,"List(List(78a3e6b0-bba7-4a3f-8165-4fc6ce18fc03, Construction Site 4))","List(List(90b3b6bd-dccd-4b0e-864c-cbd00585a2a5, 0))","List(List(0f6727b3-681d-45e9-989b-4142f7ddc5aa, Sao Paulo))"
5,sensor,"List(List(4275921c-15f0-404d-a0a7-43ad9713a489, Construction Site 5))","List(List(6f929452-46f7-4ee3-baef-e76467aef614, 10))","List(List(6a134155-1d30-4e3e-a3a5-7e972fb575ec, Rome))"
6,sensor,"List(List(b4002e0d-2178-4e7b-b482-f78adabf6fb2, Construction Site 6))","List(List(59df0c99-4ae5-4b3c-8c9c-f8b24eda4bc9, 12))","List(List(fb5213c5-a2dc-4d41-9ce1-4bfd77536c68, Rio de Janeiro))"
7,sensor,"List(List(110c4b51-a0c7-4b14-aac0-156c030f21ee, Construction Site 7))","List(List(94883882-89d2-4e75-a930-9467e2af3750, 14))","List(List(6131a388-2554-47bc-a61e-c16aa2cd9591, Hong Kong))"


In [11]:
from pyspark.sql.functions import concat_ws, col

def to_cosmosdb_edges(g, labelColumn, partitionKey = ""):
  """Build the edge documents written to Cosmos DB from the edges of ``g``.

  Each edge receives:
    * ``id``: ``src``, the label value and ``dst`` joined with ``_`` and then
      URL-encoded,
    * ``_isEdge``: the literal ``True``,
    * ``_vertexId`` / ``_sink``: the URL-encoded ``src`` / ``dst``,
    * ``label``: the renamed ``labelColumn``.
  The ``src`` and ``dst`` columns are dropped afterwards.

  When ``partitionKey`` is non-empty, the edges are first joined to the
  vertices on ``src`` and on ``dst`` so that each edge also carries the source
  vertex's ``partitionKey`` column and the destination vertex's value of it
  as ``_sinkPartition``.

  Args:
    g: Graph exposing ``edges`` and ``vertices`` DataFrames.
    labelColumn: Name of the edge column that holds the edge label.
    partitionKey: Optional name of the vertex partition key column.

  Returns:
    The transformed edge DataFrame.
  """
  edges = g.edges

  if partitionKey:
    sourceVertices = g.vertices.alias("sv")
    sinkVertices = g.vertices.alias("dv")
    edges = (
      edges.alias("e")
      .join(sourceVertices, col("e.src") == col("sv.id"))
      .join(sinkVertices, col("e.dst") == col("dv.id"))
      .selectExpr("e.*", "sv." + partitionKey, "dv." + partitionKey + " AS _sinkPartition")
    )

  # Edge id is "<src>_<label>_<dst>", URL-encoded as a whole.
  edgeId = udf_urlencode(concat_ws("_", col("src"), col(labelColumn), col("dst")))

  edges = edges.withColumn("id", edgeId)
  edges = edges.withColumn("_isEdge", lit(True))
  edges = edges.withColumn("_vertexId", udf_urlencode("src"))
  edges = edges.withColumn("_sink", udf_urlencode("dst"))
  edges = edges.withColumnRenamed(labelColumn, "label")

  return edges.drop("src", "dst")

In [12]:
# Convert the graph's edges, using the "relationship" column as the edge label,
# and show the result in the notebook output.
cosmosDbEdges = to_cosmosdb_edges(g, labelColumn="relationship")
display(cosmosDbEdges)

label,id,_isEdge,_vertexId,_sink
connected,1_connected_2,True,1,2
connected,1_connected_3,True,1,3
connected,1_connected_4,True,1,4
connected,1_connected_5,True,1,5
connected,1_connected_6,True,1,6
connected,1_connected_7,True,1,7
connected,2_connected_4,True,2,4
connected,2_connected_6,True,2,6
connected,2_connected_7,True,2,7


## Make sure to use the [Cosmos DB https endpoint](https://docs.microsoft.com/en-us/azure/cosmos-db/how-to-use-regional-gremlin#portal-endpoint-discovery) and **NOT** the `wss://` endpoint

In [14]:
# Connector options; replace every <...> placeholder before running.
# NOTE(review): the primary key is inlined here — consider loading it from a
# secret store rather than committing it with the notebook.
cosmosDbConfig = {
  "Endpoint": "https://<COSMOSDB_ENDPOINT>.documents.azure.com:443/",
  "Masterkey": "<COSMOSDB_PRIMARYKEY>",
  "Database": "<DATABASE>",
  "Collection": "<COLLECTION>",
  "Upsert": "true",
}

# Data source name passed to DataFrameWriter.format().
cosmosDbFormat = "com.microsoft.azure.cosmosdb.spark"

# Vertices are written first, then edges; both are appended with the same options.
(
  cosmosDbVertices.write
  .format(cosmosDbFormat)
  .mode("append")
  .options(**cosmosDbConfig)
  .save()
)
(
  cosmosDbEdges.write
  .format(cosmosDbFormat)
  .mode("append")
  .options(**cosmosDbConfig)
  .save()
)