# (Neo4j)-[:LOVES]->(Kafka)

<div class="img-responsive center-block" style="background-image: url('https://cdn-images-1.medium.com/max/2000/1*0k7QFFBl7YGD9haNYeLQpw.png'); width: 1124px; height: 300px; background-position: center; background-size: cover;"></div>

## Link

https://neo4j-contrib.github.io/neo4j-streams/

# Initialize Spark & Neo4j sessions

In [14]:
# Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2"
).getOrCreate()

# Neo4j
import sys
!{sys.executable} -m pip install py2neo

from py2neo import Graph

graph = Graph("bolt://neo4j:7687", auth=("neo4j", "zeppelin"))



# Query the JIT-DWH

In [15]:
from pyspark.sql.functions import first

# Columns that identify one change event; every other attribute arrives as a
# (key, value) pair and is pivoted into its own column below.
event_columns = ("neo_id", "timestamp", "host", "labels", "operation")

flattenedDF = (
    spark.read.format("json")
    .load("/home/streams/jit-dwh/**")
    .where("neo_id is not null")
    .groupBy(*event_columns)
    .pivot("key")
    .agg(first("value"))
)

flattenedDF.show()

+------+--------------------+-----+--------+---------+----+------+--------------------+-----+---------------+
|neo_id|           timestamp| host|  labels|operation| age|gender|                  id|index|           name|
+------+--------------------+-----+--------+---------+----+------+--------------------+-----+---------------+
|  2240|2021-06-28T12:00:...|neo4j|[Person]|  created|86.0|     M|43284fe6-9bd8-412...|    3|Name-ClLOuk6VJh|
|  2376|2021-06-28T11:48:...|neo4j|[Person]|  created| 9.0|     M|37136ccb-bd4a-4f3...|    9|Name-Vd56yHE8is|
|  2786|2021-06-28T12:03:...|neo4j|[Person]|  created|75.0|     F|e8178989-7249-4f4...|    7|Name-paE56CMeX6|
|  3239|2021-06-28T12:15:...|neo4j|[Person]|  created|51.0|     X|d89fa928-4e46-4a0...|   10|Name-vOLeA4K3st|
|  3814|2021-06-28T12:29:...|neo4j|[Person]|  created|39.0|     X|60e1a42a-86e2-489...|    5|Name-eYsTwpsyoc|
|  3865|2021-06-28T12:30:...|neo4j|[Person]|  created|15.0|     X|d5fb0ca6-3230-4ba...|    6|Name-tgMVBsodmn|
|  4303|20

# Let's inspect the structure of our data

In [16]:
# Print the column names and types of the pivoted DataFrame.
flattenedDF.printSchema()

root
 |-- neo_id: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- host: string (nullable = true)
 |-- labels: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- operation: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: string (nullable = true)
 |-- index: string (nullable = true)
 |-- name: string (nullable = true)



# Let's add the field "placeOfBirth" to a node

In [26]:
# Ask for the target Person id and the value to store on it.
id = input("Id: ")
placeOfBirth = input("Place of Birth: ")

# Both values are passed as Cypher parameters rather than being spliced into
# the query text.
update_query = "MATCH (p:Person{id: $id}) SET p.placeOfBirth = $placeOfBirth return p"
update_params = {"id": id, "placeOfBirth": placeOfBirth}

graph.run(update_query, parameters = update_params).data()

Id4ea84c54-b0ec-4e57-96ac-071e0b3bc370
Birth Year1980


[{'p': Node('Person', age=18.0, birthYear='1980', gender='F', id='4ea84c54-b0ec-4e57-96ac-071e0b3bc370', index=9, name='Name-SGQAfhw767')}]

# Check how the new field is automatically added to our representation

In [None]:
from pyspark.sql.functions import col, desc, first

# Re-read the JSON files and pivot again so that any key written since the
# previous read shows up as a column.
flattenedDF = spark.read.format("json").load("/home/streams/jit-dwh/**") \
    .where("neo_id is not null") \
    .groupBy("neo_id", "timestamp", "host", "labels", "operation") \
    .pivot("key") \
    .agg(first("value"))

# The name `id` shadows the builtin; it is kept because the earlier prompt
# cell uses the same name.
id = input("Id: ")

# Filter with a Column comparison instead of formatting the user's input into
# a SQL string: a quote character in the input can no longer break or alter
# the predicate, and the notebook's variables are not exposed via locals().
flattenedDF.where(col("id") == id).orderBy(desc("timestamp")).show()