In [0]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import findspark

In [0]:
# Initialise findspark before working with Spark.
# NOTE(review): the pyspark imports in the previous cell already succeeded
# before this call runs, so it presumably has no effect in this environment —
# confirm whether findspark is needed at all here.
findspark.init()

In [0]:
# Build (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("ProjetVelib")
    .master("local")
    .getOrCreate()
)

# setLogLevel() returns None, so the context must be bound to `sc` *before*
# the call; chaining both on one line left `sc` set to None.
sc = spark.sparkContext
sc.setLogLevel("ERROR")
#ssc = StreamingContext(sc, 60)

In [0]:
# Load a sample JSON document from DBFS in multi-line mode. Its inferred
# schema (raw_json.schema) is what the streaming cell uses to parse the
# Kafka message payloads.
raw_json = (
    spark.read
    .option("multiLine", True)
    .json("dbfs:/FileStore/tables/test.json")
)

In [0]:
# Subscribe to the "vlib_status_ville" Kafka topic, reading only messages that
# arrive after the query starts ("latest"). The binary `value` column is cast
# to a string and parsed as JSON with the schema inferred from the sample file,
# then the station name, the dataset id and every top-level field of the
# parsed record are projected out.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "51.38.185.58:9092")
    .option("subscribe", "vlib_status_ville")
    .option("startingOffsets", "latest")
    .load()
    .withColumn(
        "value",
        F.from_json(F.col("value").cast("string"), raw_json.schema),
    )
    .select("value.fields.name", "value.datasetid", "value.*")
)

In [0]:
# Discard the record-level metadata columns; only the station name and the
# nested `fields` struct are kept (see the schema printed below).
df = df.drop(
    *("datasetid", "geometry", "record_timestamp", "recordid")
)

In [0]:
# Print the schema of the streaming DataFrame to check what is left after the
# drop above: a top-level `name` column and the nested `fields` struct.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- fields: struct (nullable = true)
 |    |-- capacity: long (nullable = true)
 |    |-- coordonnees_geo: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- duedate: string (nullable = true)
 |    |-- ebike: long (nullable = true)
 |    |-- is_installed: string (nullable = true)
 |    |-- is_renting: string (nullable = true)
 |    |-- is_returning: string (nullable = true)
 |    |-- mechanical: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- nom_arrondissement_communes: string (nullable = true)
 |    |-- numbikesavailable: long (nullable = true)
 |    |-- numdocksavailable: long (nullable = true)
 |    |-- stationcode: string (nullable = true)



In [0]:
# Flatten the `fields` struct so each attribute becomes a top-level column.
# NOTE: this rebinds `df`; once it has run, `fields` is no longer a column, so
# re-running this cell on its own is expected to fail.
df = df.select("fields.*")

In [0]:
# Start a streaming query that appends incoming rows to an in-memory table
# named "velib", which the SQL cell below queries. The returned handle is kept
# in `query`.
query = (
    df.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("velib")
    .start()
)

In [0]:
%sql
-- Show the rows collected so far in the in-memory "velib" table.
SELECT * FROM velib

capacity,coordonnees_geo,duedate,ebike,is_installed,is_renting,is_returning,mechanical,name,nom_arrondissement_communes,numbikesavailable,numdocksavailable,stationcode
35,"List(48.865983, 2.275725)",2022-08-12T09:11:08+00:00,1,OUI,OUI,OUI,0,Benjamin Godard - Victor Hugo,Paris,1,34,16107
20,"List(48.85590755596891, 2.3925706744194035)",2022-08-12T09:12:15+00:00,6,OUI,OUI,OUI,4,Charonne - Robert et Sonia Delauney,Paris,10,9,11104
45,"List(48.85151881501689, 2.343670316040516)",2022-08-12T09:11:18+00:00,2,OUI,OUI,OUI,22,Harpe - Saint-Germain,Paris,24,18,5001
60,"List(48.8710440519842, 2.366104461987773)",2022-08-12T09:08:42+00:00,3,OUI,OUI,OUI,1,Alibert - Jemmapes,Paris,4,53,10013
12,"List(48.875448033960744, 2.315508019010038)",2022-08-12T09:12:41+00:00,8,OUI,OUI,OUI,3,Messine - Place Du Pérou,Paris,11,1,8026
25,"List(48.837525839067, 2.3360354080796)",2022-08-12T09:09:06+00:00,5,OUI,OUI,OUI,2,Cassini - Denfert-Rochereau,Paris,7,18,14111
21,"List(48.85165383178419, 2.3308077827095985)",2022-08-12T09:09:31+00:00,1,OUI,OUI,OUI,1,Saint-Sulpice,Paris,2,19,6003
31,"List(48.91039875761846, 2.3851355910301213)",2022-08-12T09:09:25+00:00,3,OUI,OUI,OUI,3,André Karman - République,Aubervilliers,6,22,33006
50,"List(48.836022242886884, 2.4708339950830287)",2022-08-12T09:13:19+00:00,18,OUI,OUI,OUI,23,Bois de Vincennes - Gare,Nogent-sur-Marne,41,4,41301
22,"List(48.862090937689715, 2.196576297283173)",2022-08-12T09:12:26+00:00,7,OUI,OUI,OUI,0,Place Nelson Mandela,Rueil-Malmaison,7,13,25006


In [0]:
# For stations whose is_renting flag is "OUI", compute per station name the
# average number of electric bikes, mechanical bikes and free docks.
mean_available = (
    df.where(F.col("is_renting") == "OUI")
    .groupBy("name")
    .agg(
        F.avg("ebike").alias("MoyVeloElectrique"),
        F.avg("mechanical").alias("MoyVeloClassique"),
        F.avg("numdocksavailable").alias("MoyPlacesDispo"),
    )
)

In [0]:
# Start the aggregation as a streaming query in "complete" output mode, writing
# to an in-memory table named "mean_available". The expression is left as the
# cell's last statement so the StreamingQuery handle is shown as the cell
# output.
(
    mean_available.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("mean_available")
    .start()
)

Out[12]: <pyspark.sql.streaming.StreamingQuery at 0x7f95e28d1eb0>

In [0]:
%sql
-- Show the current per-station averages from the in-memory "mean_available" table.
SELECT * FROM mean_available

name,MoyVeloElectrique,MoyVeloClassique,MoyPlacesDispo
Cambrai - Benjamin Constant,6.0,3.0,32.0
Bois de Vincennes - Gare,18.0,23.0,4.0
Square Boucicaut,18.0,16.0,26.0
Messine - Place Du Pérou,8.0,3.0,1.0
Cassini - Denfert-Rochereau,5.0,2.0,18.0
Saint-Sulpice,1.0,1.0,19.0
Boétie - Ponthieu,15.0,6.0,11.0
Benjamin Godard - Victor Hugo,1.0,0.0,34.0
Guersant - Gouvion-Saint-Cyr,5.0,3.0,27.0
Saint-Cloud - Hippodrome,3.0,2.0,22.0
