In [0]:
!pip install findspark

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import findspark

In [0]:
# Initialise findspark (the library is meant to locate a Spark installation and
# put pyspark on sys.path).
# NOTE(review): pyspark is already imported in the previous cell, so this call
# may be redundant in this environment — confirm before removing.
findspark.init()

In [0]:
# Build (or reuse) the Spark session for this notebook: application name
# "ProjetVelib", master "local".
spark = (
    SparkSession.builder
    .appName("ProjetVelib")
    .master("local")
    .getOrCreate()
)

# setLogLevel() returns None, so assigning its result to `sc` would leave `sc`
# unusable. Keep the SparkContext handle first, then lower the log verbosity.
sc = spark.sparkContext
sc.setLogLevel("ERROR")
#ssc = StreamingContext(sc, 60)

In [0]:
# Read a sample JSON file (parsed as multi-line JSON); its inferred schema is
# reused below to decode the Kafka messages.
raw_json = spark.read.json(
    "dbfs:/FileStore/shared_uploads/cecile.guillot@live.fr/test.json",
    multiLine=True,
)

# Streaming source: the "station_status" Kafka topic, starting from the latest offsets.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "51.38.185.58:9092")
    .option("subscribe", "station_status")
    .option("startingOffsets", "latest")
    .load()
)

# Cast the raw message value to a string and parse it as JSON with the sample's schema.
parsed_value = F.from_json(F.col("value").cast("string"), raw_json.schema)

# Replace `value` with the parsed struct, then select the station name, the
# dataset id and every top-level field of the parsed record.
df = kafka_stream.withColumn("value", parsed_value).select(
    F.col('value.fields.name'),
    F.col('value.datasetid'),
    F.col('value.*'),
)

In [0]:
# Remove the record-level metadata columns that are not used further down.
unused_columns = ('datasetid', 'geometry', 'record_timestamp', 'recordid')
df = df.drop(*unused_columns)

In [0]:
# Print the schema of the streaming DataFrame to check which columns are left
# after the drop above.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- fields: struct (nullable = true)
 |    |-- capacity: long (nullable = true)
 |    |-- coordonnees_geo: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- duedate: string (nullable = true)
 |    |-- ebike: long (nullable = true)
 |    |-- is_installed: string (nullable = true)
 |    |-- is_renting: string (nullable = true)
 |    |-- is_returning: string (nullable = true)
 |    |-- mechanical: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- nom_arrondissement_communes: string (nullable = true)
 |    |-- numbikesavailable: long (nullable = true)
 |    |-- numdocksavailable: long (nullable = true)
 |    |-- stationcode: string (nullable = true)



In [0]:
# Keep only the members of the `fields` struct, promoted to top-level columns.
# Note: this rebinds `df`, so the cell cannot be re-run on its own result
# (the `fields` column no longer exists afterwards).
df = df.select("fields.*")

In [0]:
# Start a streaming query that appends incoming rows to an in-memory table
# named "velib", which can then be queried with SQL.
query = (
    df.writeStream
    .queryName("velib")
    .format("memory")
    .outputMode("append")
    .start()
)

In [0]:
%sql SELECT * FROM velib

capacity,coordonnees_geo,duedate,ebike,is_installed,is_renting,is_returning,mechanical,name,nom_arrondissement_communes,numbikesavailable,numdocksavailable,stationcode
30,"List(48.871256519012, 2.4865807592869)",2022-08-11T16:11:02+00:00,20,OUI,OUI,OUI,6,Mairie de Rosny-sous-Bois,Rosny-sous-Bois,26,2,31104
60,"List(48.8710440519842, 2.366104461987773)",2022-08-11T16:11:12+00:00,2,OUI,OUI,OUI,2,Alibert - Jemmapes,Paris,4,52,10013
12,"List(48.875448033960744, 2.315508019010038)",2022-08-11T16:05:42+00:00,1,OUI,OUI,OUI,2,Messine - Place Du Pérou,Paris,3,9,8026
25,"List(48.837525839067, 2.3360354080796)",2022-08-11T16:08:42+00:00,0,OUI,OUI,OUI,3,Cassini - Denfert-Rochereau,Paris,3,22,14111
21,"List(48.85165383178419, 2.3308077827095985)",2022-08-11T16:11:00+00:00,6,OUI,OUI,OUI,4,Saint-Sulpice,Paris,10,11,6003
48,"List(48.835092787823875, 2.353468135133752)",2022-08-11T16:10:59+00:00,4,OUI,OUI,OUI,3,Le Brun - Gobelins,Paris,7,39,13007
17,"List(48.84708159081946, 2.321374788880348)",2022-08-11T16:10:18+00:00,1,OUI,OUI,OUI,0,Saint-Romain - Cherche-Midi,Paris,1,16,6108
31,"List(48.91039875761846, 2.3851355910301213)",2022-08-11T16:08:39+00:00,4,OUI,OUI,OUI,2,André Karman - République,Aubervilliers,6,21,33006
25,"List(48.835583838706, 2.2325500845909)",2022-08-11T16:09:32+00:00,2,OUI,OUI,OUI,10,Silly - Galliéni,Boulogne-Billancourt,12,13,21010
22,"List(48.862090937689715, 2.196576297283173)",2022-08-11T16:09:14+00:00,7,OUI,OUI,OUI,0,Place Nelson Mandela,Rueil-Malmaison,7,12,25006


In [0]:
# Keep only the rows whose `is_renting` value is "OUI", then compute, for each
# station name, the average of `ebike`, `mechanical` and `numdocksavailable`.
renting_only = df.where(F.col("is_renting") == "OUI")
mean_available = renting_only.groupBy("name").agg(
    F.avg("ebike").alias("MoyVeloElectrique"),
    F.avg("mechanical").alias("MoyVeloClassique"),
    F.avg("numdocksavailable").alias("MoyPlacesDispo"),
)

In [0]:
# Start a streaming query that writes the full aggregation result ("complete"
# output mode) to an in-memory table named "mean_available". The returned
# StreamingQuery is left as the cell's last expression so it is displayed.
(
    mean_available.writeStream
    .queryName("mean_available")
    .format("memory")
    .outputMode("complete")
    .start()
)

Out[17]: <pyspark.sql.streaming.StreamingQuery at 0x7f598cbfc8b0>

In [0]:
%sql SELECT * FROM mean_available

name,MoyVeloElectrique,MoyVeloClassique,MoyPlacesDispo
Ramponeau - Belleville,6.0,1.0,36.0
Cambrai - Benjamin Constant,0.0,0.0,41.0
Square Boucicaut,2.3,13.0,44.7
Messine - Place Du Pérou,0.0,1.1,10.9
Bir Hakeim,5.2,13.2,10.6
Guersant - Gouvion-Saint-Cyr,2.3,0.0,33.7
Place Nelson Mandela,7.0,0.0,12.0
Vaneau - Sèvres,7.714285714285714,10.0,17.285714285714285
Grande Armée - Brunel,6.0,4.0,41.0
Morillons - Dantzig,1.0,1.3,49.7
