In [0]:
!pip install findspark

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import findspark

In [0]:
findspark.init()

In [0]:
spark = SparkSession.builder.appName("ProjetVelib")\
        .master("local")\
        .getOrCreate()

sc = spark.sparkContext.setLogLevel("ERROR")
#ssc = StreamingContext(sc, 60)

In [0]:
schema = StructType([
StructField("name",StringType(),True), 
StructField("station_code",StringType(),True), 
StructField("ebike",StringType(),True),
StructField("mechanical", StringType(), True),
StructField("coordonnees_geo", StringType(), True),
StructField("duedate", StringType(), True),
StructField("numbikesavailable", IntegerType(), True),
StructField("numdocksavailable", StringType(), True),
StructField("capacity", StringType(), True),
StructField("is_renting", StringType(), True),
StructField("is_installed", StringType(), True),
StructField("nom_arrondissement_communes", StringType(), True),
StructField("is_returning", StringType(), True),
StructField("geometry", StringType(), True),
StructField("coordinates", StringType(), True),
StructField("record_timestamp", StringType(), True)])

In [0]:
# File location and type
file_location = "/FileStore/tables/data_velib.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load(file_location)

display(df_with_schema)

name,station_code,ebike,mechanical,coordonnees_geo,duedate,numbikesavailable,numdocksavailable,capacity,is_renting,is_installed,nom_arrondissement_communes,is_returning,geometry,coordinates,record_timestamp
Mairie de Rosny-sous-Bois,31104,20,8,"[48.871256519012, 2.4865807592869]",2022-08-12T12:13:11+00:00,28,1,30,OUI,OUI,Rosny-sous-Bois,OUI,,,
Toudouze - Clauzel,9020,0,0,"[48.87929591733507, 2.3373600840568547]",2022-08-12T12:09:46+00:00,0,19,21,OUI,OUI,Paris,OUI,,,
Mairie du 12ème,12109,5,24,"[48.840855311763, 2.3875549435616]",2022-08-12T12:13:33+00:00,29,1,30,OUI,OUI,Paris,OUI,,,
Harpe - Saint-Germain,5001,5,22,"[48.85151881501689, 2.343670316040516]",2022-08-12T12:10:58+00:00,27,15,45,OUI,OUI,Paris,OUI,,,
Messine - Place Du Pérou,8026,6,4,"[48.875448033960744, 2.315508019010038]",2022-08-12T12:13:30+00:00,10,2,12,OUI,OUI,Paris,OUI,,,
Cassini - Denfert-Rochereau,14111,4,1,"[48.837525839067, 2.3360354080796]",2022-08-12T12:13:31+00:00,5,20,25,OUI,OUI,Paris,OUI,,,
Saint-Sulpice,6003,5,0,"[48.85165383178419, 2.3308077827095985]",2022-08-12T12:05:45+00:00,5,16,21,OUI,OUI,Paris,OUI,,,
Le Brun - Gobelins,13007,3,11,"[48.835092787823875, 2.353468135133752]",2022-08-12T12:07:20+00:00,14,32,48,OUI,OUI,Paris,OUI,,,
Saint-Romain - Cherche-Midi,6108,5,4,"[48.84708159081946, 2.321374788880348]",2022-08-12T12:04:52+00:00,9,8,17,OUI,OUI,Paris,OUI,,,
André Karman - République,33006,3,2,"[48.91039875761846, 2.3851355910301213]",2022-08-12T12:12:17+00:00,5,23,31,OUI,OUI,Aubervilliers,OUI,,,


In [0]:
print(f"Le nombre de partition est : {df.rdd.getNumPartitions()}")

Le nombre de partition est : 1


In [0]:
# Le nombre moyen, le min et le max de place de velib pour chaque station

mean_min_max = df.groupBy("_c0").agg(
                        avg("_c7").alias("Nombre de dock moyen"),\
                        min("_c7").alias("Min de dock"),\
                        max("_c7").alias("Max de dock"))

mean_min_max.display()

_c0,Nombre de dock moyen,Min de dock,Max de dock
André Karman - République,23.0,23,23
Beaux-Arts - Bonaparte,17.0,17,17
Benjamin Godard - Victor Hugo,32.0,32,32
Bir Hakeim,11.0,11,11
Cassini - Denfert-Rochereau,19.454545454545453,19,20
Chazelles - Courcelles,30.0,30,30
Froment - Bréguet,31.0,31,31
Grande Armée - Brunel,37.0,37,37
Guersant - Gouvion-Saint-Cyr,26.5,26,27
Harpe - Saint-Germain,21.666666666666668,15,26


In [0]:
# Le nombre moyen de place de velib pour chaque zone (à définir vous même)

mean_by_region = df.groupBy("_c11").agg(
                        avg("_c7").alias("Nombre de dock moyen"))

mean_by_region.display()

_c11,Nombre de dock moyen
Rueil-Malmaison,15.875
Aubervilliers,23.0
Paris,21.44210526315789
Maisons-Alfort,0.0
Ivry-sur-Seine,18.0
Rosny-sous-Bois,1.0
Boulogne-Billancourt,6.0


In [0]:
# Le nombre de station et / ou la liste des stations qui ont parfois aucun vélib disponible

number = df.filter(df._c6 == 0).groupBy("_c0").sum()

number.display()

_c0
Toudouze - Clauzel
