In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from utilities import *

In [2]:
# Build (or reuse) the Spark session for this notebook: master "local",
# application name "hypercharge_sessions", 15g of driver memory.
spark = (
    SparkSession.builder
    .master("local")
    .appName("hypercharge_sessions")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

# Optional knob, currently disabled: number of shuffle partitions.
# spark.conf.set("spark.sql.shuffle.partitions", 2)

# Bare expression so the notebook renders the session summary as cell output.
spark

Reading the charging sessions parquet file and counting entries

In [3]:
# Location of the charging-sessions parquet file.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can run elsewhere.
sessions_parquet_path = r"C:\Users\Gian\Desktop\Luiss\BigData\FreeToX_Project\dataset\full\hypercarge_sessions.parquet"

# Load the file into a Spark DataFrame and show the number of rows as cell output.
hypercharge_sessions_df = spark.read.parquet(sessions_parquet_path)
hypercharge_sessions_df.count()

13442

We drop `stackOszis` and `carChargeParameter`. The first is oscilloscope data, which is not relevant to our analysis; the second is also not relevant and causes problems due to duplicate naming.

We then select only a subset of variables: the ones we found interesting when reading the documentation. We also rename them to avoid problems caused by the `.` character in some column names.

Then, we print the schema (type) of the remaining columns.

In [4]:
# Remove the two columns we never use before narrowing the frame down.
hypercharge_sessions_df = hypercharge_sessions_df.drop("carChargeParameter", "stackOszis")

# Columns whose names contain no dot: selected unchanged.
plain_columns = [
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince",
    "locationCountry", "endClientName", "distributorName", "corporationName", "operatorName", "type",
    "physicalPosition",
]

# Columns whose source name contains a literal `.`. Each one is referenced with
# back-ticks and re-exposed under the same name with the dot replaced by `_`.
dotted_columns = [
    "session.averagePower",
    "session.car",
    "session.distributorName",
    "session.end",
    "session.meterStart",
    "session.meterStop",
    "session.physicalPosition",
    "session.peakPower",
    "session.start",
    "session.socStart",
    "session.socStop",
    "session.chargingSessionId",
    "session.type",
    "session.position",
    "session.transactionIdNew",
    "carChargeParameter.car",
    "carChargeParameter.excludeFromStatistics",
    "carChargeParameter.batteryCapacity",
]

hypercharge_sessions_df = hypercharge_sessions_df.select(
    *plain_columns,
    *[col(f"`{name}`").alias(name.replace(".", "_")) for name in dotted_columns],
)

# Print the name and type of every remaining column.
hypercharge_sessions_df.printSchema()

root
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- locationCountry: string (nullable = true)
 |-- endClientName: integer (nullable = true)
 |-- distributorName: string (nullable = true)
 |-- corporationName: string (nullable = true)
 |-- operatorName: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- session_averagePower: integer (nullable = true)
 |-- session_car: integer (nullable = true)
 |-- session_distributorName: integer (nullable = true)
 |-- session_end: string (nullable = true)
 |-- session_meterStart: long (nullable = true)
 |-- session_meterStop: long (nullable = true)
 |-- session_physicalPosition: integer (nullable = true)
 |-- session_peakPower: integer (nullable = true)
 |-- session_start: s

Using some functions from `utilities`, we display the percentage of null values for each column. We then drop the columns with 100% nulls.

In [5]:
# Compute the per-column null share of the sessions frame and print it under a
# heading. `null_percentage` and `display_null_percentages` are not defined in
# this notebook; presumably they come from the `utilities` star import.
session_nulls = null_percentage(hypercharge_sessions_df)
print("Percentage of null values in Session file:")
display_null_percentages(session_nulls)

Percentage of null values in Session file:
gpsLat                        3.45%
gpsLong                       3.45%
locationStreet                33.56%
locationZipCode               33.56%
locationTown                  33.56%
locationProvince              33.56%
locationCountry               8.98%
endClientName                 100.00%
distributorName               0.00%
corporationName               0.00%
operatorName                  100.00%
type                          0.00%
physicalPosition              0.26%
session_averagePower          100.00%
session_car                   100.00%
session_distributorName       100.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_physicalPosition      100.00%
session_peakPower             100.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_type                  100.00%
s

In [6]:
# Columns reported as entirely null by the null-percentage listing; they carry
# no information, so they are removed from the frame.
fully_null_columns = [
    'endClientName',
    'operatorName',
    'session_averagePower',
    'session_car',
    'session_distributorName',
    'session_physicalPosition',
    'session_peakPower',
    'session_type',
    'session_transactionIdNew',
]
hypercharge_sessions_df = hypercharge_sessions_df.drop(*fully_null_columns)

Now we explore some columns in more detail. The first is `carChargeParameter_excludeFromStatistics`: we count the occurrences of each value to gauge how many sessions are deemed not relevant for statistics.

We also display the occurrences of `distributorName` and `corporationName` to see whether we could do some analysis based on distributor or corporation. It turns out, however, that almost all of them are `Free To X`, with a small minority of `Be charge`, so this is not that interesting.

In [7]:
# Show how many sessions fall under each value of the three columns, one table
# per column, in this order.
for grouping_column in (
    'carChargeParameter_excludeFromStatistics',
    'distributorName',
    'corporationName',
):
    hypercharge_sessions_df.groupBy(grouping_column).count().show()

+----------------------------------------+-----+
|carChargeParameter_excludeFromStatistics|count|
+----------------------------------------+-----+
|                                    NULL|   80|
|                                    true|    2|
|                                   false|13360|
+----------------------------------------+-----+

+----------------+-----+
| distributorName|count|
+----------------+-----+
|Free To X S.p.A.|13423|
|   Free To X Srl|   19|
+----------------+-----+

+----------------+-----+
| corporationName|count|
+----------------+-----+
|Free To X S.p.A.|13040|
|   Be charge Srl|  383|
|   Free To X Srl|   19|
+----------------+-----+



In [8]:
# Distributor and corporation are nearly constant, so they are dropped.
nearly_constant_columns = ['distributorName', 'corporationName']
hypercharge_sessions_df = hypercharge_sessions_df.drop(*nearly_constant_columns)

In [42]:
# Show, as a pandas frame, the sessions explicitly flagged for exclusion from
# statistics. `== True` builds a Spark Column expression; it is not a Python
# truthiness test.
hypercharge_sessions_df.where(
    col('carChargeParameter_excludeFromStatistics') == True
).toPandas()

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,locationCountry,type,physicalPosition,session_end,...,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity,session_duration_mins
0,,,,,,,,CCS2_400,4,2024-07-15 20:20:17,...,8381,2024-07-15 20:15:48,2,100,69473769,2,EOL Test,True,,4.483333
1,,,,,,,,CCS2_400,4,2024-07-15 20:57:10,...,75056,2024-07-15 20:26:40,2,100,69475552,2,EOL Test,True,,30.5


In [34]:
# Collect the sessions whose exclude-from-statistics flag is missing and show a
# few of them for eyeballing.
null_flag_sessions_pdf = hypercharge_sessions_df.filter(
    hypercharge_sessions_df['carChargeParameter_excludeFromStatistics'].isNull()
).toPandas()

# Fixed seed so the displayed sample is the same on every run; the row count is
# capped because pandas raises when asked for more rows than the frame holds.
null_flag_sessions_pdf.sample(n=min(10, len(null_flag_sessions_pdf)), random_state=42)

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,locationCountry,type,physicalPosition,session_end,...,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity,session_duration_mins
29,44.609688,8.661311,,,,,Italy,CCS2_400,1,2024-07-30 16:03:33,...,74255560,2024-07-30 15:26:12,57,99,71842738,1,,,,37.35
5,45.773998,9.04934,,,,,Italy,CCS2_400,4,2024-07-15 21:44:22,...,95902752,2024-07-15 21:41:56,0,0,69486178,2,,,,2.433333
3,43.071302,13.844789,,,,,Italy,CCS2_400,1,2024-07-14 14:10:47,...,20495100,2024-07-14 14:07:45,0,0,69240571,1,,,,3.033333
11,45.729051,9.028446,Autostrada dei Laghi km 29,22071.0,Provincia di Como,Lombardia,Italy,CCS2_400,1,2024-07-20 19:42:32,...,82783008,2024-07-20 19:40:16,0,0,70264634,1,,,,2.266667
63,41.25078,16.21886,Autostrada Adriatica 620,76123.0,Provincia di Barletta-Andria-Trani,Puglia,Italy,CCS2_400,1,2024-08-16 02:00:31,...,60639240,2024-08-16 01:59:11,0,0,74452266,1,,,,1.333333
58,41.049077,14.323145,Autostrada del Sole,81020.0,San Nicola la Strada,Campania,Italy,CCS2_400,4,2024-08-14 14:41:50,...,88662384,2024-08-14 14:39:54,0,0,74205234,2,,,,1.933333
40,45.532501,10.148404,Via Enrico Fermi 20b,25030.0,Roncadelle,Lombardia,Italy,CCS2_400,4,2024-08-01 12:40:39,...,101558920,2024-08-01 12:22:44,38,80,72110821,2,,,,17.916667
64,45.559134,9.039494,,,,,Italy,CCS2_400,4,2024-08-16 11:52:51,...,11103185,2024-08-16 11:50:56,0,0,74479049,2,,,,1.916667
19,43.65052,11.46442,Località Prulli di Sotto 105,50066.0,Città Metropolitana di Firenze,Toscana,Italy,CCS2_400,1,2024-07-25 18:56:18,...,109146600,2024-07-25 18:55:47,25,25,71044766,1,,,,0.516667
69,,,,,,,,CCS2_400,4,2024-08-19 12:13:50,...,8291626,2024-08-19 11:28:31,54,98,74984955,2,,,,45.316667


In [40]:
# Sessions explicitly flagged for exclusion from statistics.
hypercharge_sessions_exclude_df = hypercharge_sessions_df.where(
    col('carChargeParameter_excludeFromStatistics') == True
)

# Per-column null share for that subset, printed under its own heading.
session_exclude_nulls = null_percentage(hypercharge_sessions_exclude_df)
print("Percentage of null values in Sessions to be excluded from statistics:")
display_null_percentages(session_exclude_nulls)

Percentage of null values in Sessions to be excluded from statistics:
gpsLat                        100.00%
gpsLong                       100.00%
locationStreet                100.00%
locationZipCode               100.00%
locationTown                  100.00%
locationProvince              100.00%
locationCountry               100.00%
type                          0.00%
physicalPosition              0.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_position              0.00%
carChargeParameter_car        0.00%
carChargeParameter_excludeFromStatistics0.00%
carChargeParameter_batteryCapacity100.00%
session_duration_mins         0.00%


In [41]:
# Sessions whose exclude-from-statistics flag is missing.
# NOTE(review): this reassigns `hypercharge_sessions_exclude_df` and
# `session_exclude_nulls`, which another cell uses for the flag == True subset;
# distinct names would avoid confusion.
hypercharge_sessions_exclude_df = hypercharge_sessions_df.where(
    col('carChargeParameter_excludeFromStatistics').isNull()
)

# Per-column null share for that subset, printed under its own heading.
session_exclude_nulls = null_percentage(hypercharge_sessions_exclude_df)
print("Percentage of null values in Sessions that have Null in exclude from statistics:")
display_null_percentages(session_exclude_nulls)

Percentage of null values in Sessions that have Null in exclude from statistics:
gpsLat                        5.00%
gpsLong                       5.00%
locationStreet                28.75%
locationZipCode               28.75%
locationTown                  28.75%
locationProvince              28.75%
locationCountry               13.75%
type                          0.00%
physicalPosition              0.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_position              0.00%
carChargeParameter_car        100.00%
carChargeParameter_excludeFromStatistics100.00%
carChargeParameter_batteryCapacity100.00%
session_duration_mins         0.00%


In [52]:
# Keep only the sessions whose flag is explicitly False. Rows where the flag is
# NULL do not satisfy the comparison, so they are removed together with the
# rows flagged True.
hypercharge_sessions_df = hypercharge_sessions_df.where(
    col('carChargeParameter_excludeFromStatistics') == False
)

Then, we want to convert the columns `session_start` and `session_end` to a useful timestamp format: as the pandas sample shows, the values are strings like `2024-08-09T11:15:01+00:00`. So we need to handle the literal `T` separator and convert them to timestamps like `2024-08-09 11:15:01`.

Note that the data is in GMT+0; it could be useful to convert it to Rome summer time (GMT+2). Thus, `2024-08-09T11:15:01+00:00` ends up being `2024-08-09 13:15:01`.

We then calculate the duration of the charging session, `session_duration_mins`

In [10]:
# Pull the two time columns into pandas and show a few rows to inspect the raw
# string format.
session_times_pdf = hypercharge_sessions_df.select(
    "session_start", "session_end"
).toPandas()

# Fixed seed so the displayed sample is the same on every run; the row count is
# capped because pandas raises when asked for more rows than the frame holds.
session_times_pdf.sample(n=min(10, len(session_times_pdf)), random_state=42)

Unnamed: 0,session_start,session_end
13037,2024-08-25T22:10:17+00:00,2024-08-25T22:41:51+00:00
1307,2024-07-17T15:11:01+00:00,2024-07-17T15:50:21+00:00
3267,2024-07-24T07:49:38+00:00,2024-07-24T08:11:32+00:00
8303,2024-08-09T12:38:39+00:00,2024-08-09T12:52:05+00:00
7505,2024-08-07T14:24:58+00:00,2024-08-07T14:49:13+00:00
946,2024-07-15T19:01:17+00:00,2024-07-15T19:16:06+00:00
3411,2024-07-25T15:03:41+00:00,2024-07-25T15:49:10+00:00
445,2024-07-14T09:23:15+00:00,2024-07-14T10:02:47+00:00
5771,2024-08-01T08:17:42+00:00,2024-08-01T08:49:58+00:00
2530,2024-07-22T10:08:20+00:00,2024-07-22T10:46:57+00:00


In [11]:
# Convert the session_start and session_end columns from string to timestamp with the timezone and seconds
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", to_timestamp("session_start", "yyyy-MM-dd'T'HH:mm:ssXXX")
).withColumn(
    "session_end", to_timestamp("session_end", "yyyy-MM-dd'T'HH:mm:ssXXX")
)

hypercharge_sessions_df.select("session_start", "session_end").show()

# Convert from GMT to Rome's time zone
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", from_utc_timestamp("session_start", "Europe/Rome")
).withColumn(
    "session_end", from_utc_timestamp("session_end", "Europe/Rome")
)

# Show the result
hypercharge_sessions_df.select("session_start", "session_end").show()

+-------------------+-------------------+
|      session_start|        session_end|
+-------------------+-------------------+
|2024-07-14 08:01:27|2024-07-14 08:26:46|
|2024-07-14 08:01:27|2024-07-14 08:25:08|
|2024-07-14 08:01:57|2024-07-14 08:44:43|
|2024-07-14 08:02:59|2024-07-14 08:20:07|
|2024-07-14 08:03:24|2024-07-14 08:36:35|
|2024-07-14 08:05:42|2024-07-14 08:50:30|
|2024-07-14 08:06:43|2024-07-14 08:45:06|
|2024-07-14 08:08:40|2024-07-14 08:37:28|
|2024-07-14 08:09:23|2024-07-14 08:40:42|
|2024-07-14 08:13:11|2024-07-14 08:28:56|
|2024-07-14 08:13:30|2024-07-14 08:35:17|
|2024-07-14 08:14:44|2024-07-14 08:42:40|
|2024-07-14 08:15:17|2024-07-14 08:19:15|
|2024-07-14 08:16:34|2024-07-14 09:10:07|
|2024-07-14 08:16:43|2024-07-14 08:30:33|
|2024-07-14 08:17:29|2024-07-14 08:41:37|
|2024-07-14 08:17:54|2024-07-14 08:34:54|
|2024-07-14 08:18:23|2024-07-14 08:36:24|
|2024-07-14 08:19:00|2024-07-14 08:38:11|
|2024-07-14 08:21:00|2024-07-14 08:27:53|
+-------------------+-------------

In [12]:
# Check the resulting type of the two time columns.
time_columns = ["session_start", "session_end"]
hypercharge_sessions_df.select(*time_columns).printSchema()

root
 |-- session_start: timestamp (nullable = true)
 |-- session_end: timestamp (nullable = true)



In [13]:
# Both timestamps cast to long; their difference divided by 60 is stored as the
# session duration in minutes.
start_as_long = col('session_start').cast('long')
end_as_long = col('session_end').cast('long')

hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    'session_duration_mins', (end_as_long - start_as_long) / 60
)

# Summary statistics (count, mean, stddev, min, max) of the new column.
hypercharge_sessions_df.select('session_duration_mins').describe().show()

+-------+---------------------+
|summary|session_duration_mins|
+-------+---------------------+
|  count|                13442|
|   mean|    24.36972672717355|
| stddev|    15.05030696979774|
|    min| 0.016666666666666666|
|    max|                115.4|
+-------+---------------------+



A brief look at the session ID range, to see if we could match it with other datasets

In [14]:
# Summary statistics of the session id, mainly to see its min/max range.
session_id_column = "session_chargingSessionId"
hypercharge_sessions_df.select(session_id_column).describe().show()

+-------+-------------------------+
|summary|session_chargingSessionId|
+-------+-------------------------+
|  count|                    13442|
|   mean|      7.265700143884839E7|
| stddev|       2045333.0978502438|
|    min|                 69206115|
|    max|                 76056911|
+-------+-------------------------+



Now a look at `carChargeParameter_car` and `carChargeParameter_batteryCapacity`.  
The second one unfortunately has a lot of NAs (72% + all the `0.0` entries), but it could still be useful.

In [53]:
# Occurrence counts for the car model and the battery capacity, most frequent
# value first, one table per column.
for counted_column in ("carChargeParameter_car", "carChargeParameter_batteryCapacity"):
    (
        hypercharge_sessions_df
        .groupBy(counted_column)
        .count()
        .orderBy(desc("count"))
        .show()
    )

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|                  NULL| 1702|
|  Tesla Model Y / 3 LR| 1504|
|  Skoda Enyaq IV / ...| 1335|
|  VW ID.4 / ID.5 GT...|  746|
|   Cupra Born | VW ID3|  642|
|                 Tesla|  597|
|               BMW iX1|  516|
|          BMW I4 G26 ?|  490|
|  Polestar 2 78kWh ...|  433|
|    Mercedes EQV / EQB|  427|
|  Porsche Taycan/Au...|  395|
|                BMW iX|  326|
|          Mercedes EQS|  281|
|            Kia e-Niro|  255|
|           Opel eCorsa|  247|
|  Peugeot E-208 / O...|  240|
|  Renault Megane E-...|  215|
|  Hyundai Ioniq 5 /...|  199|
|  Volvo EX30 / smar...|  197|
|       Fiat 500e 42kWh|  189|
+----------------------+-----+
only showing top 20 rows

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL|12232|
|                           50000.0|  487|
|              

Let's correct the `0.0` entries in `carChargeParameter_batteryCapacity`, they count as `Null` too.  
Same goes for `unknown` and `NotTransmitted` in `carChargeParameter_car`.  
Then, we show the updated occurrence counts.

In [24]:
# A battery capacity of 0.0 is treated as missing: replace it with null, then
# show the updated counts.
capacity_column = "carChargeParameter_batteryCapacity"
hypercharge_sessions_df = hypercharge_sessions_df.replace(0.0, None, capacity_column)
hypercharge_sessions_df.groupBy(capacity_column).count().orderBy(desc("count")).show()

# Same idea for the car model: these placeholder strings are treated as missing.
car_column = "carChargeParameter_car"
for placeholder in ('unknown', 'NotTransmitted'):
    hypercharge_sessions_df = hypercharge_sessions_df.replace(placeholder, None, car_column)
hypercharge_sessions_df.groupBy(car_column).count().orderBy(desc("count")).show()

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL|12314|
|                           50000.0|  487|
|                           79100.0|  433|
|                           42000.0|  189|
|                           23600.0|    8|
|                          135000.0|    3|
|                             153.0|    3|
|                           22800.0|    2|
|                           28000.0|    1|
|                           18700.0|    1|
|                           35500.0|    1|
+----------------------------------+-----+

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|                  NULL| 1782|
|  Tesla Model Y / 3 LR| 1504|
|  Skoda Enyaq IV / ...| 1335|
|  VW ID.4 / ID.5 GT...|  746|
|   Cupra Born | VW ID3|  642|
|                 Tesla|  597|
|               BMW iX1|  516|
|          BMW I4 G26 ?|  490|
|  Polestar 2

In [54]:
# Occurrence counts for the connector type and the two position columns, most
# frequent value first, one table per column.
# The last column is spelled `session_position`, exactly as it was aliased when
# the frame was built; the previous `session_Position` spelling only resolved
# because Spark matches column names case-insensitively by default.
for counted_column in ("type", "physicalPosition", "session_position"):
    hypercharge_sessions_df.groupby(counted_column).count().sort(desc("count")).show()

+--------+-----+
|    type|count|
+--------+-----+
|CCS2_400|13320|
|AC_Cable|   35|
| CHAdeMO|    5|
+--------+-----+

+----------------+-----+
|physicalPosition|count|
+----------------+-----+
|               4| 6720|
|               1| 6605|
|            NULL|   35|
+----------------+-----+

+----------------+-----+
|session_Position|count|
+----------------+-----+
|               2| 6755|
|               1| 6605|
+----------------+-----+



In [46]:
# Pull the two meter readings into pandas and show a few rows for eyeballing.
meter_readings_pdf = hypercharge_sessions_df.select(
    'session_meterStart', 'session_meterStop'
).toPandas()

# Fixed seed so the displayed sample is the same on every run; the row count is
# capped because pandas raises when asked for more rows than the frame holds.
meter_readings_pdf.sample(n=min(10, len(meter_readings_pdf)), random_state=42)

Unnamed: 0,session_meterStart,session_meterStop
951,120857960,120898840
4682,232498,232498
11750,41866088,41882432
3634,79945,79945
7660,30626630,30636604
12139,190681024,190746160
10886,2248856,2313505
5628,2505373,2505373
13226,43911248,43946464
6766,21303454,21327920


In [60]:
# Difference between the meter reading at the end and at the start of each
# session, stored as a new column.
# NOTE(review): the resulting magnitudes (mean around 30 000) look like Wh
# rather than kWh despite the column suffix. Confirm the unit of the meter
# readings and either divide by 1000 or rename the column.
meter_delta = col('session_meterStop') - col('session_meterStart')
hypercharge_sessions_df = hypercharge_sessions_df.withColumn('session_meter_diff_kWh', meter_delta)

# Summary statistics of the new column.
hypercharge_sessions_df.select('session_meter_diff_kWh').describe().show()

+-------+----------------------+
|summary|session_meter_diff_kWh|
+-------+----------------------+
|  count|                 13360|
|   mean|    30139.438098802395|
| stddev|    18570.603453299973|
|    min|                     0|
|    max|                338538|
+-------+----------------------+



***

In [49]:
# Full list of car models with their session counts, as a pandas frame.
car_counts_sdf = hypercharge_sessions_df.groupBy("carChargeParameter_car").count()
car_counts_sdf.toPandas()

Unnamed: 0,carChargeParameter_car,count
0,Tesla?,4
1,Porsche Taycan,26
2,Fisker Ocean,8
3,Kia e-Niro,255
4,Nio ET5/7,2
...,...,...
79,MG4 64kWh,128
80,Cupra Born | VW ID3,642
81,BMW iX,326
82,Renault Zoe CCS,88
