In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from utilities import *

In [2]:
# Create SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("hypercharge_sessions")\
    .config("spark.driver.memory", "15g")\
    .getOrCreate()

# set number of partitions
# spark.conf.set("spark.sql.shuffle.partitions", 2)

spark

Reading the charging sessions parquet file and counting entries

In [3]:
hypercharge_sessions_df = spark.read.parquet(r"C:\Users\Gian\Desktop\Luiss\BigData\FreeToX_Project\dataset\full\hypercarge_sessions.parquet")
hypercharge_sessions_df.count()

13442

Dropping `stackOszis` and `carChargeParameter`. The first is oscilloscope data, so not interesting, the second one is also not interesting and creates problems due to duplicate naming.

We then select only a subset of variables, which are the one we found interesting when reading the documentation. We also rename to avoid problems related to the `.` character in some column names. 

Then, we print the schema (type) of the remaining columns.

In [4]:
hypercharge_sessions_df = hypercharge_sessions_df.drop("carChargeParameter", "stackOszis")

hypercharge_sessions_df = hypercharge_sessions_df.select(
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince",
    "locationCountry", "endClientName", "distributorName", "corporationName", "operatorName", "type", "physicalPosition", 
    col("`session.averagePower`").alias("session_averagePower"), 
    col("`session.car`").alias("session_car"), 
    col("`session.distributorName`").alias("session_distributorName"), 
    col("`session.end`").alias("session_end"), 
    col("`session.meterStart`").alias("session_meterStart"), 
    col("`session.meterStop`").alias("session_meterStop"), 
    col("`session.physicalPosition`").alias("session_physicalPosition"), 
    col("`session.peakPower`").alias("session_peakPower"), 
    col("`session.start`").alias("session_start"), 
    col("`session.socStart`").alias("session_socStart"), 
    col("`session.socStop`").alias("session_socStop"), 
    col("`session.chargingSessionId`").alias("session_chargingSessionId"), 
    col("`session.type`").alias("session_type"), 
    col("`session.position`").alias("session_position"), 
    col("`session.transactionIdNew`").alias("session_transactionIdNew"), 
    col("`carChargeParameter.car`").alias("carChargeParameter_car"), 
    col("`carChargeParameter.excludeFromStatistics`").alias("carChargeParameter_excludeFromStatistics"), 
    col("`carChargeParameter.batteryCapacity`").alias("carChargeParameter_batteryCapacity")
)

hypercharge_sessions_df.printSchema()

root
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- locationCountry: string (nullable = true)
 |-- endClientName: integer (nullable = true)
 |-- distributorName: string (nullable = true)
 |-- corporationName: string (nullable = true)
 |-- operatorName: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- session_averagePower: integer (nullable = true)
 |-- session_car: integer (nullable = true)
 |-- session_distributorName: integer (nullable = true)
 |-- session_end: string (nullable = true)
 |-- session_meterStart: long (nullable = true)
 |-- session_meterStop: long (nullable = true)
 |-- session_physicalPosition: integer (nullable = true)
 |-- session_peakPower: integer (nullable = true)
 |-- session_start: s

Using some functions in `utilities`, we display the percentage of null values for each column, we then drop the columns with 100% nulls

In [5]:
session_nulls = null_percentage(hypercharge_sessions_df)
print("Percentage of null values in Session file:")
display_null_percentages(session_nulls)

Percentage of null values in Session file:
gpsLat                        3.45%
gpsLong                       3.45%
locationStreet                33.56%
locationZipCode               33.56%
locationTown                  33.56%
locationProvince              33.56%
locationCountry               8.98%
endClientName                 100.00%
distributorName               0.00%
corporationName               0.00%
operatorName                  100.00%
type                          0.00%
physicalPosition              0.26%
session_averagePower          100.00%
session_car                   100.00%
session_distributorName       100.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_physicalPosition      100.00%
session_peakPower             100.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_type                  100.00%
s

In [6]:
# drop columns with 100% Nulls
hypercharge_sessions_df = hypercharge_sessions_df.drop('endClientName', 'operatorName', 'session_averagePower', 
                                                       'session_car', 'session_distributorName', 'session_physicalPosition',
                                                       'session_peakPower','session_type', 'session_transactionIdNew')

Now we explore some columns in more detailed. First is `carChargeParameter_excludeFromStatistics`: we count occurrences of values to gauge how many sessions are deemed not relevant for statistics.

We also display occurrences of `distributorName` and `corporationName` to see if we could do some analysis based on distributor or corporation. It however turns out that almost all of them are `Free to X`, with a small minority of `Be Charge`, so not that interesting.

In [7]:
hypercharge_sessions_df.groupby('carChargeParameter_excludeFromStatistics').count().show()
hypercharge_sessions_df.groupby('distributorName').count().show()
hypercharge_sessions_df.groupby('corporationName').count().show()

+----------------------------------------+-----+
|carChargeParameter_excludeFromStatistics|count|
+----------------------------------------+-----+
|                                    NULL|   80|
|                                    true|    2|
|                                   false|13360|
+----------------------------------------+-----+

+----------------+-----+
| distributorName|count|
+----------------+-----+
|Free To X S.p.A.|13423|
|   Free To X Srl|   19|
+----------------+-----+

+----------------+-----+
| corporationName|count|
+----------------+-----+
|Free To X S.p.A.|13040|
|   Be charge Srl|  383|
|   Free To X Srl|   19|
+----------------+-----+



In [8]:
hypercharge_sessions_df=hypercharge_sessions_df.drop('distributorName', 'corporationName')

In [9]:
hypercharge_sessions_df.filter(
    hypercharge_sessions_df['carChargeParameter_excludeFromStatistics'].isNull()).toPandas().sample(10)

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,locationCountry,type,physicalPosition,session_end,session_meterStart,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity
37,44.661782,10.855574,"A1, AdS Secchia Ovest 506",41123.0,Provincia di Modena,Emilia-Romagna,Italy,CCS2_400,1,2024-08-01T07:42:46+00:00,116999146,117005466,2024-08-01T07:37:11+00:00,34,52,72103945,1,,,
2,43.071302,13.844789,,,,,Italy,CCS2_400,1,2024-07-14T10:06:23+00:00,20495100,20495100,2024-07-14T10:03:04+00:00,0,0,69239529,1,,,
30,44.151211,11.193671,"A1 Km 01, Variante di Valico",40035.0,Castiglione dei Pepoli,Emilia-Romagna,Italy,CCS2_400,1,2024-07-30T12:03:30+00:00,51127864,51157472,2024-07-30T11:30:49+00:00,60,97,71843531,1,,,
0,44.589635,8.664777,,,,,Italy,CCS2_400,1,2024-07-14T07:08:35+00:00,57345668,57345668,2024-07-14T07:07:08+00:00,0,0,69210828,1,,,
5,45.773998,9.04934,,,,,Italy,CCS2_400,4,2024-07-15T17:44:22+00:00,95902752,95902752,2024-07-15T17:41:56+00:00,0,0,69486178,2,,,
77,43.619242,13.336944,,,,,Italy,CCS2_400,1,2024-08-26T07:07:53+00:00,26304624,26317800,2024-08-26T06:39:43+00:00,71,100,76050067,1,,,
9,44.661803,10.854625,"A1, AdS Secchia Ovest 506",41123.0,Provincia di Modena,Emilia-Romagna,Italy,CCS2_400,4,2024-07-19T11:10:26+00:00,89892372,89919530,2024-07-19T10:50:43+00:00,20,62,70016481,2,,,
64,45.559134,9.039494,,,,,Italy,CCS2_400,4,2024-08-16T07:52:51+00:00,11103185,11103185,2024-08-16T07:50:56+00:00,0,0,74479049,2,,,
54,44.662237,10.85801,Autostrada del Sole 1,41123.0,Modena,Emilia-Romagna,Italy,CCS2_400,4,2024-08-12T12:11:37+00:00,44717496,44717496,2024-08-12T12:10:22+00:00,0,0,73937730,2,,,
19,43.65052,11.46442,Località Prulli di Sotto 105,50066.0,Città Metropolitana di Firenze,Toscana,Italy,CCS2_400,1,2024-07-25T14:56:18+00:00,109146600,109146600,2024-07-25T14:55:47+00:00,25,25,71044766,1,,,


Then, we want to convert the columns `session_start` and `session_end` to a useful timestamp format: from the earlier pandas print the format are strings like `2024-08-09T11:15:01+00:00`. So we want to escape the `T` and convert to a timestamp like `2024-08-09 11:15:01`. 

Note how data is GMT+0, it could be useful to convert to Rome summer time (GMT+2). Therefore, `2024-08-09T11:15:01+00:00` and up being `2024-08-09 13:15:01`

We then calculate the duration of the charging session, `session_duration_mins`

In [10]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).toPandas().sample(10)

Unnamed: 0,session_start,session_end
13037,2024-08-25T22:10:17+00:00,2024-08-25T22:41:51+00:00
1307,2024-07-17T15:11:01+00:00,2024-07-17T15:50:21+00:00
3267,2024-07-24T07:49:38+00:00,2024-07-24T08:11:32+00:00
8303,2024-08-09T12:38:39+00:00,2024-08-09T12:52:05+00:00
7505,2024-08-07T14:24:58+00:00,2024-08-07T14:49:13+00:00
946,2024-07-15T19:01:17+00:00,2024-07-15T19:16:06+00:00
3411,2024-07-25T15:03:41+00:00,2024-07-25T15:49:10+00:00
445,2024-07-14T09:23:15+00:00,2024-07-14T10:02:47+00:00
5771,2024-08-01T08:17:42+00:00,2024-08-01T08:49:58+00:00
2530,2024-07-22T10:08:20+00:00,2024-07-22T10:46:57+00:00


In [11]:
# Convert the session_start and session_end columns from string to timestamp with the timezone and seconds
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", to_timestamp("session_start", "yyyy-MM-dd'T'HH:mm:ssXXX")
).withColumn(
    "session_end", to_timestamp("session_end", "yyyy-MM-dd'T'HH:mm:ssXXX")
)

hypercharge_sessions_df.select("session_start", "session_end").show()

# Convert from GMT to Rome's time zone
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", from_utc_timestamp("session_start", "Europe/Rome")
).withColumn(
    "session_end", from_utc_timestamp("session_end", "Europe/Rome")
)

# Show the result
hypercharge_sessions_df.select("session_start", "session_end").show()

+-------------------+-------------------+
|      session_start|        session_end|
+-------------------+-------------------+
|2024-07-14 08:01:27|2024-07-14 08:26:46|
|2024-07-14 08:01:27|2024-07-14 08:25:08|
|2024-07-14 08:01:57|2024-07-14 08:44:43|
|2024-07-14 08:02:59|2024-07-14 08:20:07|
|2024-07-14 08:03:24|2024-07-14 08:36:35|
|2024-07-14 08:05:42|2024-07-14 08:50:30|
|2024-07-14 08:06:43|2024-07-14 08:45:06|
|2024-07-14 08:08:40|2024-07-14 08:37:28|
|2024-07-14 08:09:23|2024-07-14 08:40:42|
|2024-07-14 08:13:11|2024-07-14 08:28:56|
|2024-07-14 08:13:30|2024-07-14 08:35:17|
|2024-07-14 08:14:44|2024-07-14 08:42:40|
|2024-07-14 08:15:17|2024-07-14 08:19:15|
|2024-07-14 08:16:34|2024-07-14 09:10:07|
|2024-07-14 08:16:43|2024-07-14 08:30:33|
|2024-07-14 08:17:29|2024-07-14 08:41:37|
|2024-07-14 08:17:54|2024-07-14 08:34:54|
|2024-07-14 08:18:23|2024-07-14 08:36:24|
|2024-07-14 08:19:00|2024-07-14 08:38:11|
|2024-07-14 08:21:00|2024-07-14 08:27:53|
+-------------------+-------------

In [12]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).printSchema()

root
 |-- session_start: timestamp (nullable = true)
 |-- session_end: timestamp (nullable = true)



In [13]:
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    'session_duration_mins', (col('session_end').cast('long')-col('session_start').cast('long'))/60
)

hypercharge_sessions_df.select('session_duration_mins').describe().show()

+-------+---------------------+
|summary|session_duration_mins|
+-------+---------------------+
|  count|                13442|
|   mean|    24.36972672717355|
| stddev|    15.05030696979774|
|    min| 0.016666666666666666|
|    max|                115.4|
+-------+---------------------+



A brief look at the session ID range, to see if we could match it with other datasets

In [14]:
hypercharge_sessions_df.select(
    "session_chargingSessionId",
).describe().show()

+-------+-------------------------+
|summary|session_chargingSessionId|
+-------+-------------------------+
|  count|                    13442|
|   mean|      7.265700143884839E7|
| stddev|       2045333.0978502438|
|    min|                 69206115|
|    max|                 76056911|
+-------+-------------------------+



Now a look ad `carChargeParameter_car` and `carChargeParameter_batteryCapacity`. The second one has unfortunately a lot of NAs (72%+all the `0.0` entries) but still it could be useful.

In [17]:
hypercharge_sessions_df.groupby("carChargeParameter_car").count().sort(desc("count")).show()
hypercharge_sessions_df.groupby("carChargeParameter_batteryCapacity").count().sort(desc("count")).show()

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|  Tesla Model Y / 3 LR| 1504|
|  Skoda Enyaq IV / ...| 1335|
|               unknown| 1322|
|  VW ID.4 / ID.5 GT...|  746|
|   Cupra Born | VW ID3|  642|
|                 Tesla|  597|
|               BMW iX1|  516|
|          BMW I4 G26 ?|  490|
|  Polestar 2 78kWh ...|  433|
|    Mercedes EQV / EQB|  427|
|  Porsche Taycan/Au...|  395|
|        NotTransmitted|  380|
|                BMW iX|  326|
|          Mercedes EQS|  281|
|            Kia e-Niro|  255|
|           Opel eCorsa|  247|
|  Peugeot E-208 / O...|  240|
|  Renault Megane E-...|  215|
|  Hyundai Ioniq 5 /...|  199|
|  Volvo EX30 / smar...|  197|
+----------------------+-----+
only showing top 20 rows

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL| 9736|
|                               0.0| 2578|
|              

Let's correct the `0.0` entries in `carChargeParameter_batteryCapacity`, they count as `Null` too

In [22]:
hypercharge_sessions_df=hypercharge_sessions_df.replace(0.0, None, "carChargeParameter_batteryCapacity")
hypercharge_sessions_df.groupby("carChargeParameter_batteryCapacity").count().sort(desc("count")).show()

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL|12314|
|                           50000.0|  487|
|                           79100.0|  433|
|                           42000.0|  189|
|                           23600.0|    8|
|                          135000.0|    3|
|                             153.0|    3|
|                           22800.0|    2|
|                           28000.0|    1|
|                           18700.0|    1|
|                           35500.0|    1|
+----------------------------------+-----+

