In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from utilities import *

In [13]:
# Create SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("hypercharge_sessions")\
    .config("spark.driver.memory", "15g")\
    .getOrCreate()

# set number of partitions
# spark.conf.set("spark.sql.shuffle.partitions", 2)

spark

Reading the charging sessions parquet file and counting entries

In [14]:
hypercharge_sessions_df = spark.read.parquet(r"C:\Users\Gian\Desktop\Luiss\BigData\FreeToX_Project\dataset\full\hypercarge_sessions.parquet")
hypercharge_sessions_df.count()

13442

Dropping `stackOszis` and `carChargeParameter`. The first is oscilloscope data, so not interesting, the second one is also not interesting and creates problems due to duplicate naming.

We then select only a subset of variables, which are the one we found interesting when reading the documentation. We also rename to avoid problems related to the `.` character in some column names. 

Then, we print the schema (type) of the remaining columns.

In [15]:
hypercharge_sessions_df = hypercharge_sessions_df.drop("carChargeParameter", "stackOszis")

hypercharge_sessions_df = hypercharge_sessions_df.select(
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince",
    "locationCountry", "endClientName", "distributorName", "corporationName", "operatorName", "type", "physicalPosition", 
    col("`session.averagePower`").alias("session_averagePower"), 
    col("`session.car`").alias("session_car"), 
    col("`session.distributorName`").alias("session_distributorName"), 
    col("`session.end`").alias("session_end"), 
    col("`session.meterStart`").alias("session_meterStart"), 
    col("`session.meterStop`").alias("session_meterStop"), 
    col("`session.physicalPosition`").alias("session_physicalPosition"), 
    col("`session.peakPower`").alias("session_peakPower"), 
    col("`session.start`").alias("session_start"), 
    col("`session.socStart`").alias("session_socStart"), 
    col("`session.socStop`").alias("session_socStop"), 
    col("`session.chargingSessionId`").alias("session_chargingSessionId"), 
    col("`session.type`").alias("session_type"), 
    col("`session.position`").alias("session_position"), 
    col("`session.transactionIdNew`").alias("session_transactionIdNew"), 
    col("`carChargeParameter.car`").alias("carChargeParameter_car"), 
    col("`carChargeParameter.excludeFromStatistics`").alias("carChargeParameter_excludeFromStatistics"), 
    col("`carChargeParameter.batteryCapacity`").alias("carChargeParameter_batteryCapacity")
)

hypercharge_sessions_df.printSchema()

root
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- locationCountry: string (nullable = true)
 |-- endClientName: integer (nullable = true)
 |-- distributorName: string (nullable = true)
 |-- corporationName: string (nullable = true)
 |-- operatorName: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- session_averagePower: integer (nullable = true)
 |-- session_car: integer (nullable = true)
 |-- session_distributorName: integer (nullable = true)
 |-- session_end: string (nullable = true)
 |-- session_meterStart: long (nullable = true)
 |-- session_meterStop: long (nullable = true)
 |-- session_physicalPosition: integer (nullable = true)
 |-- session_peakPower: integer (nullable = true)
 |-- session_start: s

Using some functions in `utilities`, we display the percentage of null values for each column, we then drop the columns with 100% nulls

In [16]:
session_nulls = null_percentage(hypercharge_sessions_df)
print("Percentage of null values in Session file:")
display_null_percentages(session_nulls)

Percentage of null values in Session file:
gpsLat                        3.45%
gpsLong                       3.45%
locationStreet                33.56%
locationZipCode               33.56%
locationTown                  33.56%
locationProvince              33.56%
locationCountry               8.98%
endClientName                 100.00%
distributorName               0.00%
corporationName               0.00%
operatorName                  100.00%
type                          0.00%
physicalPosition              0.26%
session_averagePower          100.00%
session_car                   100.00%
session_distributorName       100.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_physicalPosition      100.00%
session_peakPower             100.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_type                  100.00%
s

In [17]:
# drop columns with 100% Nulls
hypercharge_sessions_df = hypercharge_sessions_df.drop('endClientName', 'operatorName', 'session_averagePower', 
                                                       'session_car', 'session_distributorName', 'session_physicalPosition',
                                                       'session_peakPower','session_type', 'session_transactionIdNew')

Now we explore some columns in more detailed. First is `carChargeParameter_excludeFromStatistics`: we count occurrences of values to gauge how many sessions are deemed not relevant for statistics.

We also display occurrences of `distributorName` and `corporationName` to see if we could do some analysis based on distributor or corporation. It however turns out that almost all of them are `Free to X`, with a small minority of `Be Charge`, so not that interesting.

In [18]:
hypercharge_sessions_df.groupby('carChargeParameter_excludeFromStatistics').count().show()
hypercharge_sessions_df.groupby('distributorName').count().show()
hypercharge_sessions_df.groupby('corporationName').count().show()

+----------------------------------------+-----+
|carChargeParameter_excludeFromStatistics|count|
+----------------------------------------+-----+
|                                    NULL|   80|
|                                    true|    2|
|                                   false|13360|
+----------------------------------------+-----+

+----------------+-----+
| distributorName|count|
+----------------+-----+
|Free To X S.p.A.|13423|
|   Free To X Srl|   19|
+----------------+-----+

+----------------+-----+
| corporationName|count|
+----------------+-----+
|Free To X S.p.A.|13040|
|   Be charge Srl|  383|
|   Free To X Srl|   19|
+----------------+-----+



In [25]:
hypercharge_sessions_df=hypercharge_sessions_df.drop('distributorName', 'corporationName')

In [19]:
hypercharge_sessions_df.filter(
    hypercharge_sessions_df['carChargeParameter_excludeFromStatistics'].isNull()).toPandas().sample(10)

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,locationCountry,distributorName,corporationName,type,...,session_meterStart,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity
11,45.729051,9.028446,Autostrada dei Laghi km 29,22071.0,Provincia di Como,Lombardia,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,82783008,82783008,2024-07-20T15:40:16+00:00,0,0,70264634,1,,,
36,44.427793,11.599278,via Madonnina 2101,40024.0,Città Metropolitana di Bologna,Emilia-Romagna,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,97726904,97740856,2024-08-01T06:50:41+00:00,51,70,72097886,1,,,
53,43.65052,11.46448,Località Prulli di Sotto 105,50066.0,Città Metropolitana di Firenze,Toscana,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,112615464,112615464,2024-08-12T11:56:52+00:00,0,0,73935326,1,,,
72,,,,,,,,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,7131678,7131678,2024-08-22T19:52:43+00:00,0,0,75530013,1,,,
25,44.66229,10.85797,Strada Tre Olmi 19,41123.0,Modena,Emilia-Romagna,Italy,Free To X S.p.A.,Be charge Srl,CCS2_400,...,186628272,186628272,2024-07-27T10:07:24+00:00,0,0,71336405,1,,,
8,,,,,,,,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,2983377,3024532,2024-07-19T10:45:20+00:00,17,81,70015270,1,,,
24,44.428896,11.60029,Autostrada A14,40024.0,Provincia di Bologna,Emilia-Romagna,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,75966592,75966592,2024-07-27T10:01:34+00:00,0,0,71334847,1,,,
73,50.0,-25.0,,,,,,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,1604783,1604783,2024-08-24T10:21:09+00:00,0,0,75765042,1,,,
0,44.589635,8.664777,,,,,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,57345668,57345668,2024-07-14T07:07:08+00:00,0,0,69210828,1,,,
66,45.729839,9.028009,"A9, Lario Ovest 278",22071.0,Provincia di Como,Lombardia,Italy,Free To X S.p.A.,Free To X S.p.A.,CCS2_400,...,90608032,90608032,2024-08-17T13:06:43+00:00,0,0,74723206,1,,,


Then, we want to convert the columns `session_start` and `session_end` to a useful timestamp format: from the earlier pandas print the format are strings like `2024-08-09T11:15:01+00:00`. So we want to escape the `T` and convert to a timestamp like `2024-08-09 11:15:01`. 

Note how data is GMT+0, it could be useful to convert to Rome summer time (GMT+2). Therefore, `2024-08-09T11:15:01+00:00` and up being `2024-08-09 13:15:01`

We then calculate the duration of the charging session, `session_duration_mins`

In [20]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).toPandas().sample(10)

Unnamed: 0,session_start,session_end
1573,2024-07-19T09:45:34+00:00,2024-07-19T10:20:59+00:00
12623,2024-08-24T11:29:50+00:00,2024-08-24T11:54:29+00:00
11324,2024-08-19T08:34:29+00:00,2024-08-19T09:01:50+00:00
6593,2024-08-04T08:37:53+00:00,2024-08-04T09:09:21+00:00
10925,2024-08-17T14:33:15+00:00,2024-08-17T15:04:33+00:00
9868,2024-08-15T20:44:15+00:00,2024-08-15T20:59:58+00:00
719,2024-07-15T16:31:26+00:00,2024-07-15T17:03:47+00:00
12210,2024-08-22T18:48:25+00:00,2024-08-22T19:04:04+00:00
267,2024-07-14T08:25:34+00:00,2024-07-14T09:01:19+00:00
5355,2024-07-30T12:43:12+00:00,2024-07-30T13:24:08+00:00


In [21]:
# Convert the session_start and session_end columns from string to timestamp with the timezone and seconds
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", to_timestamp("session_start", "yyyy-MM-dd'T'HH:mm:ssXXX")
).withColumn(
    "session_end", to_timestamp("session_end", "yyyy-MM-dd'T'HH:mm:ssXXX")
)

hypercharge_sessions_df.select("session_start", "session_end").show()

# Convert from GMT to Rome's time zone
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", from_utc_timestamp("session_start", "Europe/Rome")
).withColumn(
    "session_end", from_utc_timestamp("session_end", "Europe/Rome")
)

# Show the result
hypercharge_sessions_df.select("session_start", "session_end").show()

+-------------------+-------------------+
|      session_start|        session_end|
+-------------------+-------------------+
|2024-07-14 08:01:27|2024-07-14 08:26:46|
|2024-07-14 08:01:27|2024-07-14 08:25:08|
|2024-07-14 08:01:57|2024-07-14 08:44:43|
|2024-07-14 08:02:59|2024-07-14 08:20:07|
|2024-07-14 08:03:24|2024-07-14 08:36:35|
|2024-07-14 08:05:42|2024-07-14 08:50:30|
|2024-07-14 08:06:43|2024-07-14 08:45:06|
|2024-07-14 08:08:40|2024-07-14 08:37:28|
|2024-07-14 08:09:23|2024-07-14 08:40:42|
|2024-07-14 08:13:11|2024-07-14 08:28:56|
|2024-07-14 08:13:30|2024-07-14 08:35:17|
|2024-07-14 08:14:44|2024-07-14 08:42:40|
|2024-07-14 08:15:17|2024-07-14 08:19:15|
|2024-07-14 08:16:34|2024-07-14 09:10:07|
|2024-07-14 08:16:43|2024-07-14 08:30:33|
|2024-07-14 08:17:29|2024-07-14 08:41:37|
|2024-07-14 08:17:54|2024-07-14 08:34:54|
|2024-07-14 08:18:23|2024-07-14 08:36:24|
|2024-07-14 08:19:00|2024-07-14 08:38:11|
|2024-07-14 08:21:00|2024-07-14 08:27:53|
+-------------------+-------------

In [22]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).printSchema()

root
 |-- session_start: timestamp (nullable = true)
 |-- session_end: timestamp (nullable = true)



In [23]:
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    'session_duration_mins', (col('session_end').cast('long')-col('session_start').cast('long'))/60
)

hypercharge_sessions_df.select('session_duration_mins').describe().show()

+-------+---------------------+
|summary|session_duration_mins|
+-------+---------------------+
|  count|                13442|
|   mean|    24.36972672717355|
| stddev|    15.05030696979774|
|    min| 0.016666666666666666|
|    max|                115.4|
+-------+---------------------+



A brief look at the session ID range, to see if we could match it with other datasets

In [24]:
hypercharge_sessions_df.select(
    "session_chargingSessionId",
).describe().show()

+-------+-------------------------+
|summary|session_chargingSessionId|
+-------+-------------------------+
|  count|                    13442|
|   mean|      7.265700143884839E7|
| stddev|       2045333.0978502438|
|    min|                 69206115|
|    max|                 76056911|
+-------+-------------------------+



In [27]:
hypercharge_sessions_df.groupby("carChargeParameter_car").count().show()
hypercharge_sessions_df.groupby("carChargeParameter_batteryCapacity").count().show()

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|                Tesla?|    4|
|        Porsche Taycan|   26|
|          Fisker Ocean|    8|
|            Kia e-Niro|  255|
|             Nio ET5/7|    2|
|  Tesla Model Y / 3 LR| 1504|
|        Audi Q8 e-tron|  106|
|  Renault Megane E-...|  215|
|               BMW iX3|   84|
|          Mercedes EQS|  281|
|           Opel eCorsa|  247|
|  Tesla Model Y LFP...|  163|
|               VW ID.4|    3|
|               Nio ES8|    1|
|  Audi e-tron/ VW I...|   68|
|  Genesis G80 Elect...|    1|
|    Audi E-tron Coupe?|  187|
|    Scania Truck BEV 3|    2|
|  VW ID.4/ID.3 77kW...|   21|
|               unknown| 1322|
+----------------------+-----+
only showing top 20 rows

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                               0.0| 2578|
|                           28000.0|    1|
|              