In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
from utilities import *
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

In [2]:
# Create SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("hypercharge_sessions")\
    .config("spark.driver.memory", "15g")\
    .config("spark.executor.memory", "15g") \
    .getOrCreate()

# set number of partitions
# spark.conf.set("spark.sql.shuffle.partitions", 10)

spark

***
# Read .parquet File

Reading the charging sessions parquet file and counting entries

In [3]:
hypercharge_sessions_df = spark.read.parquet(r"C:\Users\Gian\Desktop\Luiss\BigData\FreeToX_Project\dataset\full\hypercarge_sessions.parquet")
# hypercharge_sessions_df = spark.read.parquet('hypercarge_sessions.parquet')
hypercharge_sessions_df.count()

13442

# Dropping some stuff and renaming
Dropping `stackOszis` and `carChargeParameter`. The first is oscilloscope data, so not interesting, the second one is also not interesting and creates problems due to duplicate naming.

We then select only a subset of variables, which are the one we found interesting when reading the documentation. We also rename to avoid problems related to the `.` character in some column names. 

Then, we print the schema (type) of the remaining columns.

In [4]:
hypercharge_sessions_df = hypercharge_sessions_df.drop("carChargeParameter", "stackOszis")

hypercharge_sessions_df = hypercharge_sessions_df.select(
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince",
    "locationCountry", "endClientName", "distributorName", "corporationName", "operatorName", "type", "physicalPosition", 
    col("`session.averagePower`").alias("session_averagePower"), 
    col("`session.car`").alias("session_car"), 
    col("`session.distributorName`").alias("session_distributorName"), 
    col("`session.end`").alias("session_end"), 
    col("`session.meterStart`").alias("session_meterStart"), 
    col("`session.meterStop`").alias("session_meterStop"), 
    col("`session.physicalPosition`").alias("session_physicalPosition"), 
    col("`session.peakPower`").alias("session_peakPower"), 
    col("`session.start`").alias("session_start"), 
    col("`session.socStart`").alias("session_socStart"), 
    col("`session.socStop`").alias("session_socStop"), 
    col("`session.chargingSessionId`").alias("session_chargingSessionId"), 
    col("`session.type`").alias("session_type"), 
    col("`session.position`").alias("session_position"), 
    col("`session.transactionIdNew`").alias("session_transactionIdNew"), 
    col("`carChargeParameter.car`").alias("carChargeParameter_car"), 
    col("`carChargeParameter.excludeFromStatistics`").alias("carChargeParameter_excludeFromStatistics"), 
    col("`carChargeParameter.batteryCapacity`").alias("carChargeParameter_batteryCapacity")
)

hypercharge_sessions_df.printSchema()

root
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- locationCountry: string (nullable = true)
 |-- endClientName: integer (nullable = true)
 |-- distributorName: string (nullable = true)
 |-- corporationName: string (nullable = true)
 |-- operatorName: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- session_averagePower: integer (nullable = true)
 |-- session_car: integer (nullable = true)
 |-- session_distributorName: integer (nullable = true)
 |-- session_end: string (nullable = true)
 |-- session_meterStart: long (nullable = true)
 |-- session_meterStop: long (nullable = true)
 |-- session_physicalPosition: integer (nullable = true)
 |-- session_peakPower: integer (nullable = true)
 |-- session_start: s

# Dropping columns with just Nulls
Using some functions in `utilities`, we display the percentage of null values for each column, we then drop the columns with 100% nulls

In [5]:
session_nulls = null_percentage(hypercharge_sessions_df)
print("Percentage of null values in Session file:")
display_null_percentages(session_nulls)

Percentage of null values in Session file:
gpsLat                        3.45%
gpsLong                       3.45%
locationStreet                33.56%
locationZipCode               33.56%
locationTown                  33.56%
locationProvince              33.56%
locationCountry               8.98%
endClientName                 100.00%
distributorName               0.00%
corporationName               0.00%
operatorName                  100.00%
type                          0.00%
physicalPosition              0.26%
session_averagePower          100.00%
session_car                   100.00%
session_distributorName       100.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_physicalPosition      100.00%
session_peakPower             100.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_type                  100.00%
s

In [6]:
# drop columns with 100% Nulls
hypercharge_sessions_df = hypercharge_sessions_df.drop('endClientName', 'operatorName', 'session_averagePower', 
                                                       'session_car', 'session_distributorName', 'session_physicalPosition',
                                                       'session_peakPower','session_type', 'session_transactionIdNew')

# excludeFromStatistics, locationCountry, distributorName, corporationName
Now we explore some columns in more detail. First is `carChargeParameter_excludeFromStatistics`: we count occurrences of values to gauge how many sessions are deemed not relevant for statistics.

We also display occurrences of `distributorName` and `corporationName` to see if we could do some analysis based on distributor or corporation. It however turns out that almost all of them are `Free to X`, with a small minority of `Be Charge`, so not that interesting.

`locationCountry` is also not that useful, as all stations are in Italy.

In [7]:
hypercharge_sessions_df.groupby('locationCountry').count().show()
hypercharge_sessions_df.groupby('carChargeParameter_excludeFromStatistics').count().show()
hypercharge_sessions_df.groupby('distributorName').count().show()
hypercharge_sessions_df.groupby('corporationName').count().show()

+---------------+-----+
|locationCountry|count|
+---------------+-----+
|           NULL| 1207|
|          Italy|12235|
+---------------+-----+

+----------------------------------------+-----+
|carChargeParameter_excludeFromStatistics|count|
+----------------------------------------+-----+
|                                    NULL|   80|
|                                    true|    2|
|                                   false|13360|
+----------------------------------------+-----+

+----------------+-----+
| distributorName|count|
+----------------+-----+
|Free To X S.p.A.|13423|
|   Free To X Srl|   19|
+----------------+-----+

+----------------+-----+
| corporationName|count|
+----------------+-----+
|Free To X S.p.A.|13040|
|   Be charge Srl|  383|
|   Free To X Srl|   19|
+----------------+-----+



In [8]:
hypercharge_sessions_df=hypercharge_sessions_df.drop('locationCountry', 'distributorName', 'corporationName')

In [9]:
hypercharge_sessions_df.filter(
    hypercharge_sessions_df['carChargeParameter_excludeFromStatistics']==True).toPandas()

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,type,physicalPosition,session_end,session_meterStart,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity
0,,,,,,,CCS2_400,4,2024-07-15T16:20:17+00:00,15,8381,2024-07-15T16:15:48+00:00,2,100,69473769,2,EOL Test,True,0.0
1,,,,,,,CCS2_400,4,2024-07-15T16:57:10+00:00,17653,75056,2024-07-15T16:26:40+00:00,2,100,69475552,2,EOL Test,True,0.0


In [10]:
hypercharge_sessions_df.filter(
    hypercharge_sessions_df['carChargeParameter_excludeFromStatistics'].isNull()).toPandas().sample(10)

Unnamed: 0,gpsLat,gpsLong,locationStreet,locationZipCode,locationTown,locationProvince,type,physicalPosition,session_end,session_meterStart,session_meterStop,session_start,session_socStart,session_socStop,session_chargingSessionId,session_position,carChargeParameter_car,carChargeParameter_excludeFromStatistics,carChargeParameter_batteryCapacity
16,,,,,,,CCS2_400,4,2024-07-22T11:17:56+00:00,1547297,1547297,2024-07-22T11:15:44+00:00,0,0,70544864,2,,,
76,44.662385,10.858521,A1 - Area di Servizio Secchia Est,41123.0,Modena,Emilia-Romagna,CCS2_400,1,2024-08-25T22:04:25+00:00,43821824,43821824,2024-08-25T22:03:05+00:00,0,0,76032121,1,,,
14,45.76237,8.81087,Autostrada A8 Milano-Varese KM 43,21040.0,Cascine Maggio,Lombardia,CCS2_400,4,2024-07-20T17:22:08+00:00,68087016,68087016,2024-07-20T17:21:05+00:00,0,0,70282018,2,,,
43,44.66229,10.85801,Strada Tre Olmi 19,41123.0,Modena,Emilia-Romagna,CCS2_400,4,2024-08-02T15:38:26+00:00,182947456,182947456,2024-08-02T15:35:34+00:00,0,0,72369868,2,,,
23,44.66229,10.85801,Strada Tre Olmi 19,41123.0,Modena,Emilia-Romagna,CCS2_400,4,2024-07-27T09:53:15+00:00,180624064,180624064,2024-07-27T09:52:17+00:00,0,0,71332381,2,,,
44,45.57584,9.39376,"A4, AdS Brianza Nord",20864.0,Province of Monza and Brianza,Lombardy,CCS2_400,4,2024-08-02T16:08:04+00:00,109391944,109391944,2024-08-02T16:05:50+00:00,0,0,72376395,2,,,
42,43.583001,11.522125,Autostrada del Sole 331,52027.0,San Giovanni Valdarno,Toscana,CCS2_400,1,2024-08-01T08:48:16+00:00,106152992,106214304,2024-08-01T08:28:18+00:00,19,79,72111746,1,,,
1,43.65052,11.46442,Località Prulli di Sotto 105,50066.0,Città Metropolitana di Firenze,Toscana,CCS2_400,4,2024-07-14T09:02:58+00:00,68739936,68739936,2024-07-14T08:59:23+00:00,0,0,69226357,2,,,
48,44.427793,11.599278,via Madonnina 2101,40024.0,Città Metropolitana di Bologna,Emilia-Romagna,CCS2_400,1,2024-08-07T14:55:29+00:00,100109360,100109360,2024-08-07T14:54:27+00:00,0,0,73147409,1,,,
75,18.0,-45.0,,,,,CCS2_400,1,2024-08-24T11:40:18+00:00,8942376,8942376,2024-08-24T11:38:26+00:00,0,0,75782781,1,,,


In [11]:
hypercharge_sessions_exclude_df = hypercharge_sessions_df.filter( hypercharge_sessions_df['carChargeParameter_excludeFromStatistics']==True)

session_exclude_nulls = null_percentage(hypercharge_sessions_exclude_df)
print("Percentage of null values in Sessions to be excluded from statistics:")
display_null_percentages(session_exclude_nulls)

Percentage of null values in Sessions to be excluded from statistics:
gpsLat                        100.00%
gpsLong                       100.00%
locationStreet                100.00%
locationZipCode               100.00%
locationTown                  100.00%
locationProvince              100.00%
type                          0.00%
physicalPosition              0.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_position              0.00%
carChargeParameter_car        0.00%
carChargeParameter_excludeFromStatistics0.00%
carChargeParameter_batteryCapacity0.00%


In [12]:
hypercharge_sessions_exclude_df = hypercharge_sessions_df.filter(hypercharge_sessions_df['carChargeParameter_excludeFromStatistics'].isNull())

session_exclude_nulls = null_percentage(hypercharge_sessions_exclude_df)
print("Percentage of null values in Sessions that have Null in exclude from statistics:")
display_null_percentages(session_exclude_nulls)

Percentage of null values in Sessions that have Null in exclude from statistics:
gpsLat                        5.00%
gpsLong                       5.00%
locationStreet                28.75%
locationZipCode               28.75%
locationTown                  28.75%
locationProvince              28.75%
type                          0.00%
physicalPosition              0.00%
session_end                   0.00%
session_meterStart            0.00%
session_meterStop             0.00%
session_start                 0.00%
session_socStart              0.00%
session_socStop               0.00%
session_chargingSessionId     0.00%
session_position              0.00%
carChargeParameter_car        100.00%
carChargeParameter_excludeFromStatistics100.00%
carChargeParameter_batteryCapacity100.00%


In [13]:
hypercharge_sessions_df = hypercharge_sessions_df.filter(
    hypercharge_sessions_df.carChargeParameter_excludeFromStatistics == False)

***
# sessionStart, sessionEnd
Then, we want to convert the columns `session_start` and `session_end` to a useful timestamp format: from the earlier pandas print the format are strings like `2024-08-09T11:15:01+00:00`. So we want to escape the `T` and convert to a timestamp like `2024-08-09 11:15:01`. 

Note how data is GMT+0, it could be useful to convert to Rome summer time (GMT+2). Therefore, `2024-08-09T11:15:01+00:00` and up being `2024-08-09 13:15:01`

We then calculate the duration of the charging session, `session_duration_mins`

In [14]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).toPandas().sample(10)

Unnamed: 0,session_start,session_end
4992,2024-07-30T10:24:34+00:00,2024-07-30T11:01:52+00:00
5517,2024-08-01T06:50:01+00:00,2024-08-01T07:00:10+00:00
1641,2024-07-19T10:13:13+00:00,2024-07-19T10:16:20+00:00
1537,2024-07-19T09:35:00+00:00,2024-07-19T09:56:36+00:00
6753,2024-08-04T09:37:04+00:00,2024-08-04T09:57:39+00:00
10509,2024-08-17T12:54:35+00:00,2024-08-17T12:55:05+00:00
3836,2024-07-27T08:03:10+00:00,2024-07-27T08:17:45+00:00
5184,2024-07-30T11:47:32+00:00,2024-07-30T11:51:07+00:00
1881,2024-07-19T11:34:20+00:00,2024-07-19T11:51:06+00:00
1728,2024-07-19T10:37:46+00:00,2024-07-19T10:38:27+00:00


In [15]:
# Convert the session_start and session_end columns from string to timestamp with the timezone and seconds
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", to_timestamp("session_start", "yyyy-MM-dd'T'HH:mm:ssXXX")
).withColumn(
    "session_end", to_timestamp("session_end", "yyyy-MM-dd'T'HH:mm:ssXXX")
)

hypercharge_sessions_df.select("session_start", "session_end").show()

# Convert from GMT to Rome's time zone
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    "session_start", from_utc_timestamp("session_start", "Europe/Rome")
).withColumn(
    "session_end", from_utc_timestamp("session_end", "Europe/Rome")
)

# Show the result
hypercharge_sessions_df.select("session_start", "session_end").show()

+-------------------+-------------------+
|      session_start|        session_end|
+-------------------+-------------------+
|2024-07-14 08:01:27|2024-07-14 08:26:46|
|2024-07-14 08:01:27|2024-07-14 08:25:08|
|2024-07-14 08:01:57|2024-07-14 08:44:43|
|2024-07-14 08:02:59|2024-07-14 08:20:07|
|2024-07-14 08:03:24|2024-07-14 08:36:35|
|2024-07-14 08:05:42|2024-07-14 08:50:30|
|2024-07-14 08:06:43|2024-07-14 08:45:06|
|2024-07-14 08:08:40|2024-07-14 08:37:28|
|2024-07-14 08:09:23|2024-07-14 08:40:42|
|2024-07-14 08:13:11|2024-07-14 08:28:56|
|2024-07-14 08:13:30|2024-07-14 08:35:17|
|2024-07-14 08:14:44|2024-07-14 08:42:40|
|2024-07-14 08:15:17|2024-07-14 08:19:15|
|2024-07-14 08:16:34|2024-07-14 09:10:07|
|2024-07-14 08:16:43|2024-07-14 08:30:33|
|2024-07-14 08:17:29|2024-07-14 08:41:37|
|2024-07-14 08:17:54|2024-07-14 08:34:54|
|2024-07-14 08:18:23|2024-07-14 08:36:24|
|2024-07-14 08:19:00|2024-07-14 08:38:11|
|2024-07-14 08:21:00|2024-07-14 08:27:53|
+-------------------+-------------

In [16]:
hypercharge_sessions_df.select(
    "session_start", 'session_end'
).printSchema()

root
 |-- session_start: timestamp (nullable = true)
 |-- session_end: timestamp (nullable = true)



In [17]:
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    'session_duration_mins', (col('session_end').cast('long')-col('session_start').cast('long'))/60
)

hypercharge_sessions_df.select('session_duration_mins').describe().show()

+-------+---------------------+
|summary|session_duration_mins|
+-------+---------------------+
|  count|                13360|
|   mean|   24.473212325349316|
| stddev|   15.005184537542611|
|    min| 0.016666666666666666|
|    max|                115.4|
+-------+---------------------+



***
# SessionID
A brief look at the session ID range, to see if we could match it with other datasets

In [18]:
hypercharge_sessions_df.select(
    "session_chargingSessionId",
).describe().show()

+-------+-------------------------+
|summary|session_chargingSessionId|
+-------+-------------------------+
|  count|                    13360|
|   mean|       7.26581745351048E7|
| stddev|       2045508.5538464938|
|    min|                 69206115|
|    max|                 76056911|
+-------+-------------------------+



***
# car, batteryCapacity
Now a look ad `carChargeParameter_car` and `carChargeParameter_batteryCapacity`.  
The second one has unfortunately a lot of NAs (72%+all the `0.0` entries) but still it could be useful.

In [19]:
hypercharge_sessions_df.groupby("carChargeParameter_car").count().sort(desc("count")).show()
hypercharge_sessions_df.groupby("carChargeParameter_batteryCapacity").count().sort(desc("count")).show()

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|  Tesla Model Y / 3 LR| 1504|
|  Skoda Enyaq IV / ...| 1335|
|               unknown| 1322|
|  VW ID.4 / ID.5 GT...|  746|
|   Cupra Born | VW ID3|  642|
|                 Tesla|  597|
|               BMW iX1|  516|
|          BMW I4 G26 ?|  490|
|  Polestar 2 78kWh ...|  433|
|    Mercedes EQV / EQB|  427|
|  Porsche Taycan/Au...|  395|
|        NotTransmitted|  380|
|                BMW iX|  326|
|          Mercedes EQS|  281|
|            Kia e-Niro|  255|
|           Opel eCorsa|  247|
|  Peugeot E-208 / O...|  240|
|  Renault Megane E-...|  215|
|  Hyundai Ioniq 5 /...|  199|
|  Volvo EX30 / smar...|  197|
+----------------------+-----+
only showing top 20 rows

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL| 9656|
|                               0.0| 2576|
|              

Let's correct the `0.0` entries in `carChargeParameter_batteryCapacity`, they count as `Null` too.  
Same goes for `unknown` and `NotTransmitted` in `carChargeParameter_car`.  
Then, we show the updated occurrency counts.

In [20]:
hypercharge_sessions_df=hypercharge_sessions_df.replace(0.0, None, "carChargeParameter_batteryCapacity")
hypercharge_sessions_df.groupby("carChargeParameter_batteryCapacity").count().sort(desc("count")).show()

hypercharge_sessions_df=hypercharge_sessions_df.replace('unknown', None, "carChargeParameter_car")
hypercharge_sessions_df=hypercharge_sessions_df.replace('NotTransmitted', None, "carChargeParameter_car")
hypercharge_sessions_df.groupby("carChargeParameter_car").count().sort(desc("count")).show()

+----------------------------------+-----+
|carChargeParameter_batteryCapacity|count|
+----------------------------------+-----+
|                              NULL|12232|
|                           50000.0|  487|
|                           79100.0|  433|
|                           42000.0|  189|
|                           23600.0|    8|
|                          135000.0|    3|
|                             153.0|    3|
|                           22800.0|    2|
|                           28000.0|    1|
|                           18700.0|    1|
|                           35500.0|    1|
+----------------------------------+-----+

+----------------------+-----+
|carChargeParameter_car|count|
+----------------------+-----+
|                  NULL| 1702|
|  Tesla Model Y / 3 LR| 1504|
|  Skoda Enyaq IV / ...| 1335|
|  VW ID.4 / ID.5 GT...|  746|
|   Cupra Born | VW ID3|  642|
|                 Tesla|  597|
|               BMW iX1|  516|
|          BMW I4 G26 ?|  490|
|  Polestar 2

In [21]:
hypercharge_sessions_df.select("carChargeParameter_car").distinct().count()

83

In [22]:
hypercharge_sessions_df.groupby("carChargeParameter_car").count().toPandas().sort_values(by='count', ascending=False)

Unnamed: 0,carChargeParameter_car,count
27,,1702
5,Tesla Model Y / 3 LR,1504
65,Skoda Enyaq IV / VW ID.Buzz / Cupra Born / ID4...,1335
24,VW ID.4 / ID.5 GTX / Cupra Born 77kWh,746
79,Cupra Born | VW ID3,642
...,...,...
37,Polestar 2 78kWh | Volvo XC40 / C40,1
74,Honda e,1
15,Genesis G80 Electrified,1
38,Opel Ampera-e | Chevy Bolt EV,1


# type, physicalPosition, sessionPosition

In [23]:
hypercharge_sessions_df.groupby("type").count().sort(desc("count")).show()
hypercharge_sessions_df.groupby("physicalPosition").count().sort(desc("count")).show()
hypercharge_sessions_df.groupby("session_Position").count().sort(desc("count")).show()

+--------+-----+
|    type|count|
+--------+-----+
|CCS2_400|13320|
|AC_Cable|   35|
| CHAdeMO|    5|
+--------+-----+

+----------------+-----+
|physicalPosition|count|
+----------------+-----+
|               4| 6720|
|               1| 6605|
|            NULL|   35|
+----------------+-----+

+----------------+-----+
|session_Position|count|
+----------------+-----+
|               2| 6755|
|               1| 6605|
+----------------+-----+



***
# meterStart, meterStop

In [24]:
hypercharge_sessions_df.select('session_meterStart', 'session_meterStop').toPandas().sample(10)

Unnamed: 0,session_meterStart,session_meterStop
9912,22818066,22843524
7973,60775872,60808092
10938,89391976,89432360
11829,111468416,111479784
4704,54064700,54107536
7602,38478176,38500972
7516,21605420,21629744
936,38228684,38268484
3143,9157988,9221310
8959,123981416,124014384


In [25]:
hypercharge_sessions_df = hypercharge_sessions_df.withColumn(
    'session_meter_diff_kWh', (col('session_meterStop')-col('session_meterStart'))/1000
)
hypercharge_sessions_df.select('session_meter_diff_kWh').describe().show()

+-------+----------------------+
|summary|session_meter_diff_kWh|
+-------+----------------------+
|  count|                 13360|
|   mean|     30.13943809880248|
| stddev|    18.570603453300016|
|    min|                   0.0|
|    max|               338.538|
+-------+----------------------+



In [26]:
hypercharge_sessions_df = hypercharge_sessions_df.filter(col('session_meter_diff_kWh')!=0.0)
hypercharge_sessions_df.select('session_meter_diff_kWh').describe().show()

+-------+----------------------+
|summary|session_meter_diff_kWh|
+-------+----------------------+
|  count|                 12248|
|   mean|     32.87580772371009|
| stddev|    16.917824335792425|
|    min|                 0.001|
|    max|               338.538|
+-------+----------------------+



***
# State of Charge

In [27]:
hypercharge_sessions_df.select('session_socStart', 'session_socStop').toPandas().sample(10)

Unnamed: 0,session_socStart,session_socStop
10076,26,76
3389,39,56
302,26,88
6781,16,88
12181,21,88
1420,48,89
1721,55,99
5459,66,80
10843,32,93
5513,26,30


In [28]:
hypercharge_sessions_df.groupby("session_socStart").count().sort(desc("count")).show()
hypercharge_sessions_df.groupby("session_socStop").count().sort(desc("count")).show()

+----------------+-----+
|session_socStart|count|
+----------------+-----+
|              26|  300|
|              24|  275|
|              29|  267|
|              27|  261|
|              19|  256|
|              25|  252|
|              22|  250|
|              34|  248|
|              31|  248|
|              32|  246|
|              41|  242|
|              33|  239|
|              30|  237|
|              40|  236|
|              39|  235|
|              18|  234|
|              28|  233|
|              21|  232|
|              36|  229|
|              35|  224|
+----------------+-----+
only showing top 20 rows

+---------------+-----+
|session_socStop|count|
+---------------+-----+
|             80|  839|
|            100|  829|
|             99|  591|
|             90|  576|
|             81|  396|
|             85|  371|
|             95|  353|
|             84|  327|
|             79|  326|
|             86|  322|
|             87|  314|
|             98|  310|
|             

In [29]:
hypercharge_sessions_df=hypercharge_sessions_df.withColumn("soc_diff", col("session_socStop")-col("session_socStart"))
hypercharge_sessions_df.groupby("session_socStart").count().sort(desc("count")).show()
hypercharge_sessions_df.select("soc_diff").toPandas().sample(10)

+----------------+-----+
|session_socStart|count|
+----------------+-----+
|              26|  300|
|              24|  275|
|              29|  267|
|              27|  261|
|              19|  256|
|              25|  252|
|              22|  250|
|              34|  248|
|              31|  248|
|              32|  246|
|              41|  242|
|              33|  239|
|              30|  237|
|              40|  236|
|              39|  235|
|              18|  234|
|              28|  233|
|              21|  232|
|              36|  229|
|              35|  224|
+----------------+-----+
only showing top 20 rows



Unnamed: 0,soc_diff
7575,39
5530,80
4655,32
655,63
90,16
10216,52
471,53
11221,52
9660,16
8783,41


***
# GPS coordinates and addresses

In [30]:
hypercharge_sessions_df.select('gpsLat', 'gpsLong').describe().show()

+-------+------------------+------------------+
|summary|            gpsLat|           gpsLong|
+-------+------------------+------------------+
|  count|             11831|             11831|
|   mean| 43.11647419067993| 9.014087360123028|
| stddev|6.1391546575241245|10.320970218048606|
|    min|               2.0|             -50.0|
|    max|              60.0|         16.777855|
+-------+------------------+------------------+



In [31]:
# Define the boundary coordinates
north_lat = 47.0925
south_lat = 35.4900
east_long = 18.5194
west_long = 6.6261

# Filter the DataFrame to include only rows within these geographic bounds
hypercharge_sessions_df = hypercharge_sessions_df.filter(
    (hypercharge_sessions_df.gpsLat.between(south_lat, north_lat)) &
    (hypercharge_sessions_df.gpsLong.between(west_long, east_long))
)

hypercharge_sessions_df.select('gpsLat', 'gpsLong').describe().show()

+-------+------------------+------------------+
|summary|            gpsLat|           gpsLong|
+-------+------------------+------------------+
|  count|             11169|             11169|
|   mean| 43.97081261974521|11.419255757687841|
| stddev|1.5123194695655346| 2.101105435906287|
|    min| 40.83769660577199| 8.441119911448869|
|    max| 46.49905823466896|         16.777855|
+-------+------------------+------------------+



In [32]:
# Remove rows where either lat or long is null
hypercharge_sessions_df = hypercharge_sessions_df.filter(
    hypercharge_sessions_df.gpsLat.isNotNull() & 
    hypercharge_sessions_df.gpsLong.isNotNull()
)

# Create a df with rows where either locationStreet, locationZipCode, locationTown, or locationProvince is null
# we don't want to call the reverse geocode search where not necessary
filtered_df = hypercharge_sessions_df.filter(
    (hypercharge_sessions_df.locationStreet.isNull()) | 
    (hypercharge_sessions_df.locationZipCode.isNull()) | 
    (hypercharge_sessions_df.locationTown.isNull()) | 
    (hypercharge_sessions_df.locationProvince.isNull())
)

filtered_df.select('gpsLat', 'gpsLong', 'locationStreet', 'locationZipCode', 'locationTown', 'locationProvince').show()
filtered_df.count()

+---------------+---------------+--------------+---------------+------------+----------------+
|         gpsLat|        gpsLong|locationStreet|locationZipCode|locationTown|locationProvince|
+---------------+---------------+--------------+---------------+------------+----------------+
|46.206436157227|13.048324584961|          NULL|           NULL|        NULL|            NULL|
|      41.068329|       14.88722|          NULL|           NULL|        NULL|            NULL|
|        45.5756|         9.0086|          NULL|           NULL|        NULL|            NULL|
|      46.182976|      13.126373|          NULL|           NULL|        NULL|            NULL|
|      42.200748|       14.59728|          NULL|           NULL|        NULL|            NULL|
|46.206436157227|13.048324584961|          NULL|           NULL|        NULL|            NULL|
|46.206436157227|13.048324584961|          NULL|           NULL|        NULL|            NULL|
|      42.868913|      12.026451|          NULL|  

3097

In [33]:
filtered_df.printSchema()

root
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- session_end: timestamp (nullable = true)
 |-- session_meterStart: long (nullable = true)
 |-- session_meterStop: long (nullable = true)
 |-- session_start: timestamp (nullable = true)
 |-- session_socStart: long (nullable = true)
 |-- session_socStop: long (nullable = true)
 |-- session_chargingSessionId: long (nullable = true)
 |-- session_position: long (nullable = true)
 |-- carChargeParameter_car: string (nullable = true)
 |-- carChargeParameter_excludeFromStatistics: boolean (nullable = true)
 |-- carChargeParameter_batteryCapacity: double (nullable = true)
 |-- session_duration_mins: double (nullable = true)
 |-- session_meter_diff_

In [34]:
# Apply the UDF to fill null fields
updated_df = filtered_df.toPandas()

# Apply the geocoding function using Pandas apply and lambda
updated_df[['locationStreet', 'locationZipCode', 'locationTown', 'locationProvince']] = updated_df.apply(
    lambda row: pd.Series(get_address_from_coords(row['gpsLat'], row['gpsLong'])), axis=1
)

# Show a sample of the updated DataFrame
print(updated_df[["gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince"]].sample(10))

         gpsLat    gpsLong                     locationStreet locationZipCode  \
344   44.453754  11.280675                Autostrada del Sole           40033   
1783  44.452500  11.279600                Autostrada del Sole           40033   
478   43.071302  13.844789               Autostrada Adriatica           63827   
2877  45.559134   9.039494               Autostrada dei Laghi           20045   
806   44.874039   8.833501  Autostrada dei Giovi - Serravalle           15057   
2044  44.453754  11.280675                Autostrada del Sole           40033   
898   44.589635   8.664777             Autostrada dei Trafori           16060   
3092  45.613064   8.720427                     Cargo City Sud           21015   
2826  46.206436  13.048325                               None           33030   
512   44.428383  11.599576               Autostrada Adriatica           40064   

                 locationTown       locationProvince  
344       Casalecchio di Reno         Emilia-Romagna 

In [35]:
# Union with the original DataFrame to retain rows that already had full addresses
# Convert remaining rows (with full addresses) to Pandas
final_df = hypercharge_sessions_df.filter(
    (hypercharge_sessions_df.locationStreet.isNotNull()) & 
    (hypercharge_sessions_df.locationZipCode.isNotNull()) & 
    (hypercharge_sessions_df.locationTown.isNotNull()) & 
    (hypercharge_sessions_df.locationProvince.isNotNull())
).toPandas()

# Concatenate the final DataFrame with the updated DataFrame
final_df = pd.concat([final_df, updated_df], ignore_index=True)

# Show a sample of the final DataFrame
print(final_df[["gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince"]].sample(10))

          gpsLat    gpsLong                     locationStreet  \
1004   43.650520  11.464480      Località Prulli di Sotto 105    
151    45.729839   9.028009               A9, Lario Ovest 278    
3238   44.604692   8.953054           Autostrada dei Giovi A7    
6582   43.075102  13.842823          Autostrada Adriatica 290    
8618   46.182976  13.126373              Autostrada Alpe-Adria   
4118   41.251810  16.220100          Autostrada Adriatica 620    
4362   43.650520  11.464480      Località Prulli di Sotto 105    
10077  44.609688   8.661311                    Galleria Ciutti   
8524   42.032747  11.953672                 Autostrada Azzurra   
9059   44.874039   8.833501  Autostrada dei Giovi - Serravalle   

      locationZipCode                        locationTown  \
1004            50066      Città Metropolitana di Firenze   
151             22071                   Provincia di Como   
3238            16019                       Ronco Scrivia   
6582            63828        

***
# Save Cleaned Dataset

In [36]:
# Save the final DataFrame to CSV
final_df.to_csv("hypercharge_sessions_cleaned.csv", header=True, index=False)