In [100]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col

In [101]:
spark = SparkSession.builder.appName("BigDataTask1").getOrCreate()

# Load CSV files
hypercharge_location = spark.read.csv("C:/Users/david/PycharmProjects/Big Data Project\dataset/full/hypercarge_locations.csv", header=True, inferSchema=True)
cdr = spark.read.csv("C:/Users/david\PycharmProjects/Big Data Project/dataset/full/cdr.csv", header=True, inferSchema=True)
pdr = spark.read.csv("C:/Users/david/PycharmProjects/Big Data Project/dataset/full/pdr_locations.csv", header=True, inferSchema=True)

# Load Parquet file
hypercharge_sessions = spark.read.parquet("C:/Users/david/PycharmProjects/Big Data Project/dataset/full/hypercarge_sessions.parquet")

In [67]:
# Display basic information about the dataframes
print("Basic Information")
hypercharge_sessions.printSchema()
cdr.printSchema()
pdr.printSchema()
hypercharge_location.printSchema()

Basic Information
root
 |-- serialNumber: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLong: double (nullable = true)
 |-- locationStreet: string (nullable = true)
 |-- locationZipCode: string (nullable = true)
 |-- locationTown: string (nullable = true)
 |-- locationProvince: string (nullable = true)
 |-- locationCountry: string (nullable = true)
 |-- locationUpdateNote: string (nullable = true)
 |-- endClientName: integer (nullable = true)
 |-- distributorName: string (nullable = true)
 |-- corporationName: string (nullable = true)
 |-- operatorName: integer (nullable = true)
 |-- lendeeName: integer (nullable = true)
 |-- evId: string (nullable = true)
 |-- type: string (nullable = true)
 |-- physicalPosition: long (nullable = true)
 |-- cableLength: long (nullable = true)
 |-- producer: string (nullable = true)
 |-- chargingSessionGraphData: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- currentEv: double (nullable =

In [102]:
hypercharge_sessions = hypercharge_sessions.select(
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", "locationProvince",
    "locationCountry", "endClientName", "distributorName", "corporationName", "operatorName", "type", "physicalPosition", 
    col("`session.averagePower`").alias("session_averagePower"), 
    col("`session.car`").alias("session_car"), 
    col("`session.distributorName`").alias("session_distributorName"), 
    col("`session.end`").alias("session_end"), 
    col("`session.meterStart`").alias("session_meterStart"), 
    col("`session.meterStop`").alias("session_meterStop"), 
    col("`session.physicalPosition`").alias("session_physicalPosition"), 
    col("`session.peakPower`").alias("session_peakPower"), 
    col("`session.start`").alias("session_start"), 
    col("`session.socStart`").alias("session_socStart"), 
    col("`session.socStop`").alias("session_socStop"), 
    col("`session.chargingSessionId`").alias("session_chargingSessionId"), 
    col("`session.type`").alias("session_type"), 
    col("`session.position`").alias("session_position"), 
    col("`session.transactionIdNew`").alias("session_transactionIdNew"), 
    col("`carChargeParameter.car`").alias("carChargeParameter_car"), 
    col("`carChargeParameter.excludeFromStatistics`").alias("carChargeParameter_excludeFromStatistics"), 
    col("`carChargeParameter.batteryCapacity`").alias("carChargeParameter_batteryCapacity")
)

In [103]:
location_variables = [
    "chargerId", "numberStacks", "chassis", "isPublic", "chargePointIdentity", 
    "customerIccid", "locationTown", "locationZipCode", "locationProvince", 
    "locationCountry", "isRemoteLocation", "outletList", "status_position", 
    "status_status", "endClientId", "surroundingChargers"
]
hypercharge_location = hypercharge_location.select(*location_variables)

pdr_variables = [
    "station_uid", "station_address", "station_street_number", "station_postal_code", 
    "station_city", "station_country_id", "station_type_name", "station_brand", 
    "station_model", "station_commissioning_date", "station_is_at_home", 
    "station_owner_company_uid", "station_owner_company_name", 
    "station_owner_company_stars", "station_owner_company_is_always_open", 
    "station_owner_company_is_shopping_center", "station_owner_company_roaming", 
    "station_owner_company_pay_description", "station_owner_company_pois", 
    "station_owner_company_experiences", "station_owner_company_sustainability_profile", 
    "station_owner_company_distances", "station_owner_company_show_advanced_services", 
    "station_owner_company_show_roaming", "station_owner_company_show_map", 
    "station_owner_company_keyfob_fee", "station_owner_company_owner_cost_per_kwh", 
    "plugs", "sessions", "totEnergy", "totEnergyLocal", "totEnergyNotLocal", 
    "totSessions", "totSessionsNotLocal", "totSessionsLocal", "totCost", 
    "totHouseSessions", "totPublicSessions", "totPublicCost", "totHouseCost", 
    "totHouseEnergy", "totPublicEnergy"
]
pdr = pdr.select(*pdr_variables)

cdr_variables = [
    "CDR ID", "EVSE ID", "Operatore", "Potenza (kW)", "Station Nome", "Station Città", 
    "Station Indirizzo", "Data inizio", "Ora inizio", "Data fine", "Ora fine", 
    "Ricavi totali (€) (IVA esclusa) ", "Ricavi Energia (€) (IVA esclusa) ", 
    "Energia (kWh)", "Ricavi Penalty Time (€) (IVA esclusa) ", "Tempo Totale (min)", 
    "Inizio penalty time", "Contachilometri (Km)", "Auth ID", "Auth method", 
    "Type Status"
]
cdr = cdr.select(*cdr_variables)

In [105]:
hypercharge_sessions = hypercharge_sessions.drop("carChargeParameter")

# Lists of interesting variables to reduce data frame size
session_variables = [
    "gpsLat", "gpsLong", "locationStreet", "locationZipCode", "locationTown", 
    "locationProvince", "locationCountry", "distributorName", "corporationName", 
    "type", "physicalPosition", "session_averagePower", "session_car", 
    "session_distributorName", "session_end", "session_meterStart", 
    "session_meterStop", "session_physicalPosition", "session_peakPower", 
    "session_start", "session_socStart", "session_socStop", 
    "session_chargingSessionId", "session_type", "session_position", 
    "session_transactionIdNew", "carChargeParameter_car", 
    "carChargeParameter_batteryCapacity"
]
hypercharge_sessions = hypercharge_sessions.select(*session_variables)

In [106]:
print("Sessions Stats")
numeric_columns = [col for col, dtype in hypercharge_sessions.dtypes if dtype in ['int', 'double', 'float']]
pandas_sessions = hypercharge_sessions.select(numeric_columns).toPandas()
display(pandas_sessions.describe())

Sessions Stats


Unnamed: 0,gpsLat,gpsLong,session_averagePower,session_car,session_distributorName,session_physicalPosition,session_peakPower,session_type,session_transactionIdNew,carChargeParameter_batteryCapacity
count,12978.0,12978.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3706.0
mean,43.106707,8.948087,,,,,,,,18149.044522
std,6.186204,10.427072,,,,,,,,28934.976897
min,2.0,-50.0,,,,,,,,0.0
25%,42.53072,9.237866,,,,,,,,0.0
50%,44.4525,11.193636,,,,,,,,0.0
75%,45.0749,12.61855,,,,,,,,42000.0
max,60.0,16.777855,,,,,,,,135000.0


In [107]:
print("Location Stats")
pandas_location = hypercharge_location.toPandas()
display(pandas_location.describe())

print("CDR Stats")
pandas_cdr = cdr.toPandas()
display(pandas_cdr.describe())

print("PDR Stats")
pandas_pdr = pdr.toPandas()
display(pandas_pdr.describe())

Location Stats


Unnamed: 0,chargerId,numberStacks,locationZipCode,status_position
count,374.0,374.0,153.0,374.0
mean,43247.780749,3.823529,35724.941176,0.0
std,33380.514195,0.42177,24216.932744,0.0
min,6590.0,1.0,10.0,0.0
25%,13418.25,4.0,20045.0,0.0
50%,19747.5,4.0,31032.0,0.0
75%,83748.75,4.0,50019.0,0.0
max,83842.0,4.0,83100.0,0.0


CDR Stats


Unnamed: 0,CDR ID,Potenza (kW),Ora inizio,Ora fine,Ricavi totali (€) (IVA esclusa),Ricavi Energia (€) (IVA esclusa),Energia (kWh),Ricavi Penalty Time (€) (IVA esclusa),Tempo Totale (min),Inizio penalty time
count,300000.0,300000.0,300000,300000,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,1326766.0,282.026348,2024-09-29 14:08:57.396426752,2024-09-29 14:26:52.545593344,20.01075,20.01071,29.690876,0.0,26.8268,0.0
min,813476.0,22.0,2024-09-29 00:00:01,2024-09-29 00:00:01,0.0,0.0,0.001,0.0,0.0,0.0
25%,1104226.0,300.0,2024-09-29 10:49:36,2024-09-29 11:11:00,11.1,11.1,16.652,0.0,15.0,0.0
50%,1284196.0,300.0,2024-09-29 14:17:07,2024-09-29 14:40:50,18.81,18.81,27.904,0.0,24.0,0.0
75%,1571378.0,300.0,2024-09-29 17:49:02,2024-09-29 18:09:26,27.77,27.77,40.97625,0.0,34.0,0.0
max,1766361.0,400.0,2024-09-29 23:59:59,2024-09-29 23:59:59,247.96,247.96,364.652,0.0,5915.0,0.0
std,252664.7,70.270005,,,12.102538,12.102568,17.662696,0.0,24.014121,0.0


PDR Stats


Unnamed: 0,station_postal_code,station_commissioning_date,station_owner_company_pay_description,station_owner_company_owner_cost_per_kwh,totEnergy,totEnergyLocal,totEnergyNotLocal,totSessions,totSessionsNotLocal,totSessionsLocal,totCost,totHouseSessions,totPublicSessions,totPublicCost,totHouseCost,totHouseEnergy,totPublicEnergy
count,830.0,285,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,36763.586747,2024-03-12 23:14:44.210526208,0.0,0.0,14006.847572,0.000308,13711.930864,465.036145,454.827711,0.00241,9467.685855,0.0,465.036145,9467.685855,0.0,0.0,14006.847572
min,10.0,2023-09-11 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20026.0,2023-11-22 01:00:00,0.0,0.0,1134.687,0.0,1100.365,79.0,77.0,0.0,584.035,0.0,79.0,584.035,0.0,0.0,1134.687
50%,33010.0,2024-02-26 01:00:00,0.0,0.0,7879.4,0.0,7646.5245,283.5,276.5,0.0,5360.65,0.0,283.5,5360.65,0.0,0.0,7879.4
75%,52041.0,2024-07-04 02:00:00,0.0,0.0,23273.082,0.0,22836.1835,729.75,715.75,0.0,15833.2225,0.0,729.75,15833.2225,0.0,0.0,23273.082
max,83100.0,2024-08-09 02:00:00,0.0,0.0,95445.22,0.147,92886.066,3062.0,2981.0,1.0,64974.67,0.0,3062.0,64974.67,0.0,0.0,95445.22
std,24449.650843,,0.0,0.0,15775.476469,0.006348,15449.816311,490.645257,480.294118,0.049058,10802.687982,0.0,490.645257,10802.687982,0.0,0.0,15775.476469


In [108]:
# Function to calculate percentage of null values
def null_percentage(df):
    null_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0]
    total_count = df.count()
    return {k: v/total_count*100 for k, v in null_counts.asDict().items()}


session_nulls = null_percentage(hypercharge_sessions)
cdr_nulls = null_percentage(cdr)
pdr_nulls = null_percentage(pdr)
location_nulls = null_percentage(hypercharge_location)

In [109]:
def display_null_percentages(null_counts_dict):
  for col, percentage in null_counts_dict.items():
    print(f"{col:30s}{percentage:.2f}%")

print("Percentage of null values in Location file:")
display_null_percentages(location_nulls)
print("\n")

print("Percentage of null values in CDR file:")
display_null_percentages(cdr_nulls)
print("\n")

print("Percentage of null values in PDR file:")
display_null_percentages(pdr_nulls)
print("\n")

print("Percentage of null values in Session file:")
display_null_percentages(session_nulls)

Percentage of null values in Location file:
chargerId                     0.00%
numberStacks                  0.00%
chassis                       0.00%
isPublic                      0.00%
chargePointIdentity           30.21%
customerIccid                 89.84%
locationTown                  59.09%
locationZipCode               59.09%
locationProvince              59.09%
locationCountry               39.30%
isRemoteLocation              0.00%
outletList                    0.00%
status_position               0.00%
status_status                 0.00%
endClientId                   99.47%
surroundingChargers           30.75%


Percentage of null values in CDR file:
CDR ID                        0.00%
EVSE ID                       0.00%
Operatore                     0.00%
Potenza (kW)                  0.00%
Station Nome                  0.00%
Station Città                 0.00%
Station Indirizzo             0.00%
Data inizio                   0.00%
Ora inizio                    0.00%
Data fi

In [111]:
# Check for duplicate rows
print("Duplicate rows in Session:", hypercharge_sessions.count() - hypercharge_sessions.dropDuplicates().count())
print("Duplicate rows in CDR:", cdr.count() - cdr.dropDuplicates().count())
print("Duplicate rows in PDR:", pdr.count() - pdr.dropDuplicates().count())
print("Duplicate rows in Locations:", hypercharge_location.count() - hypercharge_location.dropDuplicates().count())

Duplicate rows in Session: 0
Duplicate rows in CDR: 0
Duplicate rows in PDR: 2
Duplicate rows in Locations: 0


In [None]:
from pyspark.sql.types import IntegerType, LongType, FloatType, DoubleType
import pyspark.sql.functions as F

def detect_outliers(df):
    outlier_dfs = []
    for col_name in df.schema.names:
        col_type = df.schema[col_name].dataType
        if col_type == IntegerType() or col_type == LongType() or col_type == FloatType() or col_type == DoubleType():
            quantiles = df.approxQuantile(col_name, [0.25, 0.75], 0.05)
            IQR = quantiles[1] - quantiles[0]
            lower_bound = quantiles[0] - 1.5 * IQR
            upper_bound = quantiles[1] + 1.5 * IQR
            outlier_df = df.filter((F.col(col_name) < lower_bound) | (F.col(col_name) > upper_bound))
            outlier_dfs.append(outlier_df)
    return outlier_dfs

#session_outliers = detect_outliers(hypercharge_sessions)
cdr_outliers = detect_outliers(cdr)
pdr_outliers = detect_outliers(pdr)
location_outliers = detect_outliers(hypercharge_location)

In [114]:
print("Number of outliers in CDR:", len(cdr_outliers))
print("Number of outliers in PDR:", len(pdr_outliers))
print("Number of outliers in Location:", len(location_outliers))
#print("Number of outliers in Session:", len(session_outliers))

Number of outliers in CDR: 8
Number of outliers in PDR: 16
Number of outliers in Location: 4


In [90]:
spark.stop()