1. Extract the charging station dataset.
2. Transform the data: for each charging point, compute the average and the maximum charging duration, each rounded to two decimal places.
3. Load the result into the data lake.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, max,round


class ChargePointsETLJob:
    """ETL job that computes per-charge-point plug-in duration statistics.

    The job reads a CSV file, groups its rows by the ``CPID`` column,
    computes the maximum and the average of ``PluginDuration`` (both rounded
    to two decimal places) and writes the result as Parquet.

    The class attributes below are the defaults; each can be overridden per
    instance through the constructor.
    """

    # Source CSV; read with a header row and inferred column types.
    input_path = 'dbfs:/FileStore/shared_uploads/anjaniratish@gmail.com/electric_chargepoints_2017.csv'
    # NOTE(review): this path has no leading '/' or 'dbfs:' scheme, so it is
    # resolved relative to the default filesystem's working directory --
    # confirm that this is the intended mount location.
    output_path = 'mnt/testdata/transformed/chargingStats'
    # Save mode handed to the DataFrame writer. The output is recomputed from
    # the input on every run, so replacing a previous result keeps the job
    # re-runnable; Spark's own default would raise when the path exists.
    write_mode = 'overwrite'

    def __init__(self, input_path=None, output_path=None, write_mode=None):
        """Create the job and obtain a Spark session.

        Args:
            input_path: Optional CSV location overriding the class default.
            output_path: Optional Parquet destination overriding the class
                default.
            write_mode: Optional Spark save mode (for example ``'overwrite'``,
                ``'append'``, ``'ignore'`` or ``'errorifexists'``) overriding
                the class default.
        """
        if input_path is not None:
            self.input_path = input_path
        if output_path is not None:
            self.output_path = output_path
        if write_mode is not None:
            self.write_mode = write_mode

        # getOrCreate() returns the already-running session when there is one.
        self.spark_session = (SparkSession.builder
                                          .master("local[*]")
                                          .appName("ElectricChargePointsETLJob")
                                          .getOrCreate())

    def extract(self):
        """Read the CSV at ``input_path`` into a DataFrame.

        Returns:
            The DataFrame read with ``header=True`` and ``inferSchema=True``.
        """
        reader = self.spark_session.read.options(header=True, inferSchema=True)
        return reader.csv(self.input_path)

    def transform(self, df):
        """Aggregate plug-in durations per charge point.

        Args:
            df: DataFrame containing the ``CPID`` and ``PluginDuration``
                columns.

        Returns:
            A DataFrame with the columns ``chargepoint_id``, ``max_duration``
            and ``avg_duration`` (in that order), the two statistics rounded
            to two decimal places.
        """
        # Rounding inside the aggregation yields the same values and column
        # order as rounding afterwards, without two extra projections.
        stats = df.groupBy('CPID').agg(
            round(max('PluginDuration'), 2).alias('max_duration'),
            round(avg('PluginDuration'), 2).alias('avg_duration'),
        )
        return stats.withColumnRenamed('CPID', 'chargepoint_id')

    def load(self, df):
        """Write ``df`` as Parquet to ``output_path`` using ``write_mode``.

        Args:
            df: The DataFrame to persist.
        """
        df.write.mode(self.write_mode).parquet(self.output_path)

    def run(self):
        """Run the pipeline end to end: extract, then transform, then load."""
        self.load(self.transform(self.extract()))


In [0]:
# Build the job (this obtains the Spark session) and run the whole
# extract -> transform -> load pipeline once.
job1 = ChargePointsETLJob()
job1.run()