In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import lit
from datetime import datetime

class harminozator:
    """Read raw CSV tables with fixed schemas and persist them as Delta tables.

    The set of tables and their schemas is held in ``self.config`` (table
    name -> ``StructType``). ``harmonize`` loads every configured table into
    ``self.df_``; ``writeSilver`` then merges (or initially writes) each
    loaded DataFrame into a ``<table>_delta`` location.

    NOTE(review): ``spark`` and ``DeltaTable`` are not defined or imported in
    the visible cells -- presumably supplied by the notebook session or an
    earlier cell; confirm before running this outside that environment.
    """

    # Retained at class level so ``harminozator.df_`` / ``harminozator.df_delta_``
    # keep resolving for existing callers. Every instance rebinds its own dicts
    # in ``__init__`` so loaded data is no longer shared between instances.
    df_ = {}
    df_delta_ = {}

    def __init__(self):
        """Create per-instance state and the table-name -> schema mapping."""
        # Per-instance containers: mutating the class-level dicts through
        # ``self`` would leak one instance's DataFrames into all others.
        self.df_ = {}
        self.df_delta_ = {}

        hired_employees_schema = StructType([
            StructField('id', IntegerType(), True),
            StructField('name', StringType(), True),
            StructField('datetime', DateType(), True),
            StructField('department_id', IntegerType(), True),
            StructField('job_id', IntegerType(), True),
        ])

        departments_schema = StructType([
            StructField('id', IntegerType(), True),
            StructField('department', StringType(), True),
        ])

        jobs_schema = StructType([
            StructField('id', IntegerType(), True),
            StructField('job', StringType(), True),
        ])

        self.config = {
            'hired_employees': hired_employees_schema,
            'departments': departments_schema,
            'jobs': jobs_schema,
        }

    def harmonize(self, raw_fs, file_path):
        """Load every configured table from ``{raw_fs}/{file_path}/{table}.csv``.

        Each file is read without a header row using the table's schema from
        ``self.config``, then an ``update_date`` column is appended. The
        result is stored in ``self.df_[table]``.

        Args:
            raw_fs: Root location of the raw files (used as a path prefix).
            file_path: Sub-path under ``raw_fs`` that contains the CSV files.
        """
        # One timestamp for the whole run so all tables carry the same value.
        # It is a formatted string, so ``update_date`` is a string literal column.
        current_date = datetime.today().strftime('%Y-%m-%d %H:%M:%S')

        for table, schema in self.config.items():
            raw_df = spark.read.csv(
                f'{raw_fs}/{file_path}/{table}.csv',
                header=False,
                schema=schema,
            )
            self.df_[table] = raw_df.withColumn('update_date', lit(current_date))

    def writeSilver(self, hmz_fs, file_path):
        """Upsert each loaded table into ``{hmz_fs}/{file_path}/{table}_delta``.

        If the target already is a Delta table, rows are merged on ``id``
        (matched rows updated, unmatched rows inserted) and the table is then
        compacted. Otherwise the DataFrame is written as a new Delta table.

        Args:
            hmz_fs: Root location of the harmonized (silver) storage.
            file_path: Sub-path under ``hmz_fs`` where Delta tables live.

        Raises:
            KeyError: If a configured table has not been loaded into
                ``self.df_`` (i.e. ``harmonize`` was not run on this instance).
        """
        for table in self.config:
            # Built once per table; used for the existence check, load and write.
            delta_path = f'{hmz_fs}/{file_path}/{table}_delta'

            if DeltaTable.isDeltaTable(spark, delta_path):
                target = DeltaTable.forPath(spark, delta_path)
                self.df_delta_[table] = target

                (
                    target.alias('current')
                    .merge(
                        self.df_[table].alias('new'),
                        condition="current.id  = new.id",
                    )
                    .whenMatchedUpdateAll()
                    .whenNotMatchedInsertAll()
                    .execute()
                )

                # Compact the files produced by the merge.
                target.optimize().executeCompaction()
            else:
                # First load: no Delta table at the target path yet.
                self.df_[table].write.format("delta").mode("overwrite").save(delta_path)

In [4]:
# Instantiate the harmonizer; construction only builds the per-table schema
# config -- no files are read until harmonize() is called.
hmz = harminozator()

{'hired_employees': StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('datetime', DateType(), True), StructField('department_id', IntegerType(), True), StructField('job_id', IntegerType(), True)]), 'departments': StructType([StructField('id', IntegerType(), True), StructField('department', StringType(), True)]), 'jobs': StructType([StructField('id', IntegerType(), True), StructField('job', StringType(), True)])}
