In [1]:
from pyspark.sql.types import StringType
from delta.tables import DeltaTable

In [2]:
def table_exists(table, spark_session=None):
    """Return True when ``table`` is registered in the Spark catalog.

    Parameters
    ----------
    table : str
        Table name, normally qualified as ``database.table``.
    spark_session : SparkSession, optional
        Session whose catalog is queried. When omitted, the notebook-level
        global ``spark`` is used (NameError if it is not defined).

    Returns
    -------
    bool
    """
    session = spark if spark_session is None else spark_session
    catalog = session._jsparkSession.catalog()

    parts = table.split('.')
    if len(parts) == 2:
        database, name = parts
        return bool(catalog.tableExists(database, name))

    # Unqualified names used to raise IndexError and longer names were
    # silently truncated to their first two parts; hand the full name to the
    # catalog's single-argument overload and let Spark resolve it.
    return bool(catalog.tableExists(table))


def merge(df, target_table, pk, spark_session=None, partition=()):
    """Upsert ``df`` into the Delta table ``target_table``.

    If the table already exists, rows are matched on the ``pk`` columns:
    matches are updated with all source columns, non-matches are inserted.
    If it does not exist yet, ``df`` is written as a new Delta table
    (optionally partitioned).

    Parameters
    ----------
    df : DataFrame
        Source rows.
    target_table : str
        Name of the target table, e.g. ``'db.table'``.
    pk : str or sequence of str
        Column(s) used to build the merge condition.
    spark_session : SparkSession, optional
        Session to use. Defaults to the notebook-level global ``spark``.
        The global is only read, never rebound.
    partition : str or sequence of str, optional
        Partition column(s) applied only when the table is first created.

    Raises
    ------
    ValueError
        If the table exists and ``pk`` is empty (no merge condition).
    """
    session = spark if spark_session is None else spark_session

    # A bare string would otherwise be iterated character by character.
    if isinstance(pk, str):
        pk = [pk]
    if isinstance(partition, str):
        partition = [partition]

    if not table_exists(target_table, session):
        # First load: create the table from the incoming frame.
        writer = df.write
        if partition:
            writer = writer.partitionBy(*partition)
        writer.mode('overwrite').format('delta').saveAsTable(target_table)
        return

    if not pk:
        raise ValueError('merge() needs at least one pk column to match rows on')

    condition = ' and '.join(f't.{k}=s.{k}' for k in pk)

    (
        DeltaTable.forName(session, target_table)
        .alias('t')
        .merge(df.alias('s'), condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    return

In [3]:
class RawETL:
    """Raw-layer job: read a CSV, cast every column to string, upsert into Delta.

    The three stages (`extract`, `transform`, `load`) are separate methods so
    they can be overridden; `etl` chains them and forwards per-stage keyword
    arguments.
    """

    # Columns used as the merge key and, by default, as the partition columns
    # of a newly created table.
    # NOTE(review): partitioning by the key yields one partition per distinct
    # key value -- confirm this is intended for real data volumes.
    pk = ['idx']

    def __init__(self, spark):
        """Store the SparkSession used for reading and writing."""
        self.spark = spark

    def extract(self, source_table, **read_options):
        """Read the CSV at ``source_table`` into a DataFrame.

        ``header=True`` is applied by default; any ``read_options`` are passed
        to ``spark.read.csv`` and may override it.
        """
        options = {'header': True, **read_options}
        return self.spark.read.csv(source_table, **options)

    def transform(self, df):
        """Return ``df`` with every column cast to string, names unchanged."""
        # One select instead of a withColumn loop: each withColumn call adds
        # another projection to the query plan.
        return df.select([df[c].cast(StringType()).alias(c) for c in df.columns])

    def load(self, df, target_table, partition=None):
        """Merge ``df`` into ``target_table`` keyed on ``self.pk``.

        ``partition`` overrides the partition columns used when the table is
        first created; it defaults to ``self.pk``.
        """
        if partition is None:
            partition = self.pk
        merge(df, target_table, self.pk, self.spark, partition=partition)

    def etl(self, target_table, source_table, e_kwargs=None, t_kwargs=None, l_kwargs=None):
        """Run extract -> transform -> load and return the transformed frame.

        ``e_kwargs``, ``t_kwargs`` and ``l_kwargs`` are optional dicts of
        keyword arguments forwarded to the respective stage. The base
        ``transform`` accepts none; ``t_kwargs`` exists for subclasses that
        override it.
        """
        df = self.extract(source_table, **(e_kwargs or {}))
        df = self.transform(df, **(t_kwargs or {}))
        self.load(df, target_table, **(l_kwargs or {}))

        return df

In [4]:
if __name__ == '__main__':
    # Driver cell: build a local Delta-enabled SparkSession, run the raw ETL
    # once, then show the catalog and the loaded table.

    from pyspark.sql import SparkSession
    from delta.pip_utils import configure_spark_with_delta_pip

    TARGET_TABLE = 'dummy.raw'
    SOURCE_TABLE = 'raw.csv'

    # 'delta.enableChangeDataFeed' is a table property and has no effect as a
    # session config; the session-level default for new tables is
    # 'spark.databricks.delta.properties.defaults.enableChangeDataFeed'.
    builder = SparkSession.builder\
           .appName('raw_etl')\
           .config('spark.sql.warehouse.dir', 'pyspark_tables')\
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
           .config('spark.databricks.delta.retentionDurationCheck.enabled', False) \
           .config('spark.databricks.delta.schema.autoMerge.enabled', True) \
           .config('spark.databricks.delta.properties.defaults.enableChangeDataFeed', True)

    spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

    # saveAsTable needs the target database to exist; create it so the cell
    # also succeeds against an empty warehouse (fresh kernel / first run).
    target_database = TARGET_TABLE.split('.')[0]
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {target_database}')

    # The class is named RawETL; the previous `rawEtl` only resolved through
    # stale kernel state.
    etl = RawETL(spark)
    etl.etl(TARGET_TABLE, SOURCE_TABLE)

    spark.sql('SHOW DATABASES').show()

    df = spark.sql(f'SELECT * FROM {TARGET_TABLE}').toPandas()

    display(df)



23/02/10 04:47:07 WARN Utils: Your hostname, spiriel resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface enp3s0)
23/02/10 04:47:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/ahow/main_env/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ahow/.ivy2/cache
The jars for the packages stored in: /home/ahow/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f5b6adaa-e9d2-4bcf-98af-722432baa6e0;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 90ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   | 

23/02/10 04:47:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

23/02/10 04:47:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




23/02/10 04:47:17 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `dummy`.`raw` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+---------+
|namespace|
+---------+
|  default|
|    dummy|
+---------+



Unnamed: 0,idx,value,type
0,11,15,asdf
1,14,10,asdf
2,1,15,asdf
3,12,10,asdf
4,4,15,asdf
5,3,15,asdf
6,13,18,asdf
7,2,15,asdf
8,5,15,asdf
