In [0]:
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from utils.spark_delta_transform import unnest_struct, transform_column_names
from utils.spark_delta import merge, table_exists, optimize_tb
from utils.logs import print_args

In [0]:
class UsersETL:
    """Extract / transform / load pipeline for a users JSON file.

    ``extract`` reads a multi-line JSON file, ``transform`` flattens and
    renames the columns and casts the ``*_date*`` columns to timestamps, and
    ``load`` appends the result to a Delta table.
    """

    def __init__(self, spark, dt_start, dt_end, pk=None):
        """Store the Spark session, the date window and the primary-key columns.

        Args:
            spark: Spark session used for reading files and running SQL.
            dt_start: Start of the processing window (placeholder, not read
                by any method of this class yet).
            dt_end: End of the processing window (placeholder, not read by
                any method of this class yet).
            pk: Primary-key column name, or an iterable of names. Defaults to
                ``["id_oid"]`` when ``None``.
        """
        self.spark = spark
        self.dt_start = dt_start  # dummy variable, to be used later in the real ETL
        self.dt_end = dt_end  # dummy variable, to be used later in the real ETL
        self.pk = self._set_pk(pk)

    @staticmethod
    def _set_pk(pk):
        """Normalise ``pk`` to a new list of column names.

        ``None`` yields the default ``["id_oid"]``; a single string is wrapped
        in a list; any other iterable is copied so that later changes to the
        caller's object do not leak into the instance.
        """
        if pk is None:
            return ["id_oid"]
        if isinstance(pk, str):
            # A bare string would otherwise break list concatenation on self.pk.
            return [pk]
        return list(pk)

    @print_args(print_kwargs=['file_name'])
    def extract(self, file_name: str):
        """Read the JSON file at ``file_name`` (multi-line mode) into a DataFrame."""
        return (
            self.spark.read
            .format('json')
            .options(multiLine=True)
            .load(file_name)
        )

    def transform(self, df):
        """Flatten, rename and type the raw users DataFrame.

        Applies ``unnest_struct`` twice (two levels of nesting), then
        ``transform_column_names``, then converts every column whose name
        contains ``'_date'`` with ``F.to_timestamp``. Column order is kept.

        Returns:
            The transformed DataFrame.
        """
        # Unnest two levels
        for _ in range(2):
            df = unnest_struct(df)

        df = transform_column_names(df)

        # Build one projection instead of calling withColumn per column:
        # repeated withColumn calls stack projections and bloat the query plan.
        projection = [
            F.to_timestamp(df[name]).alias(name) if '_date' in name else df[name]
            for name in df.columns
        ]
        return df.select(projection)

    @print_args(print_kwargs=['target_tb', 'drop'])
    def load(self, df, target_tb, drop=False):
        """Append ``df`` to the Delta table ``target_tb``.

        Args:
            df: DataFrame to write.
            target_tb: Name of the destination table.
            drop: When true and the table exists, drop it before writing.
        """
        if drop and table_exists(target_tb, self.spark):
            print(f"Dropping table {target_tb}")
            # DDL is executed as soon as spark.sql() is called, so no action
            # (.show()) is needed. IF EXISTS covers the case where the table
            # disappears between the existence check and the drop.
            self.spark.sql(f"DROP TABLE IF EXISTS {target_tb}")

        # NOTE: count() is a separate Spark action that runs before the write.
        print(f"df {df.count()} rows.")
        df.write.format('delta').mode('append').saveAsTable(target_tb)

In [0]:
%run ./etl_constants

In [0]:
# Run the users ETL: extract -> transform -> load into the raw table.
etl = UsersETL(spark, DT_START, DT_END)

# BASE_PATH[5:] drops the first five characters of BASE_PATH
# (presumably a filesystem/scheme prefix — TODO confirm in etl_constants).
users_raw = etl.extract(file_name=f"{BASE_PATH[5:]}/users.json")

# Cache the raw frame: load() runs two actions (count + write) on data derived from it.
users_raw.persist()
try:
    df = etl.transform(users_raw)
    etl.load(df, target_tb=TARGET_USERS_RAW_TB, drop=DROP_RAW)
finally:
    # Unpersist the frame that was actually persisted. Previously the name `df`
    # had been rebound to the transformed frame, so the cache was never released.
    users_raw.unpersist()

In [0]:
# etl = UsersETL(spark, DT_START, DT_END)
# optimize_tb(spark, TARGET_USERS_RAW_TB, ['updated_at_date']+etl.pk, replace=True)