In [84]:
from db_auth import pyodbc_url, jdbc_url, connection_properties
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# StringType lives in pyspark.sql.types; it is not a documented export of
# pyspark.sql.functions, so import it explicitly instead of relying on the wildcard.
from pyspark.sql.types import StringType
import pyodbc
# Keep this after the wildcard import so `uuid` is always the stdlib module.
import uuid

In [5]:
# Reuse the active Spark session if there is one, otherwise start one named "Load Gold".
spark = (
    SparkSession.builder
    .appName("Load Gold")
    .getOrCreate()
)

In [62]:
def load_data(key_col, data, df, table_name):
    """Delete the rows of ``table_name`` whose keys appear in ``data``, then append ``df``.

    The key values are staged in a session-local ``#temp_table`` (every key column
    is declared ``VARCHAR(50)``), matching rows are removed from ``table_name``
    with a join on all columns in ``key_col``, and ``df`` is then appended to
    ``table_name`` over JDBC.

    Args:
        key_col: List of column names that form the match key. The names are
            interpolated into the SQL text as identifiers, so they must come
            from trusted code, never from user input.
        data: Sequence of dicts, each providing a value for every name in
            ``key_col``. May be empty, in which case nothing is deleted.
        df: Spark DataFrame appended to ``table_name``.
        table_name: Target table. Also interpolated into the SQL text; trusted
            input only.

    Raises:
        KeyError: If a dict in ``data`` lacks one of the ``key_col`` names.

    Note:
        The delete is committed before the Spark append starts and the two are
        not part of one transaction: if the append fails, the deleted rows are
        already gone.
    """
    if data:
        conn = pyodbc.connect(pyodbc_url)
        try:
            with conn.cursor() as cursor:
                column_defs = ", ".join(f"{key} VARCHAR(50)" for key in key_col)
                cursor.execute(f'CREATE TABLE #temp_table({column_defs})')

                # Column list and values are both driven by key_col so they always
                # line up with the temp table that was just created.
                columns = ", ".join(key_col)
                placeholders = ", ".join("?" for _ in key_col)
                insert_query = f"INSERT INTO #temp_table ({columns}) VALUES ({placeholders})"
                values = [tuple(row[key] for key in key_col) for row in data]
                cursor.executemany(insert_query, values)

                join_condition = " AND ".join(f"c.{key}=t.{key}" for key in key_col)
                cursor.execute(
                    f'DELETE c FROM {table_name} c INNER JOIN #temp_table t ON {join_condition}'
                )
                conn.commit()
        finally:
            # Neither the cursor nor the connection context manager closes the
            # connection, so release it explicitly. Closing before a commit
            # discards the uncommitted work (see the pyodbc Connection docs).
            conn.close()
    df.write.jdbc(url=jdbc_url, table=table_name, mode='append', properties=connection_properties)

Customers

In [30]:
# Silver-layer inputs read over JDBC: the customer dimension and the sales fact table.
silver_customer_df = spark.read.jdbc(
    url=jdbc_url,
    table='[silver].[Dim_Customers]',
    properties=connection_properties,
)
silver_fact_sales_df = spark.read.jdbc(
    url=jdbc_url,
    table='[silver].[Fact_Sales_Transactions]',
    properties=connection_properties,
)

In [31]:
# Pair every sales row with its customer. The inner join drops customers that
# have no sales rows.
joined_df = (
    silver_fact_sales_df.alias('fact')
    .join(
        silver_customer_df.alias('cust'),
        on=silver_fact_sales_df.customer_id == silver_customer_df.customer_id,
        how='inner',
    )
    .select('cust.*', 'fact.order_id', 'fact.total_amount')
)
# One gold row per customer with lifetime spend and number of orders.
# total_orders uses countDistinct because the fact table can carry several rows
# for the same order_id; a plain count would report sales rows, not orders.
gold_customer_df = (
    joined_df
    .groupBy('customer_id', 'name', 'email', 'age', 'gender', 'signup_date')
    .agg(
        sum('total_amount').alias('total_spent'),
        countDistinct('order_id').alias('total_orders'),
    )
)

In [47]:
# Key values handed to load_data for the delete step. Only the key column is
# needed on the driver, so project it before collecting instead of pulling
# every column of the aggregate.
customer_id = [
    {"customer_id": row.customer_id}
    for row in gold_customer_df.select('customer_id').collect()
]

In [61]:
# Remove the gold customer rows that are about to be reloaded, then append the new aggregate.
load_data(
    key_col=['customer_id'],
    data=customer_id,
    df=gold_customer_df,
    table_name='gold.Dim_Customers',
)

Products

In [63]:
# Silver-layer product dimension read over JDBC.
silver_product_df = spark.read.jdbc(
    url=jdbc_url,
    table='[silver].[Dim_Products]',
    properties=connection_properties,
)

In [70]:
# Pair every product with its sales rows. The inner join drops products that
# have no sales rows.
joined_df = (
    silver_product_df.alias('prod')
    .join(
        silver_fact_sales_df.alias('fact'),
        on=silver_product_df.product_id == silver_fact_sales_df.product_id,
        how='inner',
    )
    .select('prod.*', 'fact.total_amount')
)
# One gold row per product.
# NOTE(review): total_units_sold is the number of joined sales rows, not a sum of
# quantities - confirm that each fact row represents exactly one unit.
gold_product_df = (
    joined_df
    .groupBy('product_id', 'name', 'category', 'brand', 'cost_price', 'selling_price')
    .agg(
        count('product_id').alias('total_units_sold'),
        sum('total_amount').alias('total_revenue'),
    )
)

In [71]:
# Key values handed to load_data for the delete step. Only the key column is
# needed on the driver, so project it before collecting instead of pulling
# every column of the aggregate.
product_id = [
    {"product_id": row.product_id}
    for row in gold_product_df.select('product_id').collect()
]

In [74]:
# Remove the gold product rows that are about to be reloaded, then append the new aggregate.
load_data(
    key_col=['product_id'],
    data=product_id,
    df=gold_product_df,
    table_name='gold.Dim_Products',
)

Orders

In [75]:
# Silver-layer order dimension read over JDBC.
silver_orders_df = spark.read.jdbc(
    url=jdbc_url,
    table='[silver].[Dim_Orders]',
    properties=connection_properties,
)

In [77]:
# Pair every order with its sales rows, carrying the amount and the customer
# from the fact side. The inner join drops orders that have no sales rows.
joined_df = (
    silver_orders_df.alias('ord')
    .join(
        silver_fact_sales_df.alias('fact'),
        on=silver_orders_df.order_id == silver_fact_sales_df.order_id,
        how='inner',
    )
    .select('ord.*', 'fact.total_amount', 'fact.customer_id')
)
# One gold row per (order, date, customer): total_items counts the sales rows
# with a non-null customer_id, total_order_value sums their amounts.
gold_orders_df = (
    joined_df
    .groupBy('order_id', 'order_date', 'customer_id')
    .agg(
        count('customer_id').alias('total_items'),
        sum('total_amount').alias('total_order_value'),
    )
)

In [79]:
# Key values handed to load_data for the delete step. Only the key column is
# needed on the driver, so project it before collecting instead of pulling
# every column of the aggregate.
order_id = [
    {"order_id": row.order_id}
    for row in gold_orders_df.select('order_id').collect()
]

In [81]:
# Remove the gold order rows that are about to be reloaded, then append the new aggregate.
load_data(
    key_col=['order_id'],
    data=order_id,
    df=gold_orders_df,
    table_name='gold.Dim_Orders',
)

Fact Sales Transactions

In [91]:
# Zero-argument Spark UDF that returns a random, upper-cased UUID4 string.
# Spark treats UDFs as deterministic by default and may then eliminate duplicate
# calls or invoke the function more often than it appears in the query; marking
# it non-deterministic tells the optimizer not to make that assumption.
uuid_udf = udf(lambda: str(uuid.uuid4()).upper(), StringType()).asNondeterministic()

In [97]:
# Attach the order date to every sales row. The inner join drops sales rows
# whose order_id has no match in the order dimension.
joined_df = (
    silver_fact_sales_df.alias('fact')
    .join(
        silver_orders_df.alias('ord'),
        on=silver_fact_sales_df.order_id == silver_orders_df.order_id,
        how='inner',
    )
    .select('fact.*', 'ord.order_date')
)
# One gold row per (order, product, customer, date) with summed money columns,
# a generated transaction_id, and order_date exposed as transaction_date.
# total_quantity is the number of sales rows in the group with a non-null product_id.
gold_sales_df = (
    joined_df
    .groupBy('order_id', 'product_id', 'customer_id', 'order_date')
    .agg(
        count('product_id').alias('total_quantity'),
        sum('total_amount').alias('total_revenue'),
        sum('discount').alias('total_discount'),
        sum('tax').alias('total_tax'),
        sum('refund_amount').alias('total_refund'),
    )
    .withColumn('transaction_id', uuid_udf())
    .withColumnRenamed('order_date', 'transaction_date')
)

In [98]:
# Only the key column is needed on the driver, so project it before collecting.
# NOTE(review): transaction_id is produced by uuid_udf and gold_sales_df is
# evaluated lazily, so each Spark action generates fresh ids. The values
# collected here will not equal the ids produced when gold_sales_df is written.
transaction_id = [
    {"transaction_id": row.transaction_id}
    for row in gold_sales_df.select('transaction_id').collect()
]

In [100]:
# transaction_id is a random UUID that Spark regenerates on every action, so it
# can never match a row already stored in the gold table: deleting by it removes
# nothing and every re-run would append a second copy of the facts.
# Key the delete on the columns that identify a gold fact row instead, so that
# re-running this cell replaces the existing rows.
# NOTE(review): rows with a NULL in any key column are not matched by the delete
# join - confirm these columns are never NULL.
sales_key_cols = ['order_id', 'product_id', 'customer_id']
sales_keys = [
    row.asDict()
    for row in gold_sales_df.select(*sales_key_cols).distinct().collect()
]
load_data(sales_key_cols, sales_keys, gold_sales_df, 'gold.Fact_Sales_Transactions')