
==========================

**BRONZE**

Files Load to Bronze Layer

==========================


Tables

- SalesOrderDetail
- SalesOrderHeader
- Product
- ProductModel
- ProductSubcategory
- Customer
- SalesTerritory
 

 

========================

**SILVER**

Read Data from Bronze 
Load into the Silver Layer
Apply Data Quality 
Change Data Columns names 


===========================

 
- sales_order_detail
- sales_order_header
- product
- product_model
- product_subcategory
- customer
- sales_territory
 

===============

**GOLD**

Read data from SILVER
Create Kimball Model 
FACTS/DIM


===================


## Landing to Bronze layer

### Set up the Variables for Root location, Schema Names, Directory  for the files

In [0]:
# root path
root_path = '/mnt/data/'

# Catalog Name
catalog = 'hive_metastore'

# Schema Names
schema_bronze = 'bronze'
schema_silver = 'silver'
schema_gold = 'gold'

# Directory Names
directory_landing = 'landing'
directory_bronze = 'bronze'
directory_silver = 'silver'
directory_gold = 'gold'

### Set up the Bronze Schema as CATALOG_NAME.SCHEMA_NAME

In [0]:
#create schema for Bronze if it  does not exist for Bronze Layer
bronze_catalog_schema = catalog + '.' + schema_bronze
bronze_schema_location = root_path + directory_bronze

#create schema for Silver if it  does not exist for Silver Layer
silver_catalog_schema = catalog + '.' + directory_silver
silver_schema_location = root_path + directory_silver

#create schema for Gold if it  does not exist for Gold Layer
gold_catalog_schema = catalog + '.' + directory_gold
gold_schema_location = root_path + directory_gold

In [0]:
#create schema for Bronze if it  does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS {} MANAGED LOCATION '{}'".format(bronze_catalog_schema, bronze_schema_location))

DataFrame[]

In [0]:
#create schema for Silver if it  does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS {} MANAGED LOCATION '{}'".format(silver_catalog_schema, silver_schema_location))

DataFrame[]

In [0]:
#create schema for Gold if it  does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS {} MANAGED LOCATION '{}'".format(gold_catalog_schema, gold_schema_location))

DataFrame[]

### Set up the landing location

In [0]:
SalesOrderDetail_path=root_path + directory_landing + '/SalesOrderDetail.csv'
SalesOrderHeader_path =root_path + directory_landing + '/SalesOrderHeader.csv'
Product_path = root_path + directory_landing + '/Product.csv'
ProductModel_path =root_path + directory_landing + '/ProductModel.csv'
ProductSubcategory_path=root_path + directory_landing + '/ProductSubcategory.csv'
Customer_path=root_path + directory_landing + '/Customer.csv'
SalesTerritory_path=root_path + directory_landing + '/SalesTerritory.csv'


### Import Types

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType,DoubleType, DateType

### Define Schema for each File

In [0]:
# Define schema for sessions CSV
SalesOrderDetail_schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("SalesOrderDetailID", IntegerType(), True),
    StructField("CarrierTrackingNumber", StringType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("SpecialOfferID", IntegerType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("UnitPriceDiscount", FloatType(), True),
    StructField("LineTotal", FloatType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])

SalesOrderHeader_schema = schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("RevisionNumber", IntegerType(), True),
    StructField("OrderDate", StringType(), True),
    StructField("DueDate", StringType(), True),
    StructField("ShipDate", StringType(), True),
    StructField("Status", IntegerType(), True),
    StructField("OnlineOrderFlag", IntegerType(), True),
    StructField("SalesOrderNumber", StringType(), True),
    StructField("PurchaseOrderNumber", StringType(), True),
    StructField("AccountNumber", StringType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("SalesPersonID", IntegerType(), True),
    StructField("TerritoryID", IntegerType(), True),
    StructField("BillToAddressID", IntegerType(), True),
    StructField("ShipToAddressID", IntegerType(), True),
    StructField("ShipMethodID", IntegerType(), True),
    StructField("CreditCardID", IntegerType(), True),
    StructField("CreditCardApprovalCode", StringType(), True),
    StructField("CurrencyRateID", StringType(), True),
    StructField("SubTotal", DoubleType(), True),
    StructField("TaxAmt", DoubleType(), True),
    StructField("Freight", DoubleType(), True),
    StructField("TotalDue", DoubleType(), True),
    StructField("Comment", StringType(), True),
    StructField("ModifiedDate", StringType(), True)
])

Product_schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("ProductNumber", StringType(), True),
    StructField("MakeFlag", IntegerType(), True),
    StructField("FinishedGoodsFlag", IntegerType(), True),
    StructField("Color", StringType(), True),
    StructField("SafetyStockLevel", IntegerType(), True),
    StructField("ReorderPoint", IntegerType(), True),
    StructField("StandardCost", DoubleType(), True),
    StructField("ListPrice", DoubleType(), True),
    StructField("Size", StringType(), True),
    StructField("SizeUnitMeasureCode", StringType(), True),
    StructField("WeightUnitMeasureCode", StringType(), True),
    StructField("Weight", DoubleType(), True),
    StructField("DaysToManufacture", IntegerType(), True),
    StructField("ProductLine", StringType(), True),
    StructField("Class", StringType(), True),
    StructField("Style", StringType(), True),
    StructField("ProductSubcategoryID", IntegerType(), True),
    StructField("ProductModelID", IntegerType(), True),
    StructField("SellStartDate", TimestampType(), True),
    StructField("SellEndDate", TimestampType(), True),
    StructField("DiscontinuedDate", TimestampType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])

ProductModel_schema = StructType([
    StructField("ProductModelID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])

ProductSubcategory_schema=StructType([
    StructField("ProductSubcategoryID", IntegerType(), True),
    StructField("ProductCategoryID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])
Customer_schema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("PersonID", IntegerType(), True),
    StructField("StoreID", IntegerType(), True),
    StructField("TerritoryID", IntegerType(), True),
    StructField("AccountNumber", StringType(), True),
    StructField("BusinessEntityID", IntegerType(), True),
    StructField("PersonType", StringType(), True),
    StructField("NameStyle", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("FirstName", StringType(), True),
    StructField("MiddleName", StringType(), True),
    StructField("LastName", StringType(), True),
    StructField("Suffix", StringType(), True)
])

SalesTerritory_schema = schema = StructType([
    StructField("StoreID", IntegerType(), True),
    StructField("TerritoryID", IntegerType(), True),
    StructField("AccountNumber", StringType(), True),
    StructField("SalesTerritoryName", StringType(), True),
    StructField("CountryRegionCode", StringType(), True),
    StructField("Group", StringType(), True),
    StructField("StoreName", StringType(), True),
    StructField("SalesPersonID", IntegerType(), True)
])

### Read data file and create a Spark Dataframe

In [0]:
# Read CSV files with defined schema
fact_SalesOrderHeader = spark.read.csv(SalesOrderHeader_path, header=True, schema=SalesOrderHeader_schema)

In [0]:
# Read CSV files with defined schema
dim_Product = spark.read.csv(Product_path, header=True, schema=Product_schema)

In [0]:
# Read CSV files with defined schema
dim_ProductModel = spark.read.csv(ProductModel_path, header=True, schema=ProductModel_schema)

In [0]:
# Read CSV files with defined schema
dim_ProductSubcategory = spark.read.csv(ProductSubcategory_path, header=True, schema=ProductSubcategory_schema)

In [0]:
# Read CSV files with defined schema
dim_Customer = spark.read.csv(Customer_path, header=True, schema=Customer_schema)

In [0]:
# Read CSV files with defined schema
fact_SalesOrderDetail= spark.read.csv(SalesOrderDetail_path, header=True, schema=SalesOrderDetail_schema)

In [0]:
dim_SalesTerritory = spark.read.csv(SalesTerritory_path, header=True, schema=SalesTerritory_schema)

###Add Ingestion timestamp and source system

###Create a Function to add audit/metedata/contol columns like Ingestion timestamp and Source system

In [0]:
from pyspark.sql import functions as F
from datetime import datetime

def auditedDataframes(dataframes_list):
    transformed_dataframes = []
    
    for df in dataframes_list:
        # Add ingestionTimestamp and SourceSystem columns
        transformed_df = df.withColumn("ingestionTimestamp", F.lit(datetime.now())) \
                           .withColumn("SourceSystem", F.lit("CRM"))
        
        # Append transformed dataframe to the list
        transformed_dataframes.append(transformed_df)
    
    # Return as tuple of dataframes
    return tuple(transformed_dataframes)


###Add Ingestion timestamp and source system using the Function created in previous step

In [0]:
fact_SalesOrderDetail,fact_SalesOrderHeader,dim_Product,dim_ProductModel,dim_ProductSubcategory,dim_Customer,dim_SalesTerritory=auditedDataframes([fact_SalesOrderDetail,fact_SalesOrderHeader,dim_Product,dim_ProductModel,dim_ProductSubcategory,dim_Customer,dim_SalesTerritory])

###Write Data in Bronze Layer in Delta Format

In [0]:
raw_SalesOrderDetail = catalog + '.'+ schema_bronze + '.'+  "SalesOrderDetail"
raw_SalesOrderHeader = catalog + '.'+ schema_bronze + '.'+  "SalesOrderHeader"
raw_Product = catalog + '.'+ schema_bronze + '.'+  "Product"
raw_ProductModel = catalog + '.'+ schema_bronze + '.'+ "ProductModel"
raw_ProductSubcategory = catalog + '.'+ schema_bronze + '.'+  "ProductSubcategory"
raw_Customer = catalog + '.'+ schema_bronze + '.'+  "Customer"
raw_SalesTerritory = catalog + '.'+ schema_bronze + '.'+  "SalesTerritory"

In [0]:
# Writing the tables to Delta format in the bronze layer
fact_SalesOrderDetail.write.format("delta").mode("overwrite").saveAsTable(raw_SalesOrderDetail)

In [0]:
# Writing the tables to Delta format in the bronze layer
fact_SalesOrderHeader.write.format("delta").mode("overwrite").saveAsTable(raw_SalesOrderHeader)  

In [0]:
# Writing the tables to Delta format in the bronze layer
dim_Product.write.format("delta").mode("overwrite").saveAsTable(raw_Product)         

In [0]:
# Writing the tables to Delta format in the bronze layer
dim_ProductModel.write.format("delta").mode("overwrite").saveAsTable(raw_ProductModel)    

In [0]:
# Writing the tables to Delta format in the bronze layer
dim_ProductSubcategory.write.format("delta").mode("overwrite").saveAsTable(raw_ProductSubcategory)

In [0]:
# Writing the tables to Delta format in the bronze layer
dim_Customer.write.format("delta").mode("overwrite").saveAsTable(raw_Customer)

In [0]:
# Writing the tables to Delta format in the bronze layer
dim_SalesTerritory.write.format("delta").mode("overwrite").saveAsTable(raw_SalesTerritory)

## **SILVER LAYER**

###Read Bronze Layer tables in a Spark Dataframe

In [0]:
# Query the bronze tables back
df_fact_SalesOrderDetail=spark.table(raw_SalesOrderDetail)
df_fact_SalesOrderHeader=spark.table(raw_SalesOrderHeader)
df_dim_Product=spark.table(raw_Product)
df_dim_ProductModel=spark.table(raw_ProductModel)
df_dim_ProductSubcategory=spark.table(raw_ProductSubcategory)
df_dim_Customer=spark.table(raw_Customer)
df_dim_SalesTerritory=spark.table(raw_SalesTerritory)

###Some generic functions to perform Transformations

###Drop the Duplicate Records

###Create a function using existing function dropDuplicates to remove the Duplicates rows

In [0]:

#Function to remove duplicates
from pyspark.sql.functions import current_timestamp, col, to_date
def dropAllDuplicatesFromDP(df):
    return df.dropDuplicates()

In [0]:
from pyspark.sql import functions as F

#Performing the transformations
df_fact_SalesOrderDetail_enriched=dropAllDuplicatesFromDP(df_fact_SalesOrderDetail)
df_fact_SalesOrderHeader_enriched=dropAllDuplicatesFromDP(df_fact_SalesOrderHeader)

###Convert **OrderDate, DueDate, ShipDate, ModifiedDate** from String Datatype to DateType

In [0]:
#Converting the StringType columns into DateType format
df_fact_SalesOrderHeader_enriched = df_fact_SalesOrderHeader_enriched.withColumn("OrderDate", to_date(df_fact_SalesOrderHeader_enriched["OrderDate"], "dd-MM-yyyy"))
df_fact_SalesOrderHeader_enriched = df_fact_SalesOrderHeader_enriched.withColumn("DueDate", to_date(df_fact_SalesOrderHeader_enriched["DueDate"], "dd-MM-yyyy"))
df_fact_SalesOrderHeader_enriched = df_fact_SalesOrderHeader_enriched.withColumn("ShipDate", to_date(df_fact_SalesOrderHeader_enriched["ShipDate"], "dd-MM-yyyy"))
df_fact_SalesOrderHeader_enriched = df_fact_SalesOrderHeader_enriched.withColumn("ModifiedDate", to_date(df_fact_SalesOrderHeader_enriched["ModifiedDate"], "dd-MM-yyyy"))

###Replace NULL or 'NULL' values in the records

In [0]:
# Replace nulls and 'NULL' strings with 'Unknown'
df_fact_SalesOrderHeader_enriched = df_fact_SalesOrderHeader_enriched.withColumn(
    "CurrencyRateID",
    F.when((F.col("CurrencyRateID").isNull()) | (F.col("CurrencyRateID") == 'NULL'), 'Unknown')
    .otherwise(F.col("CurrencyRateID"))
)

###Display the updated Dataframe 

###Write Data in Silver Layer in Delta Format

In [0]:
cleaned_SalesOrderDetail = catalog + '.'+ schema_silver + '.'+  "sales_order_detail"
cleaned_SalesOrderHeader = catalog + '.'+ schema_silver + '.'+  "sales_order_header"
cleaned_product = catalog + '.'+ schema_silver + '.'+  "product"
cleaned_productModel = catalog + '.'+ schema_silver + '.'+ "product_model"
cleaned_productSubcategory = catalog + '.'+ schema_silver + '.'+  "product_subcategory"
cleaned_customer = catalog + '.'+ schema_silver + '.'+  "customer"
cleaned_salesTerritory = catalog + '.'+ schema_silver + '.'+  "sales_territory"

In [0]:
#Write to Silver Layer
dim_ProductSubcategory.write.format("delta").mode("overwrite").saveAsTable(cleaned_productSubcategory)


In [0]:
#Write to Silver Layer
dim_Customer.write.format("delta").mode("overwrite").saveAsTable(cleaned_customer)    


In [0]:
#Write to Silver Layer
dim_ProductModel.write.format("delta").mode("overwrite").saveAsTable(cleaned_productModel)   

In [0]:
#Write to Silver Layer
dim_Product.write.format("delta").mode("overwrite").saveAsTable(cleaned_product)       


In [0]:

#Write to Silver Layer
df_fact_SalesOrderHeader_enriched.write.format("delta").mode("overwrite").saveAsTable(cleaned_SalesOrderHeader)


In [0]:
#Write to Silver Layer
df_fact_SalesOrderDetail_enriched.write.format("delta").mode("overwrite").saveAsTable(cleaned_SalesOrderDetail)


In [0]:

#Write to Silver Layer
dim_SalesTerritory.write.format("delta").mode("overwrite").saveAsTable(cleaned_salesTerritory)

###  **GOLD LAYER**

###Import Libraries

In [0]:
from pyspark.sql.functions import sha2,concat_ws

###Read Silver Layer tables in a Spark Dataframe

In [0]:
#Query the Silver tables back
df_fact_SalesOrderDetail_enriched=spark.table(cleaned_SalesOrderDetail)
df_fact_SalesOrderHeader_enriched=spark.table(cleaned_SalesOrderHeader)
df_dim_Product=spark.table(cleaned_product)
df_dim_ProductModel=spark.table(cleaned_productModel)
df_dim_ProductSubcategory=spark.table(cleaned_productSubcategory)
df_dim_Customer=spark.table(cleaned_customer)
df_dim_SalesTerritory=spark.table(cleaned_salesTerritory)

###Create a FACT Daily Sales Table 
###We join the two tables 
- catalog.silver.cleaned_SalesOrderDetail
- catalog.silver.cleaned_SalesOrderHeader
- on SalesOrderID
- Select required columns

In [0]:
#Transformations at the Gold Layer
#Join the SalesOrderDetail table with SalesOrderHeader to create the fact_daily_sales table
fact_daily_sales = df_fact_SalesOrderDetail_enriched \
    .join(df_fact_SalesOrderHeader_enriched, on="SalesOrderID", how="right") \
    .select(
        df_fact_SalesOrderHeader_enriched["*"],  # All columns from df_fact_SalesOrderDetail_enriched
        df_fact_SalesOrderDetail_enriched["SalesOrderDetailID"],
        df_fact_SalesOrderDetail_enriched["CarrierTrackingNumber"],
        df_fact_SalesOrderDetail_enriched["OrderQty"],
        df_fact_SalesOrderDetail_enriched["ProductID"],
        df_fact_SalesOrderDetail_enriched["SpecialOfferID"],
        df_fact_SalesOrderDetail_enriched["UnitPrice"],
        df_fact_SalesOrderDetail_enriched["UnitPriceDiscount"],
        df_fact_SalesOrderDetail_enriched["LineTotal"],
        df_fact_SalesOrderDetail_enriched["IngestionTimestamp"].alias("SalesOrderDetail.IngestionTimestamp"),
        df_fact_SalesOrderDetail_enriched["SalesOrderDetailID"].alias("SalesOrderDetail.SalesOrderDetailID"),
        df_fact_SalesOrderDetail_enriched["ModifiedDate"].alias("SalesOrderDetail.ModifiedDate") 
    )

###Create a Dimension Product Table 
###We join the three tables 
- catalog.silver.cleaned_product
- catalog.silver.cleaned_productModel
- catalog.silver.cleaned_productSubcategory
- on ProductModelID
- on ProductSubcategoryID
- Select required columns

In [0]:
#Join the Product table,ProductSubcategory and ProductModel tables to create the dim_product table
dim_product = df_dim_Product \
    .join(df_dim_ProductModel, on="ProductModelID", how="inner") \
    .join(df_dim_ProductSubcategory, on="ProductSubcategoryID", how="inner") \
    .select(
        df_dim_Product["*"],  # All columns from df_dim_Product
        df_dim_ProductModel["ProductModelID"].alias("ProductModel.ProductModelID"),  # Specific columns can be added
        df_dim_ProductModel["Name"].alias("ProductModel.Name"),
        df_dim_ProductSubcategory["ProductSubcategoryID"].alias("ProductSubcategory.ProductSubcategoryID"),
        df_dim_ProductSubcategory["ProductCategoryID"].alias("ProductSubcategory.ProductCategoryID"),
        df_dim_ProductSubcategory["Name"].alias("ProductSubcategory.Name"),
        df_dim_ProductSubcategory["ModifiedDate"].alias("ProductSubcategory.ModifiedDate")  # All columns from df_dim_ProductSubcategory
    )

###Create a Dimension Customer Table 
###We join the two tables 
- catalog.silver.cleaned_customer
- catalog.silver.cleaned_salesTerritory
- on TerritoryID
- Select required columns

In [0]:
#  Join the Customer table with the SalesTerritory table to create the dim_customer table   
dim_customer = df_dim_Customer \
    .join(df_dim_SalesTerritory, on=["StoreId", "TerritoryID"], how="inner") \
    .select(df_dim_Customer["*"],df_dim_SalesTerritory["StoreID"].alias("Sales_Territory.StoreID"),
    df_dim_SalesTerritory["TerritoryID"].alias("Sales_Territory.TerritoryID"),
    df_dim_SalesTerritory["AccountNumber"].alias("Sales_Territory.AccountNumber"),
    df_dim_SalesTerritory["SalesTerritoryName"].alias("Sales_Territory.SalesTerritoryName"),
    df_dim_SalesTerritory["CountryRegionCode"].alias("Sales_Territory.CountryRegionCode"),
    df_dim_SalesTerritory["Group"].alias("Sales_Territory.Group"),
    df_dim_SalesTerritory["StoreName"].alias("Sales_Territory.StoreName"),
    df_dim_SalesTerritory["SalesPersonID"].alias("Sales_Territory.SalesPersonID"))

### Display the updated Dataframe

### Add Key Columns

###Adding Surrogate Keys in the FACT and Dimenions using Hash function on the Concatenated Columns 

###Generate Surrogate Keys 
- ProductID = Hash of Concatenation of ProductID and SourceSystem 
- CustomerID = Hash of Concatenation of CustomerID and SourceSystem 

In [0]:
fact_daily_sales= fact_daily_sales.withColumn(
                                    "ProductKey",
                                    F.sha2(F.concat_ws('|', F.col("ProductID"), F.col("SourceSystem")), 256)).withColumn(
                                    "CustomerKey",
                                    F.sha2(F.concat_ws('|', F.col("CustomerID"), F.col("SourceSystem")), 256))





###Generate Surrogate Keys 
- ProductID = Hash of Concatenation of ProductID and SourceSystem 

In [0]:
dim_product=dim_product.withColumn(
                                    "ProductKey",
                                    F.sha2(F.concat_ws('|', F.col("ProductID"), F.col("SourceSystem")), 256))

###Generate Surrogate Keys 
- CustomerID = Hash of Concatenation of CustomerID and SourceSystem 

In [0]:
dim_customer=dim_customer.withColumn(
                                    "CustomerKey",
                                    F.sha2(F.concat_ws('|', F.col("CustomerID"), F.col("SourceSystem")), 256))

## **Removing Timestamp from DateTime columns in the fact tables**

###Formatting the Datetime columns in the FACT in YYYY-MM-DD format. The input data has Time values as well

In [0]:
from pyspark.sql.functions import date_format

fact_daily_sales = fact_daily_sales.withColumn("SalesOrderDetailIngestionTimestamp", date_format("`SalesOrderDetail.IngestionTimestamp`", "yyyy-MM-dd"))
fact_daily_sales = fact_daily_sales.withColumn("IngestionTimestamp", date_format("ingestionTimestamp", "yyyy-MM-dd"))
fact_daily_sales = fact_daily_sales.withColumn("OrderDate", date_format("OrderDate", "yyyy-MM-dd"))
fact_daily_sales = fact_daily_sales.withColumn("DueDate", date_format("DueDate", "yyyy-MM-dd"))
fact_daily_sales = fact_daily_sales.withColumn("ShipDate", date_format("ShipDate", "yyyy-MM-dd"))
fact_daily_sales = fact_daily_sales.withColumn("ModifiedDate", date_format("ModifiedDate", "yyyy-MM-dd"))

### Creating another fact table: fact_monthly sales

###Create a Monthly Fact named as FACT_MONTHLY_SALES

In [0]:
from pyspark.sql import functions as F
# Convert relevant date columns to date type
for date_col in ["OrderDate", "DueDate", "ShipDate", "ModifiedDate", "IngestionTimestamp"]:
    fact_daily_sales = fact_daily_sales.withColumn(date_col, F.col(date_col).cast("date"))

# Create the fact monthly sales table
fact_monthly_sales = (
    fact_daily_sales
    .groupBy("CustomerKey","ProductKey", F.date_format("OrderDate", "yyyy-MM").alias("Month"))  # Group by year and month
    .agg(
        F.sum("TotalDue").alias("TotalSales"),  # Sum of total sales
    )
)



CustomerKey,ProductKey,Month,TotalSales
ed70b2da779b44ea73dd66833114c41d2c03dbc1e572b686e601bd71bedb55f0,0eb3606a4c4dd0055321a6598b79f2dd16c67504c46ff513257b47ea62021e79,2014-01,23488.3347
273f6a1eff2650420edc2ff3d8e16ff71e4b68447f60c16f36f669331035f98c,761573d92ee0da044f6d164eca62d3df52c9832b366d5844f54354e4d2698b3d,2014-01,12074.2815
caacdcc0628d42bd2534b23329036da33b79ed576524ec180276adadae7f635d,94ad5008986241a9c021b16c572a867c12dd54a9b022e1f84ae90833dbc0c68e,2012-06,54842.8645
a39bf99b6605fe5c62bb65dcc21973d4c233fdd48798e17cdf5a9373ac9d392d,a1ddaae5cbfa76d0324726b49d04ff8a225dfd5e9be4c7f00a1844c56ba65677,2013-02,108780.5231
845e27ff6e76c75ca9a5c02daf9e784e28f58fc6b2a30b36258d49daac754ab6,23122dc967a192381df18e82d5208ad5e2a3000bf59e0a65d0a5a0dc8e8bc1d7,2013-06,11920.5677
31b3b715ba788e0f39c4cc7160a5e459f24086b31abba036c0b039f42f5952f3,b329474f5f27a3b3e8a61aaa8240b755182a2c18b19f7b561dd1fd734933c330,2013-08,54244.2389
f1c5d9bb2a59c1b0c056e3fee7b8f542ebf474980a0bfd85ae2aaeeb7aefea67,68136b92cd14dc75c2b5cf5a9e07c6d5ac33b59308d0ebfd9dd2b3da971fefd4,2014-01,1276.8054
dc7ce8e6575f5a1526c0cca0532dd53be7e4496ee2aba0543336265e0072b3d9,2e03726406530cbb1e6d6e76aca631d58dcbb25011a132d30b6676a2af75b740,2013-10,138.6444
d972c19946bbca0193d88a7b9d9768423df5c4a60d70a9e6ac9b17a0b9b168fc,f781252924d6f7961cdd6dd7438a0a6ff91eb2a198e46a2b9ca1500b46200a3a,2014-01,40.8408
e31285fd2d38a5cf3c5301fd4cfa13b1220a0fc0dbf1291be7ddb37f047f4939,373a61d65da86b57a5c8a9e0468353cd1965d9ba065f6f8a8f49b6a31ad994dd,2013-07,30202.5847


###  Writing to Gold **Layer**

###Write the data in GOLD Layer

In [0]:
tbl_fact_daily_sales = catalog + '.'+ schema_gold + '.' +  "fact_daily_sales"
tbl_fact_monthly_sales = catalog + '.'+ schema_gold + '.' +  "fact_monthly_sales"
tbl_dim_product = catalog + '.'+ schema_gold + '.' +  "dim_product"
tbl_dim_customer = catalog + '.'+ schema_gold + '.' +  "dim_customer"

In [0]:
#Write to Gold Layer
fact_daily_sales.write.format("delta").mode("overwrite").saveAsTable(tbl_fact_daily_sales)
fact_monthly_sales.write.format("delta").mode("overwrite").saveAsTable(tbl_fact_monthly_sales)
dim_product.write.format("delta").mode("overwrite").saveAsTable(tbl_dim_product)   
dim_customer.write.format("delta").mode("overwrite").saveAsTable(tbl_dim_customer)