# Welcome

Welcome to the data prep notebook. For additional information about this dataset and the code, please visit [https://github.com/bradleyschacht/sample-data](https://github.com/bradleyschacht/sample-data). Below is a list of the activities that must be completed before running this notebook.

Happy coding!

1. Attach a lakehouse to this notebook
1. Update the values in the **"Parameters"** section
   - **lakehouse_name** - The name of the lakehouse the notebook is attached to and where the data processing will happen
   - **storage_connection_id** - The ID of the connection being used to connect to the ADLS Gen2 account where the source data is stored
   - **storage_account_name** - The name of the ADLS Gen2 account where the source data is stored
   - **storage_path_root** - The location of the sample dataset inside the ADLS Gen2 account in the format of **/_container_/_folder_/_path_/_here_**
   - **create_delta_v_order_disabled** - Specifies if Delta files should be created with V-Order disabled
   - **create_delta_v_order_enabled** - Specifies if Delta files should be created with V-Order enabled
   - **create_json** - Specifies if JSON files should be created
1. If you're feeling lucky, update the values in the **Proceed with caution** section



# Parameters

In [None]:
# Notebook parameters - replace the placeholder values before running.

# Lakehouse this notebook is attached to; the data processing happens there.
lakehouse_name = 'Lakehouse_Name_Here'

# ADLS Gen2 source: connection ID, account name and path of the sample dataset.
storage_connection_id = 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxx'
storage_account_name = 'storage_account_name_Here'
storage_path_root = '/container/FolderStructure/TPC-DS'

# Output toggles: which copies of the cleaned data should be produced.
create_delta_v_order_disabled = True
create_delta_v_order_enabled = True
create_json = True

# Proceed with caution
Only modify the code in this section after fully reviewing the notebook code for downstream impact. 

This notebook has been configured to work with a specific folder structure. While this section provides additional flexibility, exercise caution when making changes, as they could impact other sections of the code.

In [None]:
# Build the other required variables
# !!!!!  Review all changes for downstream impact !!!!! #

# Name of the shortcut that is created under the lakehouse "Files" section.
adls_shortcut_name                          = 'TPC-DS'

# Every data directory lives beneath the shortcut, so the paths are derived from
# the shortcut name instead of repeating it; renaming the shortcut can no longer
# leave these paths pointing at a folder that does not exist.
data_directory_raw                          = f'Files/{adls_shortcut_name}/GB_001/Raw'
data_directory_clean_delta_v_order_disabled = f'Files/{adls_shortcut_name}/GB_001/Clean/Delta_V_Order_Disabled'
data_directory_clean_delta_v_order_enabled  = f'Files/{adls_shortcut_name}/GB_001/Clean/Delta_V_Order_Enabled'
data_directory_clean_json                   = f'Files/{adls_shortcut_name}/GB_001/Clean/JSON'


# Identifiers used to build the Fabric REST API URLs for shortcut management.
workspace_id = mssparkutils.runtime.context['currentWorkspaceId']
lakehouse_id = mssparkutils.lakehouse.get(lakehouse_name).id

# The setup

In [None]:
#Import the relevant libraries
import json
import requests
import time
from pyspark.sql.functions import col, when
from pyspark.sql.types import *

In [None]:
def fn_create_lakehouse_shortcut(shortcut_path, shortcut_name, target):
    """Create a shortcut in the lakehouse unless one already exists at that path.

    Uses the module-level ``workspace_id`` and ``lakehouse_id`` to address the
    Fabric shortcuts REST endpoint, authenticating with a token obtained from
    ``mssparkutils.credentials.getToken("pbi")``.

    Args:
        shortcut_path: Folder inside the lakehouse that will contain the shortcut.
        shortcut_name: Name of the shortcut to create.
        target: Dictionary describing the shortcut target; sent as-is in the
            request body.

    Raises:
        Exception: If the create request does not return HTTP 201. The message
            carries a summary of the error response plus the raw response text.
    """
    request_headers = {
        "Authorization": "Bearer " + mssparkutils.credentials.getToken("pbi"),
        "Content-Type": "application/json"
    }

    request_body = {
        "path": shortcut_path,
        "name": shortcut_name,
        "target": target
    }

    shortcuts_url = f'https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts'
    request_url = f'{shortcuts_url}?shortcutConflictPolicy=Abort'

    # A 200 on the shortcut's own URL means it is already there; nothing to do.
    existing = requests.request(method = "GET", url = f'{shortcuts_url}/{shortcut_path}/{shortcut_name}', headers = request_headers)
    if existing.status_code == 200:
        print('The shortcut already exists')
        return

    response = requests.request(method = "POST", url = request_url, headers = request_headers, json = request_body)

    if response.status_code == 201:
        print (f'\nShortcut created successfully: {shortcut_path}/{shortcut_name}\n')
        # Pause before the caller continues; presumably this lets the new
        # shortcut become visible to Spark - TODO confirm.
        time.sleep(5)
        return

    # Parse the error body defensively: a non-JSON body or a missing key must not
    # replace the real failure with a JSONDecodeError/KeyError raised from here.
    try:
        error_payload = json.loads(response.text)
    except ValueError:
        error_payload = {}
    if not isinstance(error_payload, dict):
        error_payload = {}

    more_details = error_payload.get('moreDetails')
    if isinstance(more_details, list) and more_details and isinstance(more_details[0], dict):
        first_detail = more_details[0]
    else:
        first_detail = {}

    error_summary = {
        'status_code' : response.status_code,
        'error_type' : error_payload.get('errorCode'),
        'error_code' : first_detail.get('errorCode'),
        'error_message' : error_payload.get('message'),
        'request_url' : request_url,
        # Never echo the bearer token into notebook output or logs.
        'request_headers' : {**request_headers, 'Authorization' : 'Bearer <redacted>'},
        'request_body' : request_body
    }

    raise Exception(f'\n\nShortcut creation failed\n\nError summary:\n{json.dumps(error_summary, indent = 3)}\n\nFull response text:\n{response.text}')

In [None]:
def _fn_write_delta_table(table_name, df, destination_directory, delta_table_definition, v_order_enabled):
    """Rebuild one Delta copy of ``table_name`` at ``destination_directory``.

    Sets the session's V-Order flag, drops any existing table and directory,
    recreates the table from ``delta_table_definition``, loads ``df`` into it and
    finally drops the table definition again.
    """
    if v_order_enabled:
        print (f'{table_name} - Delta - Enabling V-Order on write')
    else:
        print (f'{table_name} - Delta - Disabling V-Order on write')
    spark.conf.set('spark.sql.parquet.vorder.enabled', 'true' if v_order_enabled else 'false')

    # The pauses between steps are kept from the original implementation;
    # presumably they give the session/catalog time to settle - TODO confirm.
    time.sleep(5)

    print (f'{table_name} - Delta - Dropping the table if it exists')
    spark.sql(f'DROP TABLE IF EXISTS {table_name}')

    print (f'{table_name} - Delta - Removing the directory if it exists')
    if mssparkutils.fs.exists(destination_directory):
        mssparkutils.fs.rm(destination_directory, True)

    print (f'{table_name} - Delta - Creating the table')
    spark.sql(delta_table_definition)

    time.sleep(5)

    print (f'{table_name} - Delta - Writing to the table has started')
    df.write.format("delta").mode("overwrite").insertInto(table_name)
    print (f'{table_name} - Delta - Writing to the table has completed')

    # The table name is reused for the next variant, so the definition is dropped
    # again. NOTE(review): presumably the files stay in place because the table
    # is created with an explicit LOCATION - confirm.
    print (f'{table_name} - Delta - Dropping the table if it exists \n')
    spark.sql(f'DROP TABLE IF EXISTS {table_name}')

    time.sleep(5)


def fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition):
    """Convert one raw table into the enabled output formats.

    Reads the pipe-delimited, header-less files matching ``file_name`` under
    ``data_directory_raw`` and, depending on the module-level ``create_*`` flags,
    writes a Delta table with V-Order disabled, a Delta table with V-Order
    enabled, and/or JSON files.

    Args:
        file_name: Path (may contain a glob) of the source files, relative to
            ``data_directory_raw``.
        table_name: Table name; also used as the output sub-folder name.
        dataframe_schema: Schema applied when reading the source files.
        delta_table_definition: CREATE TABLE statement containing the
            ``[DELTA_TABLE_LOCATION]`` placeholder, which is replaced with the
            destination directory of each Delta variant.
    """
    #####  Define source and destination location variables  #####
    source                                          = f'{data_directory_raw}/{file_name}'
    destination_directory_delta_v_order_disabled    = f'{data_directory_clean_delta_v_order_disabled}/{table_name}'
    destination_directory_delta_v_order_enabled     = f'{data_directory_clean_delta_v_order_enabled}/{table_name}'
    destination_directory_json                      = f'{data_directory_clean_json}/{table_name}'

    #####  Print the variable values to the cell output  #####
    print (f'Table Name:                                {table_name}')
    print (f'Source File:                               {source}')
    print (f'Delta Table Location (V-Order Disabled):   {destination_directory_delta_v_order_disabled}')
    print (f'Delta Table Location (V-Order Enabled):    {destination_directory_delta_v_order_enabled}')
    print (f'JSON Location:                             {destination_directory_json} \n')

    #####  Nothing to produce: skip reading the source entirely  #####
    if not (create_delta_v_order_disabled or create_delta_v_order_enabled or create_json):
        return

    #####  Read the files  #####
    df = spark.read.load(source, format = 'csv', delimiter = '|', header = False, schema = dataframe_schema)

    #####  Create the delta table - V-Order Disabled  #####
    if create_delta_v_order_disabled:
        _fn_write_delta_table(
            table_name,
            df,
            destination_directory_delta_v_order_disabled,
            delta_table_definition.replace("[DELTA_TABLE_LOCATION]", destination_directory_delta_v_order_disabled),
            v_order_enabled = False
        )

    #####  Create the delta table - V-Order Enabled  #####
    if create_delta_v_order_enabled:
        _fn_write_delta_table(
            table_name,
            df,
            destination_directory_delta_v_order_enabled,
            delta_table_definition.replace("[DELTA_TABLE_LOCATION]", destination_directory_delta_v_order_enabled),
            v_order_enabled = True
        )

    #####  Create the JSON files  #####
    if create_json:
        print (f'{table_name} - JSON - Removing the directory if it exists')
        if mssparkutils.fs.exists(destination_directory_json):
            mssparkutils.fs.rm(destination_directory_json, True)

        print (f'{table_name} - JSON - Writing to the table has started')
        df.write.mode('overwrite').json(destination_directory_json)
        print (f'{table_name} - JSON - Writing to the table has completed \n')

In [None]:
# Point a shortcut in the lakehouse "Files" section at the ADLS Gen2 source data.
# The target describes the storage account endpoint, the path inside it and the
# connection used to reach it.
target = {
    "adlsGen2": dict(
        location = f'https://{storage_account_name}.dfs.core.windows.net',
        subpath = storage_path_root,
        connectionId = storage_connection_id,
    ),
}

fn_create_lakehouse_shortcut('Files', adls_shortcut_name, target)

# The data processing

In [None]:
# call_center: source files, read schema and Delta table definition.
file_name = 'call_center/*.dat'
table_name = 'call_center'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("cc_call_center_sk", IntegerType()),
    StructField("cc_call_center_id", StringType()),
    StructField("cc_rec_start_date", DateType()),
    StructField("cc_rec_end_date", DateType()),
    StructField("cc_closed_date_sk", IntegerType()),
    StructField("cc_open_date_sk", IntegerType()),
    StructField("cc_name", StringType()),
    StructField("cc_class", StringType()),
    StructField("cc_employees", IntegerType()),
    StructField("cc_sq_ft", IntegerType()),
    StructField("cc_hours", StringType()),
    StructField("cc_manager", StringType()),
    StructField("cc_mkt_id", IntegerType()),
    StructField("cc_mkt_class", StringType()),
    StructField("cc_mkt_desc", StringType()),
    StructField("cc_market_manager", StringType()),
    StructField("cc_division", IntegerType()),
    StructField("cc_division_name", StringType()),
    StructField("cc_company", IntegerType()),
    StructField("cc_company_name", StringType()),
    StructField("cc_street_number", StringType()),
    StructField("cc_street_name", StringType()),
    StructField("cc_street_type", StringType()),
    StructField("cc_suite_number", StringType()),
    StructField("cc_city", StringType()),
    StructField("cc_county", StringType()),
    StructField("cc_state", StringType()),
    StructField("cc_zip", StringType()),
    StructField("cc_country", StringType()),
    StructField("cc_gmt_offset", DoubleType()),
    StructField("cc_tax_percentage", DoubleType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE call_center
    (
        cc_call_center_sk         integer               not null,
        cc_call_center_id         char(16)              not null,
        cc_rec_start_date         date                          ,
        cc_rec_end_date           date                          ,
        cc_closed_date_sk         integer                       ,
        cc_open_date_sk           integer                       ,
        cc_name                   varchar(50)                   ,
        cc_class                  varchar(50)                   ,
        cc_employees              integer                       ,
        cc_sq_ft                  integer                       ,
        cc_hours                  char(20)                      ,
        cc_manager                varchar(40)                   ,
        cc_mkt_id                 integer                       ,
        cc_mkt_class              char(50)                      ,
        cc_mkt_desc               varchar(100)                  ,
        cc_market_manager         varchar(40)                   ,
        cc_division               integer                       ,
        cc_division_name          varchar(50)                   ,
        cc_company                integer                       ,
        cc_company_name           char(50)                      ,
        cc_street_number          char(10)                      ,
        cc_street_name            varchar(60)                   ,
        cc_street_type            char(15)                      ,
        cc_suite_number           char(10)                      ,
        cc_city                   varchar(60)                   ,
        cc_county                 varchar(30)                   ,
        cc_state                  char(2)                       ,
        cc_zip                    char(10)                      ,
        cc_country                varchar(20)                   ,
        cc_gmt_offset             decimal(5,2)                  ,
        cc_tax_percentage         decimal(5,2)
    )
    LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# catalog_page: source files, read schema and Delta table definition.
file_name = 'catalog_page/*.dat'
table_name = 'catalog_page'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("cp_catalog_page_sk", IntegerType()),
    StructField("cp_catalog_page_id", StringType()),
    StructField("cp_start_date_sk", IntegerType()),
    StructField("cp_end_date_sk", IntegerType()),
    StructField("cp_department", StringType()),
    StructField("cp_catalog_number", IntegerType()),
    StructField("cp_catalog_page_number", IntegerType()),
    StructField("cp_description", StringType()),
    StructField("cp_type", StringType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE catalog_page
    (
        cp_catalog_page_sk        integer               not null,
        cp_catalog_page_id        char(16)              not null,
        cp_start_date_sk          integer                       ,
        cp_end_date_sk            integer                       ,
        cp_department             varchar(50)                   ,
        cp_catalog_number         integer                       ,
        cp_catalog_page_number    integer                       ,
        cp_description            varchar(100)                  ,
        cp_type                   varchar(100)
    )
    LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# catalog_returns: source files, read schema and Delta table definition.
file_name = 'catalog_returns/*.dat'
table_name = 'catalog_returns'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("cr_returned_date_sk", IntegerType()),
    StructField("cr_returned_time_sk", IntegerType()),
    StructField("cr_item_sk", IntegerType()),
    StructField("cr_refunded_customer_sk", IntegerType()),
    StructField("cr_refunded_cdemo_sk", IntegerType()),
    StructField("cr_refunded_hdemo_sk", IntegerType()),
    StructField("cr_refunded_addr_sk", IntegerType()),
    StructField("cr_returning_customer_sk", IntegerType()),
    StructField("cr_returning_cdemo_sk", IntegerType()),
    StructField("cr_returning_hdemo_sk", IntegerType()),
    StructField("cr_returning_addr_sk", IntegerType()),
    StructField("cr_call_center_sk", IntegerType()),
    StructField("cr_catalog_page_sk", IntegerType()),
    StructField("cr_ship_mode_sk", IntegerType()),
    StructField("cr_warehouse_sk", IntegerType()),
    StructField("cr_reason_sk", IntegerType()),
    StructField("cr_order_number", IntegerType()),
    StructField("cr_return_quantity", IntegerType()),
    StructField("cr_return_amount", DoubleType()),
    StructField("cr_return_tax", DoubleType()),
    StructField("cr_return_amt_inc_tax", DoubleType()),
    StructField("cr_fee", DoubleType()),
    StructField("cr_return_ship_cost", DoubleType()),
    StructField("cr_refunded_cash", DoubleType()),
    StructField("cr_reversed_charge", DoubleType()),
    StructField("cr_store_credit", DoubleType()),
    StructField("cr_net_loss", DoubleType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE catalog_returns
	(
        cr_returned_date_sk       integer                       ,
        cr_returned_time_sk       integer                       ,
        cr_item_sk                integer               not null,
        cr_refunded_customer_sk   integer                       ,
        cr_refunded_cdemo_sk      integer                       ,
        cr_refunded_hdemo_sk      integer                       ,
        cr_refunded_addr_sk       integer                       ,
        cr_returning_customer_sk  integer                       ,
        cr_returning_cdemo_sk     integer                       ,
        cr_returning_hdemo_sk     integer                       ,
        cr_returning_addr_sk      integer                       ,
        cr_call_center_sk         integer                       ,
        cr_catalog_page_sk        integer                       ,
        cr_ship_mode_sk           integer                       ,
        cr_warehouse_sk           integer                       ,
        cr_reason_sk              integer                       ,
        cr_order_number           integer               not null,
        cr_return_quantity        integer                       ,
        cr_return_amount          decimal(7,2)                  ,
        cr_return_tax             decimal(7,2)                  ,
        cr_return_amt_inc_tax     decimal(7,2)                  ,
        cr_fee                    decimal(7,2)                  ,
        cr_return_ship_cost       decimal(7,2)                  ,
        cr_refunded_cash          decimal(7,2)                  ,
        cr_reversed_charge        decimal(7,2)                  ,
        cr_store_credit           decimal(7,2)                  ,
        cr_net_loss               decimal(7,2)                 
	)
    LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# catalog_sales: source files, read schema and Delta table definition.
file_name = 'catalog_sales/*.dat'
table_name = 'catalog_sales'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("cs_sold_date_sk", IntegerType()),
    StructField("cs_sold_time_sk", IntegerType()),
    StructField("cs_ship_date_sk", IntegerType()),
    StructField("cs_bill_customer_sk", IntegerType()),
    StructField("cs_bill_cdemo_sk", IntegerType()),
    StructField("cs_bill_hdemo_sk", IntegerType()),
    StructField("cs_bill_addr_sk", IntegerType()),
    StructField("cs_ship_customer_sk", IntegerType()),
    StructField("cs_ship_cdemo_sk", IntegerType()),
    StructField("cs_ship_hdemo_sk", IntegerType()),
    StructField("cs_ship_addr_sk", IntegerType()),
    StructField("cs_call_center_sk", IntegerType()),
    StructField("cs_catalog_page_sk", IntegerType()),
    StructField("cs_ship_mode_sk", IntegerType()),
    StructField("cs_warehouse_sk", IntegerType()),
    StructField("cs_item_sk", IntegerType()),
    StructField("cs_promo_sk", IntegerType()),
    StructField("cs_order_number", IntegerType()),
    StructField("cs_quantity", IntegerType()),
    StructField("cs_wholesale_cost", DoubleType()),
    StructField("cs_list_price", DoubleType()),
    StructField("cs_sales_price", DoubleType()),
    StructField("cs_ext_discount_amt", DoubleType()),
    StructField("cs_ext_sales_price", DoubleType()),
    StructField("cs_ext_wholesale_cost", DoubleType()),
    StructField("cs_ext_list_price", DoubleType()),
    StructField("cs_ext_tax", DoubleType()),
    StructField("cs_coupon_amt", DoubleType()),
    StructField("cs_ext_ship_cost", DoubleType()),
    StructField("cs_net_paid", DoubleType()),
    StructField("cs_net_paid_inc_tax", DoubleType()),
    StructField("cs_net_paid_inc_ship", DoubleType()),
    StructField("cs_net_paid_inc_ship_tax", DoubleType()),
    StructField("cs_net_profit", DoubleType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE catalog_sales
	(
        cs_sold_date_sk           integer                       ,
        cs_sold_time_sk           integer                       ,
        cs_ship_date_sk           integer                       ,
        cs_bill_customer_sk       integer                       ,
        cs_bill_cdemo_sk          integer                       ,
        cs_bill_hdemo_sk          integer                       ,
        cs_bill_addr_sk           integer                       ,
        cs_ship_customer_sk       integer                       ,
        cs_ship_cdemo_sk          integer                       ,
        cs_ship_hdemo_sk          integer                       ,
        cs_ship_addr_sk           integer                       ,
        cs_call_center_sk         integer                       ,
        cs_catalog_page_sk        integer                       ,
        cs_ship_mode_sk           integer                       ,
        cs_warehouse_sk           integer                       ,
        cs_item_sk                integer               not null,
        cs_promo_sk               integer                       ,
        cs_order_number           integer               not null,
        cs_quantity               integer                       ,
        cs_wholesale_cost         decimal(7,2)                  ,
        cs_list_price             decimal(7,2)                  ,
        cs_sales_price            decimal(7,2)                  ,
        cs_ext_discount_amt       decimal(7,2)                  ,
        cs_ext_sales_price        decimal(7,2)                  ,
        cs_ext_wholesale_cost     decimal(7,2)                  ,
        cs_ext_list_price         decimal(7,2)                  ,
        cs_ext_tax                decimal(7,2)                  ,
        cs_coupon_amt             decimal(7,2)                  ,
        cs_ext_ship_cost          decimal(7,2)                  ,
        cs_net_paid               decimal(7,2)                  ,
        cs_net_paid_inc_tax       decimal(7,2)                  ,
        cs_net_paid_inc_ship      decimal(7,2)                  ,
        cs_net_paid_inc_ship_tax  decimal(7,2)                  ,
        cs_net_profit             decimal(7,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# customer: source files, read schema and Delta table definition.
file_name = 'customer/*.dat'
table_name = 'customer'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("c_customer_sk", IntegerType()),
    StructField("c_customer_id", StringType()),
    StructField("c_current_cdemo_sk", IntegerType()),
    StructField("c_current_hdemo_sk", IntegerType()),
    StructField("c_current_addr_sk", IntegerType()),
    StructField("c_first_shipto_date_sk", IntegerType()),
    StructField("c_first_sales_date_sk", IntegerType()),
    StructField("c_salutation", StringType()),
    StructField("c_first_name", StringType()),
    StructField("c_last_name", StringType()),
    StructField("c_preferred_cust_flag", StringType()),
    StructField("c_birth_day", IntegerType()),
    StructField("c_birth_month", IntegerType()),
    StructField("c_birth_year", IntegerType()),
    StructField("c_birth_country", StringType()),
    StructField("c_login", StringType()),
    StructField("c_email_address", StringType()),
    StructField("c_last_review_date", StringType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE customer
	(
        c_customer_sk             integer               not null,
        c_customer_id             char(16)              not null,
        c_current_cdemo_sk        integer                       ,
        c_current_hdemo_sk        integer                       ,
        c_current_addr_sk         integer                       ,
        c_first_shipto_date_sk    integer                       ,
        c_first_sales_date_sk     integer                       ,
        c_salutation              char(10)                      ,
        c_first_name              char(20)                      ,
        c_last_name               char(30)                      ,
        c_preferred_cust_flag     char(1)                       ,
        c_birth_day               integer                       ,
        c_birth_month             integer                       ,
        c_birth_year              integer                       ,
        c_birth_country           varchar(20)                   ,
        c_login                   char(13)                      ,
        c_email_address           char(50)                      ,
        c_last_review_date        char(10)                      
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# customer_address: source files, read schema and Delta table definition.
file_name = 'customer_address/*.dat'
table_name = 'customer_address'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("ca_address_sk", IntegerType()),
    StructField("ca_address_id", StringType()),
    StructField("ca_street_number", StringType()),
    StructField("ca_street_name", StringType()),
    StructField("ca_street_type", StringType()),
    StructField("ca_suite_number", StringType()),
    StructField("ca_city", StringType()),
    StructField("ca_county", StringType()),
    StructField("ca_state", StringType()),
    StructField("ca_zip", StringType()),
    StructField("ca_country", StringType()),
    StructField("ca_gmt_offset", DoubleType()),
    StructField("ca_location_type", StringType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE customer_address
	(
        ca_address_sk             integer               not null,
        ca_address_id             char(16)              not null,
        ca_street_number          char(10)                      ,
        ca_street_name            varchar(60)                   ,
        ca_street_type            char(15)                      ,
        ca_suite_number           char(10)                      ,
        ca_city                   varchar(60)                   ,
        ca_county                 varchar(30)                   ,
        ca_state                  char(2)                       ,
        ca_zip                    char(10)                      ,
        ca_country                varchar(20)                   ,
        ca_gmt_offset             decimal(5,2)                  ,
        ca_location_type          char(20)                      
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# customer_demographics: source files, read schema and Delta table definition.
file_name = 'customer_demographics/*.dat'
table_name = 'customer_demographics'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("cd_demo_sk", IntegerType()),
    StructField("cd_gender", StringType()),
    StructField("cd_marital_status", StringType()),
    StructField("cd_education_status", StringType()),
    StructField("cd_purchase_estimate", IntegerType()),
    StructField("cd_credit_rating", StringType()),
    StructField("cd_dep_count", IntegerType()),
    StructField("cd_dep_employed_count", IntegerType()),
    StructField("cd_dep_college_count", IntegerType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE customer_demographics
	(
        cd_demo_sk                integer               not null,
        cd_gender                 char(1)                       ,
        cd_marital_status         char(1)                       ,
        cd_education_status       char(20)                      ,
        cd_purchase_estimate      integer                       ,
        cd_credit_rating          char(10)                      ,
        cd_dep_count              integer                       ,
        cd_dep_employed_count     integer                       ,
        cd_dep_college_count      integer                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# date_dim: source files, read schema and Delta table definition.
file_name = 'date_dim/*.dat'
table_name = 'date_dim'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("d_date_sk", IntegerType()),
    StructField("d_date_id", StringType()),
    StructField("d_date", DateType()),
    StructField("d_month_seq", IntegerType()),
    StructField("d_week_seq", IntegerType()),
    StructField("d_quarter_seq", IntegerType()),
    StructField("d_year", IntegerType()),
    StructField("d_dow", IntegerType()),
    StructField("d_moy", IntegerType()),
    StructField("d_dom", IntegerType()),
    StructField("d_qoy", IntegerType()),
    StructField("d_fy_year", IntegerType()),
    StructField("d_fy_quarter_seq", IntegerType()),
    StructField("d_fy_week_seq", IntegerType()),
    StructField("d_day_name", StringType()),
    StructField("d_quarter_name", StringType()),
    StructField("d_holiday", StringType()),
    StructField("d_weekend", StringType()),
    StructField("d_following_holiday", StringType()),
    StructField("d_first_dom", IntegerType()),
    StructField("d_last_dom", IntegerType()),
    StructField("d_same_day_ly", IntegerType()),
    StructField("d_same_day_lq", IntegerType()),
    StructField("d_current_day", StringType()),
    StructField("d_current_week", StringType()),
    StructField("d_current_month", StringType()),
    StructField("d_current_quarter", StringType()),
    StructField("d_current_year", StringType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE date_dim
	(
        d_date_sk                 integer               not null,
        d_date_id                 char(16)              not null,
        d_date                    date                          ,
        d_month_seq               integer                       ,
        d_week_seq                integer                       ,
        d_quarter_seq             integer                       ,
        d_year                    integer                       ,
        d_dow                     integer                       ,
        d_moy                     integer                       ,
        d_dom                     integer                       ,
        d_qoy                     integer                       ,
        d_fy_year                 integer                       ,
        d_fy_quarter_seq          integer                       ,
        d_fy_week_seq             integer                       ,
        d_day_name                char(9)                       ,
        d_quarter_name            char(6)                       ,
        d_holiday                 char(1)                       ,
        d_weekend                 char(1)                       ,
        d_following_holiday       char(1)                       ,
        d_first_dom               integer                       ,
        d_last_dom                integer                       ,
        d_same_day_ly             integer                       ,
        d_same_day_lq             integer                       ,
        d_current_day             char(1)                       ,
        d_current_week            char(1)                       ,
        d_current_month           char(1)                       ,
        d_current_quarter         char(1)                       ,
        d_current_year            char(1)                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# household_demographics: source files, read schema and Delta table definition.
file_name = 'household_demographics/*.dat'
table_name = 'household_demographics'

# Columns are listed in the order used to read the source files.
dataframe_schema = StructType([
    StructField("hd_demo_sk", IntegerType()),
    StructField("hd_income_band_sk", IntegerType()),
    StructField("hd_buy_potential", StringType()),
    StructField("hd_dep_count", IntegerType()),
    StructField("hd_vehicle_count", IntegerType()),
])

# [DELTA_TABLE_LOCATION] is substituted by fn_process_data for each Delta variant.
delta_table_definition = """
CREATE TABLE household_demographics
	(
        hd_demo_sk                integer               not null,
        hd_income_band_sk         integer                       ,
        hd_buy_potential          char(15)                      ,
        hd_dep_count              integer                       ,
        hd_vehicle_count          integer                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(
    file_name = file_name,
    table_name = table_name,
    dataframe_schema = dataframe_schema,
    delta_table_definition = delta_table_definition,
)

In [None]:
# income_band
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'income_band'
file_name = 'income_band/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("ib_income_band_sk", IntegerType()),
    StructField("ib_lower_bound", IntegerType()),
    StructField("ib_upper_bound", IntegerType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE income_band
	(
        ib_income_band_sk         integer               not null,
        ib_lower_bound            integer                       ,
        ib_upper_bound            integer                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# inventory
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'inventory'
file_name = 'inventory/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("inv_date_sk", IntegerType()),
    StructField("inv_item_sk", IntegerType()),
    StructField("inv_warehouse_sk", IntegerType()),
    StructField("inv_quantity_on_hand", IntegerType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE inventory
	(
        inv_date_sk               integer               not null,
        inv_item_sk               integer               not null,
        inv_warehouse_sk          integer               not null,
        inv_quantity_on_hand      integer                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# item
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'item'
file_name = 'item/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): i_current_price and i_wholesale_cost are DoubleType here but
# decimal(7,2) in the DDL -- confirm fn_process_data handles the conversion.
dataframe_schema = StructType([
    StructField("i_item_sk", IntegerType()),
    StructField("i_item_id", StringType()),
    StructField("i_rec_start_date", DateType()),
    StructField("i_rec_end_date", DateType()),
    StructField("i_item_desc", StringType()),
    StructField("i_current_price", DoubleType()),
    StructField("i_wholesale_cost", DoubleType()),
    StructField("i_brand_id", IntegerType()),
    StructField("i_brand", StringType()),
    StructField("i_class_id", IntegerType()),
    StructField("i_class", StringType()),
    StructField("i_category_id", IntegerType()),
    StructField("i_category", StringType()),
    StructField("i_manufact_id", IntegerType()),
    StructField("i_manufact", StringType()),
    StructField("i_size", StringType()),
    StructField("i_formulation", StringType()),
    StructField("i_color", StringType()),
    StructField("i_units", StringType()),
    StructField("i_container", StringType()),
    StructField("i_manager_id", IntegerType()),
    StructField("i_product_name", StringType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE item
	(
        i_item_sk                 integer               not null,
        i_item_id                 char(16)              not null,
        i_rec_start_date          date                          ,
        i_rec_end_date            date                          ,
        i_item_desc               varchar(200)                  ,
        i_current_price           decimal(7,2)                  ,
        i_wholesale_cost          decimal(7,2)                  ,
        i_brand_id                integer                       ,
        i_brand                   char(50)                      ,
        i_class_id                integer                       ,
        i_class                   char(50)                      ,
        i_category_id             integer                       ,
        i_category                char(50)                      ,
        i_manufact_id             integer                       ,
        i_manufact                char(50)                      ,
        i_size                    char(20)                      ,
        i_formulation             char(20)                      ,
        i_color                   char(20)                      ,
        i_units                   char(10)                      ,
        i_container               char(10)                      ,
        i_manager_id              integer                       ,
        i_product_name            char(50)                      
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# promotion
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'promotion'
file_name = 'promotion/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): p_cost is DoubleType here but decimal(15,2) in the DDL --
# confirm fn_process_data handles the conversion.
dataframe_schema = StructType([
    StructField("p_promo_sk", IntegerType()),
    StructField("p_promo_id", StringType()),
    StructField("p_start_date_sk", IntegerType()),
    StructField("p_end_date_sk", IntegerType()),
    StructField("p_item_sk", IntegerType()),
    StructField("p_cost", DoubleType()),
    StructField("p_response_target", IntegerType()),
    StructField("p_promo_name", StringType()),
    StructField("p_channel_dmail", StringType()),
    StructField("p_channel_email", StringType()),
    StructField("p_channel_catalog", StringType()),
    StructField("p_channel_tv", StringType()),
    StructField("p_channel_radio", StringType()),
    StructField("p_channel_press", StringType()),
    StructField("p_channel_event", StringType()),
    StructField("p_channel_demo", StringType()),
    StructField("p_channel_details", StringType()),
    StructField("p_purpose", StringType()),
    StructField("p_discount_active", StringType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE promotion
	(
        p_promo_sk                integer               not null,
        p_promo_id                char(16)              not null,
        p_start_date_sk           integer                       ,
        p_end_date_sk             integer                       ,
        p_item_sk                 integer                       ,
        p_cost                    decimal(15,2)                 ,
        p_response_target         integer                       ,
        p_promo_name              char(50)                      ,
        p_channel_dmail           char(1)                       ,
        p_channel_email           char(1)                       ,
        p_channel_catalog         char(1)                       ,
        p_channel_tv              char(1)                       ,
        p_channel_radio           char(1)                       ,
        p_channel_press           char(1)                       ,
        p_channel_event           char(1)                       ,
        p_channel_demo            char(1)                       ,
        p_channel_details         varchar(100)                  ,
        p_purpose                 char(15)                      ,
        p_discount_active         char(1)                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# reason
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'reason'
file_name = 'reason/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("r_reason_sk", IntegerType()),
    StructField("r_reason_id", StringType()),
    StructField("r_reason_desc", StringType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE reason
	(
        r_reason_sk               integer               not null,
        r_reason_id               char(16)              not null,
        r_reason_desc             char(100)                     
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# ship_mode
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'ship_mode'
file_name = 'ship_mode/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("sm_ship_mode_sk", IntegerType()),
    StructField("sm_ship_mode_id", StringType()),
    StructField("sm_type", StringType()),
    StructField("sm_code", StringType()),
    StructField("sm_carrier", StringType()),
    StructField("sm_contract", StringType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE ship_mode
	(
        sm_ship_mode_sk           integer               not null,
        sm_ship_mode_id           char(16)              not null,
        sm_type                   char(30)                      ,
        sm_code                   char(10)                      ,
        sm_carrier                char(20)                      ,
        sm_contract               char(20)                      
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# store
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'store'
file_name = 'store/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# The spelling "s_tax_precentage" is used consistently in both the schema and
# the DDL; do not "correct" one without the other.
# NOTE(review): s_gmt_offset and s_tax_precentage are DoubleType here but
# decimal(5,2) in the DDL -- confirm fn_process_data handles the conversion.
dataframe_schema = StructType([
    StructField("s_store_sk", IntegerType()),
    StructField("s_store_id", StringType()),
    StructField("s_rec_start_date", DateType()),
    StructField("s_rec_end_date", DateType()),
    StructField("s_closed_date_sk", IntegerType()),
    StructField("s_store_name", StringType()),
    StructField("s_number_employees", IntegerType()),
    StructField("s_floor_space", IntegerType()),
    StructField("s_hours", StringType()),
    StructField("s_manager", StringType()),
    StructField("s_market_id", IntegerType()),
    StructField("s_geography_class", StringType()),
    StructField("s_market_desc", StringType()),
    StructField("s_market_manager", StringType()),
    StructField("s_division_id", IntegerType()),
    StructField("s_division_name", StringType()),
    StructField("s_company_id", IntegerType()),
    StructField("s_company_name", StringType()),
    StructField("s_street_number", StringType()),
    StructField("s_street_name", StringType()),
    StructField("s_street_type", StringType()),
    StructField("s_suite_number", StringType()),
    StructField("s_city", StringType()),
    StructField("s_county", StringType()),
    StructField("s_state", StringType()),
    StructField("s_zip", StringType()),
    StructField("s_country", StringType()),
    StructField("s_gmt_offset", DoubleType()),
    StructField("s_tax_precentage", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE store
	(
        s_store_sk                integer               not null,
        s_store_id                char(16)              not null,
        s_rec_start_date          date                          ,
        s_rec_end_date            date                          ,
        s_closed_date_sk          integer                       ,
        s_store_name              varchar(50)                   ,
        s_number_employees        integer                       ,
        s_floor_space             integer                       ,
        s_hours                   char(20)                      ,
        s_manager                 varchar(40)                   ,
        s_market_id               integer                       ,
        s_geography_class         varchar(100)                  ,
        s_market_desc             varchar(100)                  ,
        s_market_manager          varchar(40)                   ,
        s_division_id             integer                       ,
        s_division_name           varchar(50)                   ,
        s_company_id              integer                       ,
        s_company_name            varchar(50)                   ,
        s_street_number           varchar(10)                   ,
        s_street_name             varchar(60)                   ,
        s_street_type             char(15)                      ,
        s_suite_number            char(10)                      ,
        s_city                    varchar(60)                   ,
        s_county                  varchar(30)                   ,
        s_state                   char(2)                       ,
        s_zip                     char(10)                      ,
        s_country                 varchar(20)                   ,
        s_gmt_offset              decimal(5,2)                  ,
        s_tax_precentage          decimal(5,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# store_returns
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'store_returns'
file_name = 'store_returns/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): the amount columns (sr_return_amt .. sr_net_loss) are
# DoubleType here but decimal(7,2) in the DDL -- confirm fn_process_data
# handles the conversion.
# NOTE(review): sr_ticket_number is a 32-bit integer in both the schema and
# the DDL -- confirm that range is sufficient for the data being loaded.
dataframe_schema = StructType([
    StructField("sr_returned_date_sk", IntegerType()),
    StructField("sr_return_time_sk", IntegerType()),
    StructField("sr_item_sk", IntegerType()),
    StructField("sr_customer_sk", IntegerType()),
    StructField("sr_cdemo_sk", IntegerType()),
    StructField("sr_hdemo_sk", IntegerType()),
    StructField("sr_addr_sk", IntegerType()),
    StructField("sr_store_sk", IntegerType()),
    StructField("sr_reason_sk", IntegerType()),
    StructField("sr_ticket_number", IntegerType()),
    StructField("sr_return_quantity", IntegerType()),
    StructField("sr_return_amt", DoubleType()),
    StructField("sr_return_tax", DoubleType()),
    StructField("sr_return_amt_inc_tax", DoubleType()),
    StructField("sr_fee", DoubleType()),
    StructField("sr_return_ship_cost", DoubleType()),
    StructField("sr_refunded_cash", DoubleType()),
    StructField("sr_reversed_charge", DoubleType()),
    StructField("sr_store_credit", DoubleType()),
    StructField("sr_net_loss", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE store_returns
	(
        sr_returned_date_sk       integer                       ,
        sr_return_time_sk         integer                       ,
        sr_item_sk                integer               not null,
        sr_customer_sk            integer                       ,
        sr_cdemo_sk               integer                       ,
        sr_hdemo_sk               integer                       ,
        sr_addr_sk                integer                       ,
        sr_store_sk               integer                       ,
        sr_reason_sk              integer                       ,
        sr_ticket_number          integer               not null,
        sr_return_quantity        integer                       ,
        sr_return_amt             decimal(7,2)                  ,
        sr_return_tax             decimal(7,2)                  ,
        sr_return_amt_inc_tax     decimal(7,2)                  ,
        sr_fee                    decimal(7,2)                  ,
        sr_return_ship_cost       decimal(7,2)                  ,
        sr_refunded_cash          decimal(7,2)                  ,
        sr_reversed_charge        decimal(7,2)                  ,
        sr_store_credit           decimal(7,2)                  ,
        sr_net_loss               decimal(7,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# store_sales
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'store_sales'
file_name = 'store_sales/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): the price/amount columns (ss_wholesale_cost .. ss_net_profit)
# are DoubleType here but decimal(7,2) in the DDL -- confirm fn_process_data
# handles the conversion.
# NOTE(review): ss_ticket_number is a 32-bit integer in both the schema and
# the DDL -- confirm that range is sufficient for the data being loaded.
dataframe_schema = StructType([
    StructField("ss_sold_date_sk", IntegerType()),
    StructField("ss_sold_time_sk", IntegerType()),
    StructField("ss_item_sk", IntegerType()),
    StructField("ss_customer_sk", IntegerType()),
    StructField("ss_cdemo_sk", IntegerType()),
    StructField("ss_hdemo_sk", IntegerType()),
    StructField("ss_addr_sk", IntegerType()),
    StructField("ss_store_sk", IntegerType()),
    StructField("ss_promo_sk", IntegerType()),
    StructField("ss_ticket_number", IntegerType()),
    StructField("ss_quantity", IntegerType()),
    StructField("ss_wholesale_cost", DoubleType()),
    StructField("ss_list_price", DoubleType()),
    StructField("ss_sales_price", DoubleType()),
    StructField("ss_ext_discount_amt", DoubleType()),
    StructField("ss_ext_sales_price", DoubleType()),
    StructField("ss_ext_wholesale_cost", DoubleType()),
    StructField("ss_ext_list_price", DoubleType()),
    StructField("ss_ext_tax", DoubleType()),
    StructField("ss_coupon_amt", DoubleType()),
    StructField("ss_net_paid", DoubleType()),
    StructField("ss_net_paid_inc_tax", DoubleType()),
    StructField("ss_net_profit", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE store_sales
	(
        ss_sold_date_sk           integer                       ,
        ss_sold_time_sk           integer                       ,
        ss_item_sk                integer               not null,
        ss_customer_sk            integer                       ,
        ss_cdemo_sk               integer                       ,
        ss_hdemo_sk               integer                       ,
        ss_addr_sk                integer                       ,
        ss_store_sk               integer                       ,
        ss_promo_sk               integer                       ,
        ss_ticket_number          integer               not null,
        ss_quantity               integer                       ,
        ss_wholesale_cost         decimal(7,2)                  ,
        ss_list_price             decimal(7,2)                  ,
        ss_sales_price            decimal(7,2)                  ,
        ss_ext_discount_amt       decimal(7,2)                  ,
        ss_ext_sales_price        decimal(7,2)                  ,
        ss_ext_wholesale_cost     decimal(7,2)                  ,
        ss_ext_list_price         decimal(7,2)                  ,
        ss_ext_tax                decimal(7,2)                  ,
        ss_coupon_amt             decimal(7,2)                  ,
        ss_net_paid               decimal(7,2)                  ,
        ss_net_paid_inc_tax       decimal(7,2)                  ,
        ss_net_profit             decimal(7,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# time_dim
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'time_dim'
file_name = 'time_dim/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("t_time_sk", IntegerType()),
    StructField("t_time_id", StringType()),
    StructField("t_time", IntegerType()),
    StructField("t_hour", IntegerType()),
    StructField("t_minute", IntegerType()),
    StructField("t_second", IntegerType()),
    StructField("t_am_pm", StringType()),
    StructField("t_shift", StringType()),
    StructField("t_sub_shift", StringType()),
    StructField("t_meal_time", StringType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE time_dim
	(
        t_time_sk                 integer               not null,
        t_time_id                 char(16)              not null,
        t_time                    integer                       ,
        t_hour                    integer                       ,
        t_minute                  integer                       ,
        t_second                  integer                       ,
        t_am_pm                   char(2)                       ,
        t_shift                   char(20)                      ,
        t_sub_shift               char(20)                      ,
        t_meal_time               char(20)                      
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# warehouse
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'warehouse'
file_name = 'warehouse/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): w_gmt_offset is DoubleType here but decimal(5,2) in the DDL --
# confirm fn_process_data handles the conversion.
dataframe_schema = StructType([
    StructField("w_warehouse_sk", IntegerType()),
    StructField("w_warehouse_id", StringType()),
    StructField("w_warehouse_name", StringType()),
    StructField("w_warehouse_sq_ft", IntegerType()),
    StructField("w_street_number", StringType()),
    StructField("w_street_name", StringType()),
    StructField("w_street_type", StringType()),
    StructField("w_suite_number", StringType()),
    StructField("w_city", StringType()),
    StructField("w_county", StringType()),
    StructField("w_state", StringType()),
    StructField("w_zip", StringType()),
    StructField("w_country", StringType()),
    StructField("w_gmt_offset", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE warehouse
	(
        w_warehouse_sk            integer               not null,
        w_warehouse_id            char(16)              not null,
        w_warehouse_name          varchar(20)                   ,
        w_warehouse_sq_ft         integer                       ,
        w_street_number           char(10)                      ,
        w_street_name             varchar(60)                   ,
        w_street_type             char(15)                      ,
        w_suite_number            char(10)                      ,
        w_city                    varchar(60)                   ,
        w_county                  varchar(30)                   ,
        w_state                   char(2)                       ,
        w_zip                     char(10)                      ,
        w_country                 varchar(20)                   ,
        w_gmt_offset              decimal(5,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# web_page
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'web_page'
file_name = 'web_page/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
dataframe_schema = StructType([
    StructField("wp_web_page_sk", IntegerType()),
    StructField("wp_web_page_id", StringType()),
    StructField("wp_rec_start_date", DateType()),
    StructField("wp_rec_end_date", DateType()),
    StructField("wp_creation_date_sk", IntegerType()),
    StructField("wp_access_date_sk", IntegerType()),
    StructField("wp_autogen_flag", StringType()),
    StructField("wp_customer_sk", IntegerType()),
    StructField("wp_url", StringType()),
    StructField("wp_type", StringType()),
    StructField("wp_char_count", IntegerType()),
    StructField("wp_link_count", IntegerType()),
    StructField("wp_image_count", IntegerType()),
    StructField("wp_max_ad_count", IntegerType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE web_page
	(
        wp_web_page_sk            integer               not null,
        wp_web_page_id            char(16)              not null,
        wp_rec_start_date         date                          ,
        wp_rec_end_date           date                          ,
        wp_creation_date_sk       integer                       ,
        wp_access_date_sk         integer                       ,
        wp_autogen_flag           char(1)                       ,
        wp_customer_sk            integer                       ,
        wp_url                    varchar(100)                  ,
        wp_type                   char(50)                      ,
        wp_char_count             integer                       ,
        wp_link_count             integer                       ,
        wp_image_count            integer                       ,
        wp_max_ad_count           integer                       
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# web_returns
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'web_returns'
file_name = 'web_returns/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): the amount columns (wr_return_amt .. wr_net_loss) are
# DoubleType here but decimal(7,2) in the DDL -- confirm fn_process_data
# handles the conversion.
# NOTE(review): wr_order_number is a 32-bit integer in both the schema and
# the DDL -- confirm that range is sufficient for the data being loaded.
dataframe_schema = StructType([
    StructField("wr_returned_date_sk", IntegerType()),
    StructField("wr_returned_time_sk", IntegerType()),
    StructField("wr_item_sk", IntegerType()),
    StructField("wr_refunded_customer_sk", IntegerType()),
    StructField("wr_refunded_cdemo_sk", IntegerType()),
    StructField("wr_refunded_hdemo_sk", IntegerType()),
    StructField("wr_refunded_addr_sk", IntegerType()),
    StructField("wr_returning_customer_sk", IntegerType()),
    StructField("wr_returning_cdemo_sk", IntegerType()),
    StructField("wr_returning_hdemo_sk", IntegerType()),
    StructField("wr_returning_addr_sk", IntegerType()),
    StructField("wr_web_page_sk", IntegerType()),
    StructField("wr_reason_sk", IntegerType()),
    StructField("wr_order_number", IntegerType()),
    StructField("wr_return_quantity", IntegerType()),
    StructField("wr_return_amt", DoubleType()),
    StructField("wr_return_tax", DoubleType()),
    StructField("wr_return_amt_inc_tax", DoubleType()),
    StructField("wr_fee", DoubleType()),
    StructField("wr_return_ship_cost", DoubleType()),
    StructField("wr_refunded_cash", DoubleType()),
    StructField("wr_reversed_charge", DoubleType()),
    StructField("wr_account_credit", DoubleType()),
    StructField("wr_net_loss", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE web_returns
	(
        wr_returned_date_sk       integer                       ,
        wr_returned_time_sk       integer                       ,
        wr_item_sk                integer               not null,
        wr_refunded_customer_sk   integer                       ,
        wr_refunded_cdemo_sk      integer                       ,
        wr_refunded_hdemo_sk      integer                       ,
        wr_refunded_addr_sk       integer                       ,
        wr_returning_customer_sk  integer                       ,
        wr_returning_cdemo_sk     integer                       ,
        wr_returning_hdemo_sk     integer                       ,
        wr_returning_addr_sk      integer                       ,
        wr_web_page_sk            integer                       ,
        wr_reason_sk              integer                       ,
        wr_order_number           integer               not null,
        wr_return_quantity        integer                       ,
        wr_return_amt             decimal(7,2)                  ,
        wr_return_tax             decimal(7,2)                  ,
        wr_return_amt_inc_tax     decimal(7,2)                  ,
        wr_fee                    decimal(7,2)                  ,
        wr_return_ship_cost       decimal(7,2)                  ,
        wr_refunded_cash          decimal(7,2)                  ,
        wr_reversed_charge        decimal(7,2)                  ,
        wr_account_credit         decimal(7,2)                  ,
        wr_net_loss               decimal(7,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# web_sales
# Inputs for fn_process_data (defined elsewhere in this notebook):
# the .dat file pattern and the table name.
table_name = 'web_sales'
file_name = 'web_sales/*.dat'

# DataFrame schema: one field per column, in the same order as the
# CREATE TABLE statement below.
# NOTE(review): the price/amount columns (ws_wholesale_cost .. ws_net_profit)
# are DoubleType here but decimal(7,2) in the DDL -- confirm fn_process_data
# handles the conversion.
# NOTE(review): ws_order_number is a 32-bit integer in both the schema and
# the DDL -- confirm that range is sufficient for the data being loaded.
dataframe_schema = StructType([
    StructField("ws_sold_date_sk", IntegerType()),
    StructField("ws_sold_time_sk", IntegerType()),
    StructField("ws_ship_date_sk", IntegerType()),
    StructField("ws_item_sk", IntegerType()),
    StructField("ws_bill_customer_sk", IntegerType()),
    StructField("ws_bill_cdemo_sk", IntegerType()),
    StructField("ws_bill_hdemo_sk", IntegerType()),
    StructField("ws_bill_addr_sk", IntegerType()),
    StructField("ws_ship_customer_sk", IntegerType()),
    StructField("ws_ship_cdemo_sk", IntegerType()),
    StructField("ws_ship_hdemo_sk", IntegerType()),
    StructField("ws_ship_addr_sk", IntegerType()),
    StructField("ws_web_page_sk", IntegerType()),
    StructField("ws_web_site_sk", IntegerType()),
    StructField("ws_ship_mode_sk", IntegerType()),
    StructField("ws_warehouse_sk", IntegerType()),
    StructField("ws_promo_sk", IntegerType()),
    StructField("ws_order_number", IntegerType()),
    StructField("ws_quantity", IntegerType()),
    StructField("ws_wholesale_cost", DoubleType()),
    StructField("ws_list_price", DoubleType()),
    StructField("ws_sales_price", DoubleType()),
    StructField("ws_ext_discount_amt", DoubleType()),
    StructField("ws_ext_sales_price", DoubleType()),
    StructField("ws_ext_wholesale_cost", DoubleType()),
    StructField("ws_ext_list_price", DoubleType()),
    StructField("ws_ext_tax", DoubleType()),
    StructField("ws_coupon_amt", DoubleType()),
    StructField("ws_ext_ship_cost", DoubleType()),
    StructField("ws_net_paid", DoubleType()),
    StructField("ws_net_paid_inc_tax", DoubleType()),
    StructField("ws_net_paid_inc_ship", DoubleType()),
    StructField("ws_net_paid_inc_ship_tax", DoubleType()),
    StructField("ws_net_profit", DoubleType()),
])

# Table DDL. '[DELTA_TABLE_LOCATION]' is a placeholder; presumably
# fn_process_data swaps in the real path -- confirm there.
delta_table_definition = """
CREATE TABLE web_sales
	(
        ws_sold_date_sk           integer                       ,
        ws_sold_time_sk           integer                       ,
        ws_ship_date_sk           integer                       ,
        ws_item_sk                integer               not null,
        ws_bill_customer_sk       integer                       ,
        ws_bill_cdemo_sk          integer                       ,
        ws_bill_hdemo_sk          integer                       ,
        ws_bill_addr_sk           integer                       ,
        ws_ship_customer_sk       integer                       ,
        ws_ship_cdemo_sk          integer                       ,
        ws_ship_hdemo_sk          integer                       ,
        ws_ship_addr_sk           integer                       ,
        ws_web_page_sk            integer                       ,
        ws_web_site_sk            integer                       ,
        ws_ship_mode_sk           integer                       ,
        ws_warehouse_sk           integer                       ,
        ws_promo_sk               integer                       ,
        ws_order_number           integer               not null,
        ws_quantity               integer                       ,
        ws_wholesale_cost         decimal(7,2)                  ,
        ws_list_price             decimal(7,2)                  ,
        ws_sales_price            decimal(7,2)                  ,
        ws_ext_discount_amt       decimal(7,2)                  ,
        ws_ext_sales_price        decimal(7,2)                  ,
        ws_ext_wholesale_cost     decimal(7,2)                  ,
        ws_ext_list_price         decimal(7,2)                  ,
        ws_ext_tax                decimal(7,2)                  ,
        ws_coupon_amt             decimal(7,2)                  ,
        ws_ext_ship_cost          decimal(7,2)                  ,
        ws_net_paid               decimal(7,2)                  ,
        ws_net_paid_inc_tax       decimal(7,2)                  ,
        ws_net_paid_inc_ship      decimal(7,2)                  ,
        ws_net_paid_inc_ship_tax  decimal(7,2)                  ,
        ws_net_profit             decimal(7,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)

In [None]:
# web_site
# Inputs for the web_site load: a glob pattern for the source .dat files and
# the name of the table being built.
file_name = 'web_site/*.dat'
table_name = 'web_site'

# DataFrame schema for the web_site source files. Fields are added one at a
# time in the same order as the columns of the CREATE TABLE statement below;
# nullability is left at the PySpark default for every field.
# NOTE(review): web_gmt_offset and web_tax_percentage are DoubleType here but
# decimal(5,2) in the table definition, and web_site_sk / web_site_id are
# "not null" only in the table definition. Confirm fn_process_data reconciles
# these differences.
dataframe_schema = (
    StructType()
    .add("web_site_sk", IntegerType())
    .add("web_site_id", StringType())
    .add("web_rec_start_date", DateType())
    .add("web_rec_end_date", DateType())
    .add("web_name", StringType())
    .add("web_open_date_sk", IntegerType())
    .add("web_close_date_sk", IntegerType())
    .add("web_class", StringType())
    .add("web_manager", StringType())
    .add("web_mkt_id", IntegerType())
    .add("web_mkt_class", StringType())
    .add("web_mkt_desc", StringType())
    .add("web_market_manager", StringType())
    .add("web_company_id", IntegerType())
    .add("web_company_name", StringType())
    .add("web_street_number", StringType())
    .add("web_street_name", StringType())
    .add("web_street_type", StringType())
    .add("web_suite_number", StringType())
    .add("web_city", StringType())
    .add("web_county", StringType())
    .add("web_state", StringType())
    .add("web_zip", StringType())
    .add("web_country", StringType())
    .add("web_gmt_offset", DoubleType())
    .add("web_tax_percentage", DoubleType())
)

# Table definition handed to fn_process_data as plain text.
# '[DELTA_TABLE_LOCATION]' is a placeholder token; presumably it is replaced
# with the real table path inside fn_process_data -- TODO confirm.
delta_table_definition = """
CREATE TABLE web_site
	(
        web_site_sk               integer               not null,
        web_site_id               char(16)              not null,
        web_rec_start_date        date                          ,
        web_rec_end_date          date                          ,
        web_name                  varchar(50)                   ,
        web_open_date_sk          integer                       ,
        web_close_date_sk         integer                       ,
        web_class                 varchar(50)                   ,
        web_manager               varchar(40)                   ,
        web_mkt_id                integer                       ,
        web_mkt_class             varchar(50)                   ,
        web_mkt_desc              varchar(100)                  ,
        web_market_manager        varchar(40)                   ,
        web_company_id            integer                       ,
        web_company_name          char(50)                      ,
        web_street_number         char(10)                      ,
        web_street_name           varchar(60)                   ,
        web_street_type           char(15)                      ,
        web_suite_number          char(10)                      ,
        web_city                  varchar(60)                   ,
        web_county                varchar(30)                   ,
        web_state                 char(2)                       ,
        web_zip                   char(10)                      ,
        web_country               varchar(20)                   ,
        web_gmt_offset            decimal(5,2)                  ,
        web_tax_percentage        decimal(5,2)                  
	)
	LOCATION '[DELTA_TABLE_LOCATION]'
"""

# Run the shared load routine for this table.
fn_process_data(file_name, table_name, dataframe_schema, delta_table_definition)