
### Transform and Export Processed Data to ADLS 

Extract data from the imported database, assuming the default Access NI schema.
The data is converted to flat, denormalized CSV files and exported to the ADLS **silver** container.

One folder of CSV output will be created for each Sequence File name that was called.


In [None]:
# Script-wide settings
debug_mode = True
database_name = "dbo"

# For every storage layer:
#   *_source - base location of the layer (value redacted; presumably the
#              container URI -- confirm against the workspace configuration)
#   *_target - base location with the database name appended
#   *_mnt    - mount path of the layer for this database

# Bronze layer
bronze_source = "REDACTED"
bronze_target = f"{bronze_source}{database_name}"
bronze_mnt = f"/mnt/bronze/{database_name}"

# Silver layer
silver_source = "REDACTED"
silver_target = f"{silver_source}{database_name}"
silver_mnt = f"/mnt/silver/{database_name}"

# Gold layer
gold_source = "REDACTED"
gold_target = f"{gold_source}{database_name}"
gold_mnt = f"/mnt/gold/{database_name}"

In [None]:
# Get all desired test runs.
# START_DATE_TIME is cast to TIMESTAMP, TEST_SOCKET_INDEX and EXECUTION_TIME to INT,
# and DATA to a DOUBLE rounded to 3 decimal places.
# Every CAST carries an explicit alias so the Row keys used further down do not
# depend on how Spark auto-names cast expressions.
# NOTE: the WHERE conditions on STEP_RESULT and PROP_RESULT discard rows where those
# joins found no match; only the STEP_SEQCALL join can still contribute NULLs
# (i.e. SEQUENCE_FILE_PATH may be NULL in the result).

sql_string_crosstab = """
    SELECT  STEP_SEQCALL.SEQUENCE_FILE_PATH,
            UUT_RESULT.ID,
            CAST(UUT_RESULT.START_DATE_TIME AS TIMESTAMP) AS START_DATE_TIME, 
            UUT_RESULT.STATION_ID, 
            UUT_RESULT.UUT_SERIAL_NUMBER,
            CAST(UUT_RESULT.TEST_SOCKET_INDEX AS INT) AS TEST_SOCKET_INDEX, 
            UUT_RESULT.UUT_STATUS, 
            CAST(UUT_RESULT.EXECUTION_TIME AS INT) AS EXECUTION_TIME,
            STEP_RESULT.STEP_NAME, 
            ROUND(CAST(PROP_RESULT.DATA AS DOUBLE), 3) AS DATA
    FROM UUT_RESULT LEFT JOIN STEP_RESULT 
        ON UUT_RESULT.ID = STEP_RESULT.UUT_RESULT
            LEFT JOIN PROP_RESULT 
                ON STEP_RESULT.ID = PROP_RESULT.STEP_RESULT
                    LEFT JOIN STEP_SEQCALL 
                        ON STEP_SEQCALL.STEP_RESULT = STEP_RESULT.STEP_PARENT
    WHERE UUT_STATUS <> 'Terminated' 
        and UUT_STATUS <> 'Error' 
        and STATION_ID is not NULL
        and STEP_RESULT.STEP_TYPE = 'NumericLimitTest'   
        and PROP_RESULT.TYPE_NAME = 'NumericLimitTest'
    ORDER BY START_DATE_TIME, TEST_SOCKET_INDEX, STEP_RESULT.ORDER_NUMBER ASC
    """

test_results = spark.sql(sql_string_crosstab)


def _sequence_name(file_path):
    """Return the base name of a sequence file path, without directory or extension.

    The path is split on backslashes and the last component is kept; everything
    from the first '.' of that component onwards is dropped.

    Returns None when file_path is None. This happens for steps whose parent has
    no matching STEP_SEQCALL row (the join is a LEFT JOIN); such rows are kept
    with a null Sequence_Name instead of failing the whole job.
    """
    if file_path is None:
        return None
    return file_path.split('\\')[-1].split('.')[0]


# Transform raw Column of filepaths to clean Column of Sequence_Name's
rdd=test_results.rdd.map(lambda x: 
    (_sequence_name(x['SEQUENCE_FILE_PATH'])
    ,x['ID']
    ,x['START_DATE_TIME']
    ,x['STATION_ID']
    ,x['UUT_SERIAL_NUMBER']
    ,x['TEST_SOCKET_INDEX']
    ,x['UUT_STATUS']
    ,x['EXECUTION_TIME']
    ,x['STEP_NAME']
    ,x['DATA']
    )
)  
# Rename the positional tuple fields to the export column names
run_steps=rdd.toDF(
    ["Sequence_Name"
    ,"Run_ID"
    ,"Test_Start"
    ,"Station_ID"
    ,"Serial_Number"
    ,"Test_Socket"
    ,"Test_Status"
    ,"Test_Time_Sec"
    ,"Step_Name"
    ,"Data"
    ]
)

display(run_steps)

In [None]:
# Write run_steps as CSV (with header row) to the silver target, overwriting any
# previous export. partitionBy places the rows of each Sequence_Name in its own folder.
# The partition column is spelled exactly as in the DataFrame schema ("Sequence_Name")
# so the write does not rely on case-insensitive column resolution.
run_steps.write.mode("overwrite").option("header", "true").partitionBy("Sequence_Name").csv(silver_target)
