# Soda scan all batches

## Setup

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import datetime
from ipywidgets import Output
import pandas as pd
from pathlib import Path 
from soda.scan import Scan
from soda.common.json_helper import JsonHelper

from delta import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import input_file_name, split, regexp_extract, to_timestamp, explode

from utils.data_generation import FakerProfileDataSnapshot
from utils.soda_utils import build_scan_results, save_scan_results_with_spark_as_json

In [3]:
from soda.scan import Scan
from soda.common.json_helper import JsonHelper

def build_scan_results(scan) -> dict:

    checks = [
        check.get_cloud_dict() for check in scan._checks if check.outcome is not None and check.archetype is None
    ]
    autoamted_monitoring_checks = [
        check.get_cloud_dict()
        for check in scan._checks
        if check.outcome is not None and check.archetype is not None
    ]

    # TODO: [SODA-608] separate profile columns and sample tables by aligning with the backend team
    profiling = [
        profile_table.get_cloud_dict()
        for profile_table in scan._profile_columns_result_tables + scan._sample_tables_result_tables
    ]

    return JsonHelper.to_jsonnable(  # type: ignore
        {
            "definitionName": scan._scan_definition_name,
            "defaultDataSource": scan._data_source_name,
            "dataTimestamp": scan._data_timestamp,
            "scanStartTimestamp": scan._scan_start_timestamp,
            "scanEndTimestamp": scan._scan_end_timestamp,
            "hasErrors": scan.has_error_logs(),
            "hasWarnings": scan.has_check_warns(),
            "hasFailures": scan.has_check_fails(),
            "metrics": [metric.get_cloud_dict() for metric in scan._metrics],
            # If archetype is not None, it means that check is automated monitoring
            "checks": checks,
            # TODO Queries are not supported by Soda Cloud yet.
            # "queries": [query.get_cloud_dict() for query in scan._queries],
            "automatedMonitoringChecks": autoamted_monitoring_checks,
            "profiling": profiling,
            "metadata": [
                discover_tables_result.get_cloud_dict()
                for discover_tables_result in scan._discover_tables_result_tables
            ],
            "logs": [log.get_cloud_dict() for log in scan._logs.logs],
        }
    )


In [4]:
from delta import *

In [5]:
from pyspark.sql import SparkSession
from delta import *

builder = SparkSession.builder.appName("data-pipeline") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [6]:
# object for data generation
datagen = FakerProfileDataSnapshot(
    data_dir=str(Path('./data').resolve())
)
# clean landing storage
datagen.delete_batches_in_landing_dir()
datagen.delete_bronze_dir()
datagen.delete_bronze_scan_results_dir()

## Parameters

In [7]:
read_csv_options = {'header': 'true', 'inferSchema': 'true', 'delimiter': ',', 'quote': '"', 'multiLine': 'true'}

In [8]:
location_landing = datagen.landing_dir
location_bronze = datagen.bronze_dir
location_bronze_scan_results = datagen.bronze_scan_results_dir

In [9]:
sodacl_yaml_str = '''
checks for snapshot:
  - freshness(_delivery_date, FAKE_NOW) < 8d
  - schema:
      fail:
        when required column missing:
          - id
          - mail
          - birthdate
          - _delivery_date  
  - row_count > 0
  - duplicate_count(id) = 0
  - duplicate_count(mail) = 0
  - missing_count(id) = 0
  - invalid_count(mail) = 0:
      valid format: email
  - invalid_percent(birthdate) = 0:
      valid length: 10
discover tables:
  tables:
    - include snapshot
profile columns:
  columns:
    - snapshot.%
'''

## Processing all patches

In [10]:
for batch_id in range(1, 8):
    print('batch id :', batch_id)

    print('land file')
    path_csv = datagen.land_batch(batch_id)
    print(path_csv)

    print('ingest data to bronce delta table')
    df_landing = spark.read.format('csv').options(**read_csv_options).load(str(path_csv)) \
        .withColumn("_delivery_date", 
                    to_timestamp(
                        regexp_extract(
                            input_file_name(), '.*_(.*).csv$', 1
                        )
                    )
                   )
    df_landing.write.format('delta').option('overwriteSchema', 'true').mode('overwrite').save(location_bronze)

    print('recreate table')
    spark.sql('DROP TABLE IF EXISTS snapshot')
    spark.sql(f'CREATE TABLE IF NOT EXISTS snapshot USING DELTA LOCATION "{location_bronze}"')

    print('delete data in landing zone')
    datagen.delete_batches_in_landing_dir()

    print('prepare soda scan')
    df_raw = spark.sql('select * from snapshot')
    delta_history = spark.sql(f'DESCRIBE HISTORY snapshot').toPandas()
    display(delta_history)
    delta_table_version = max(delta_history['version'])
    
    scan = Scan()
    scan.disable_telemetry()
    scan.set_data_source_name('snapshot')
    scan.add_spark_session(spark, 'snapshot')
    
    # with every delivery, the fake now of processing and scanning is always two days later than the delivery date
    fake_now = datetime.date.fromisoformat('2022-08-12') + datetime.timedelta(days=(7 * batch_id) + ((batch_id - 1) * 2))
    print('fake_now:', fake_now)
    scan.add_variables({"FAKE_NOW": f"{fake_now} 00:00:00"})
    
    scan.add_sodacl_yaml_str(sodacl_yaml_str)
    scan.set_verbose(False) # this does not stop the scan.execute() call to print all kind of out put in this notebook
    
    print('soda scan')
    # we avoid it with the following two lines instead
    output = Output()
    with output:
        exit_code = scan.execute()
    print('exit_code:', exit_code)
    
    print('build scan result')
    scan_results = build_scan_results(scan)
    scan_results['delta_version'] = max(delta_history['version'])
    print('store scan result')
    filepath_scan_result = location_bronze_scan_results + f'/scan_results_{delta_table_version}.json'
    print(filepath_scan_result)
    rdd = spark.sparkContext.parallelize([JsonHelper.to_json(scan_results)])
    rdd.coalesce(1).saveAsTextFile(filepath_scan_result)


batch id : 1
land file
/home/jovyan/work/data/landing/snapshot_2022-08-19.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-08-19
soda scan
exit_code: 0
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_0.json
batch id : 2
land file
/home/jovyan/work/data/landing/snapshot_2022-08-26.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-08-28
soda scan
exit_code: 0
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_1.json
batch id : 3
land file
/home/jovyan/work/data/landing/snapshot_2022-09-02.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2,2022-08-21 16:46:45.546,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '3662...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-09-06
soda scan
exit_code: 0
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_2.json
batch id : 4
land file
/home/jovyan/work/data/landing/snapshot_2022-09-09.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,3,2022-08-21 16:47:16.839,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,2.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '5246...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,2,2022-08-21 16:46:45.546,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '3662...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-09-15
soda scan
exit_code: 0
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_3.json
batch id : 5
land file
/home/jovyan/work/data/landing/snapshot_2022-09-16.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,4,2022-08-21 16:47:53.032,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,3.0,Serializable,False,"{'numOutputRows': '9', 'numOutputBytes': '5651...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,3,2022-08-21 16:47:16.839,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,2.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '5246...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,2,2022-08-21 16:46:45.546,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '3662...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
4,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-09-24
soda scan
exit_code: 0
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_4.json
batch id : 6
land file
/home/jovyan/work/data/landing/snapshot_2022-09-23.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,5,2022-08-21 16:48:25.695,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,4.0,Serializable,False,"{'numOutputRows': '11', 'numOutputBytes': '584...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,4,2022-08-21 16:47:53.032,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,3.0,Serializable,False,"{'numOutputRows': '9', 'numOutputBytes': '5651...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,3,2022-08-21 16:47:16.839,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,2.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '5246...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,2,2022-08-21 16:46:45.546,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '3662...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
4,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
5,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-10-03
soda scan
exit_code: 2
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_5.json
batch id : 7
land file
/home/jovyan/work/data/landing/snapshot_2022-09-30.csv
ingest data to bronce delta table
recreate table
delete data in landing zone
prepare soda scan


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,6,2022-08-21 16:48:58.159,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,5.0,Serializable,False,"{'numOutputRows': '14', 'numOutputBytes': '626...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
1,5,2022-08-21 16:48:25.695,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,4.0,Serializable,False,"{'numOutputRows': '11', 'numOutputBytes': '584...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
2,4,2022-08-21 16:47:53.032,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,3.0,Serializable,False,"{'numOutputRows': '9', 'numOutputBytes': '5651...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
3,3,2022-08-21 16:47:16.839,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,2.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '5246...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
4,2,2022-08-21 16:46:45.546,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,1.0,Serializable,False,"{'numOutputRows': '7', 'numOutputBytes': '3662...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
5,1,2022-08-21 16:46:14.282,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,0.0,Serializable,False,"{'numOutputRows': '6', 'numOutputBytes': '3561...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0
6,0,2022-08-21 16:45:23.128,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '3309...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


fake_now: 2022-10-12
soda scan
exit_code: 2
build scan result
store scan result
/home/jovyan/work/data/bronze_scan_results/scan_results_6.json


In [11]:
sdf_sr = spark.read.json(location_bronze_scan_results + f'/scan_results_*.json')
sdf_sr.toPandas()

Unnamed: 0,automatedMonitoringChecks,checks,dataTimestamp,defaultDataSource,definitionName,delta_version,hasErrors,hasFailures,hasWarnings,logs,metadata,metrics,profiling,scanEndTimestamp,scanStartTimestamp
0,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:49:00+00:00,snapshot,,6,False,True,False,"[(123, info, Soda Core 3.0.4, 2022-08-21T16:49...","[(snapshot, 14, [Row(columnName='id', sourceDa...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=10.5, ...",2022-08-21T16:49:26+00:00,2022-08-21T16:49:00+00:00
1,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:48:28+00:00,snapshot,,5,False,True,False,"[(96, info, Soda Core 3.0.4, 2022-08-21T16:48:...","[(snapshot, 11, [Row(columnName='id', sourceDa...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=7.0, a...",2022-08-21T16:48:56+00:00,2022-08-21T16:48:28+00:00
2,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:47:56+00:00,snapshot,,4,False,False,False,"[(77, info, Soda Core 3.0.4, 2022-08-21T16:47:...","[(snapshot, 9, [Row(columnName='id', sourceDat...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=5.7777...",2022-08-21T16:48:24+00:00,2022-08-21T16:47:56+00:00
3,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:47:19+00:00,snapshot,,3,False,False,False,"[(58, info, Soda Core 3.0.4, 2022-08-21T16:47:...","[(snapshot, 7, [Row(columnName='id', sourceDat...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=4.4285...",2022-08-21T16:47:51+00:00,2022-08-21T16:47:19+00:00
4,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:46:48+00:00,snapshot,,2,False,False,False,"[(39, info, Soda Core 3.0.4, 2022-08-21T16:46:...","[(snapshot, 7, [Row(columnName='id', sourceDat...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=4.1428...",2022-08-21T16:47:15+00:00,2022-08-21T16:46:48+00:00
5,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:46:17+00:00,snapshot,,1,False,False,False,"[(20, info, Soda Core 3.0.4, 2022-08-21T16:46:...","[(snapshot, 6, [Row(columnName='id', sourceDat...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=2.5, a...",2022-08-21T16:46:44+00:00,2022-08-21T16:46:17+00:00
6,[],"[(None, snapshot, checks for snapshot:\n - sc...",2022-08-21T16:45:32+00:00,snapshot,,0,False,False,False,"[(1, info, Soda Core 3.0.4, 2022-08-21T16:45:3...","[(snapshot, 4, [Row(columnName='id', sourceDat...","[(None, None, metric-snapshot-snapshot-id-dupl...","[([Row(columnName='id', profile=Row(avg=1.5, a...",2022-08-21T16:46:11+00:00,2022-08-21T16:45:32+00:00


In [12]:
sdf_sr.printSchema()

root
 |-- automatedMonitoringChecks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- checks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- column: string (nullable = true)
 |    |    |-- dataSource: string (nullable = true)
 |    |    |-- definition: string (nullable = true)
 |    |    |-- diagnostics: struct (nullable = true)
 |    |    |    |-- fail: struct (nullable = true)
 |    |    |    |    |-- greaterThan: double (nullable = true)
 |    |    |    |    |-- lessThan: double (nullable = true)
 |    |    |    |    |-- lessThanOrEqual: double (nullable = true)
 |    |    |    |-- freshness: string (nullable = true)
 |    |    |    |-- maxColumnTimestamp: string (nullable = true)
 |    |    |    |-- maxColumnTimestampUtc: string (nullable = true)
 |    |    |    |-- nowTimestamp: string (nullable = true)
 |    |    |    |-- nowTimestampUtc: string (nullable = true)
 |    |    |    |-- nowVariableName: string (nulla

## Read scan results

Helpful links 
* https://stackoverflow.com/questions/63838239/pyspark-dataframe-column-contains-array-of-dictionaries-want-to-make-each-key-f
* https://stackoverflow.com/questions/49416637/access-dataframes-row-inside-row-nested-json-with-pyspark

### Checks

#### Table checks

In [13]:
sdf_sr.select(
    sdf_sr.delta_version, 
    explode(sdf_sr.checks).alias("checks_ex")
).select('delta_version', 'checks_ex.*', 'checks_ex.diagnostics.value') \
.filter(F.col('column').isNull()).toPandas()

Unnamed: 0,delta_version,column,dataSource,definition,diagnostics,identity,location,metrics,name,outcome,table,type,value
0,6,,snapshot,checks for snapshot:\n - schema:\n fail:...,"(None, None, None, None, None, None, None, [(i...",a9f97edf,"(5, sodacl_string.yml, 4)",[metric-snapshot-snapshot-schema],Schema Check,pass,snapshot,schema,
1,6,,snapshot,checks for snapshot:\n row_count > 0,"((None, None, 0.0), None, None, None, None, No...",0c97eed4,"(5, sodacl_string.yml, 11)",[metric-snapshot-snapshot-row_count],row_count > 0,pass,snapshot,metricThreshold,14.0
2,5,,snapshot,checks for snapshot:\n - schema:\n fail:...,"(None, None, None, None, None, None, None, [(i...",a9f97edf,"(5, sodacl_string.yml, 4)",[metric-snapshot-snapshot-schema],Schema Check,pass,snapshot,schema,
3,5,,snapshot,checks for snapshot:\n row_count > 0,"((None, None, 0.0), None, None, None, None, No...",0c97eed4,"(5, sodacl_string.yml, 11)",[metric-snapshot-snapshot-row_count],row_count > 0,pass,snapshot,metricThreshold,11.0
4,4,,snapshot,checks for snapshot:\n - schema:\n fail:...,"(None, None, None, None, None, None, None, [(i...",a9f97edf,"(5, sodacl_string.yml, 4)",[metric-snapshot-snapshot-schema],Schema Check,pass,snapshot,schema,
5,4,,snapshot,checks for snapshot:\n row_count > 0,"((None, None, 0.0), None, None, None, None, No...",0c97eed4,"(5, sodacl_string.yml, 11)",[metric-snapshot-snapshot-row_count],row_count > 0,pass,snapshot,metricThreshold,9.0
6,3,,snapshot,checks for snapshot:\n - schema:\n fail:...,"(None, None, None, None, None, None, None, [(i...",a9f97edf,"(5, sodacl_string.yml, 4)",[metric-snapshot-snapshot-schema],Schema Check,pass,snapshot,schema,
7,3,,snapshot,checks for snapshot:\n row_count > 0,"((None, None, 0.0), None, None, None, None, No...",0c97eed4,"(5, sodacl_string.yml, 11)",[metric-snapshot-snapshot-row_count],row_count > 0,pass,snapshot,metricThreshold,7.0
8,2,,snapshot,checks for snapshot:\n - schema:\n fail:...,"(None, None, None, None, None, None, None, [(i...",a9f97edf,"(5, sodacl_string.yml, 4)",[metric-snapshot-snapshot-schema],Schema Check,pass,snapshot,schema,
9,2,,snapshot,checks for snapshot:\n row_count > 0,"((None, None, 0.0), None, None, None, None, No...",0c97eed4,"(5, sodacl_string.yml, 11)",[metric-snapshot-snapshot-row_count],row_count > 0,pass,snapshot,metricThreshold,7.0


#### All checks of particular column (here id)

In [14]:
sdf_sr.select(
    sdf_sr.delta_version, 
    explode(sdf_sr.checks).alias("checks_ex")
).select('delta_version', 'checks_ex.*', 'checks_ex.diagnostics.value') \
.filter(F.col('column') == 'id').toPandas()

Unnamed: 0,delta_version,column,dataSource,definition,diagnostics,identity,location,metrics,name,outcome,table,type,value
0,6,id,snapshot,checks for snapshot:\n duplicate_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",6a2f657f,"(5, sodacl_string.yml, 12)",[metric-snapshot-snapshot-id-duplicate_count],duplicate_count(id) = 0,pass,snapshot,metricThreshold,0.0
1,6,id,snapshot,checks for snapshot:\n missing_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",33f7df58,"(5, sodacl_string.yml, 14)",[metric-snapshot-snapshot-id-missing_count],missing_count(id) = 0,pass,snapshot,metricThreshold,0.0
2,5,id,snapshot,checks for snapshot:\n duplicate_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",6a2f657f,"(5, sodacl_string.yml, 12)",[metric-snapshot-snapshot-id-duplicate_count],duplicate_count(id) = 0,pass,snapshot,metricThreshold,0.0
3,5,id,snapshot,checks for snapshot:\n missing_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",33f7df58,"(5, sodacl_string.yml, 14)",[metric-snapshot-snapshot-id-missing_count],missing_count(id) = 0,pass,snapshot,metricThreshold,0.0
4,4,id,snapshot,checks for snapshot:\n duplicate_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",6a2f657f,"(5, sodacl_string.yml, 12)",[metric-snapshot-snapshot-id-duplicate_count],duplicate_count(id) = 0,pass,snapshot,metricThreshold,0.0
5,4,id,snapshot,checks for snapshot:\n missing_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",33f7df58,"(5, sodacl_string.yml, 14)",[metric-snapshot-snapshot-id-missing_count],missing_count(id) = 0,pass,snapshot,metricThreshold,0.0
6,3,id,snapshot,checks for snapshot:\n duplicate_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",6a2f657f,"(5, sodacl_string.yml, 12)",[metric-snapshot-snapshot-id-duplicate_count],duplicate_count(id) = 0,pass,snapshot,metricThreshold,0.0
7,3,id,snapshot,checks for snapshot:\n missing_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",33f7df58,"(5, sodacl_string.yml, 14)",[metric-snapshot-snapshot-id-missing_count],missing_count(id) = 0,pass,snapshot,metricThreshold,0.0
8,2,id,snapshot,checks for snapshot:\n duplicate_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",6a2f657f,"(5, sodacl_string.yml, 12)",[metric-snapshot-snapshot-id-duplicate_count],duplicate_count(id) = 0,pass,snapshot,metricThreshold,0.0
9,2,id,snapshot,checks for snapshot:\n missing_count(id) = 0,"((0.0, 0.0, None), None, None, None, None, Non...",33f7df58,"(5, sodacl_string.yml, 14)",[metric-snapshot-snapshot-id-missing_count],missing_count(id) = 0,pass,snapshot,metricThreshold,0.0


### Metrics
#### Table metrics

In [15]:
sdf_sr.select(
    sdf_sr.delta_version, 
    explode(sdf_sr.metrics).alias("metrics_ex")
).select('delta_version', 'metrics_ex.*') \
.filter(F.col('columnName').isNull()) \
.toPandas()

Unnamed: 0,delta_version,columnName,dataSourceName,identity,metricName,partitionName,tableName,value
0,6,,,metric-snapshot-snapshot-id-duplicate_count,duplicate_count,,,0
1,6,,,metric-snapshot-snapshot-birthdate-invalid_cou...,invalid_count,,,0
2,6,,,metric-snapshot-snapshot-_delivery_date-max,max,,,2022-09-30T00:00:00+00:00
3,6,,snapshot,metric-snapshot-snapshot-schema,schema,,snapshot,"[{""columnName"":""id"",""sourceDataType"":""int""},{""..."
4,6,,,metric-snapshot-snapshot-row_count,row_count,,,14
...,...,...,...,...,...,...,...,...
58,0,,,metric-snapshot-snapshot-row_count,row_count,,,4
59,0,,,metric-snapshot-snapshot-mail-duplicate_count,duplicate_count,,,0
60,0,,,metric-snapshot-snapshot-mail-invalid_count-61...,invalid_count,,,0
61,0,,,metric-snapshot-snapshot-birthdate-invalid_per...,invalid_percent,,,0.0


#### All metrics of particular column

In [16]:
sdf_sr.select(
    sdf_sr.delta_version, 
    explode(sdf_sr.metrics).alias("metrics_ex")
).select('delta_version', 'metrics_ex.*') \
.filter(F.col('columnName') == 'id').toPandas()
# why is this empty ???

Unnamed: 0,delta_version,columnName,dataSourceName,identity,metricName,partitionName,tableName,value


In [17]:
sdf_sr.select(
    sdf_sr.delta_version, 
    explode(sdf_sr.checks).alias("checks_ex")) \
.withColumn("checks_ex.diagnostics.value", F.col("checks_ex.diagnostics.value")) \
.withColumn("checks_ex.diagnostics.value", F.col("checks_ex.diagnostics.value")) \
.show()


+-------------+--------------------+---------------------------+
|delta_version|           checks_ex|checks_ex.diagnostics.value|
+-------------+--------------------+---------------------------+
|            6|{null, snapshot, ...|                       null|
|            6|{null, snapshot, ...|                       14.0|
|            6|{_delivery_date, ...|                        0.0|
|            6|{id, snapshot, ch...|                        0.0|
|            6|{id, snapshot, ch...|                        0.0|
|            6|{mail, snapshot, ...|                        0.0|
|            6|{mail, snapshot, ...|                        0.0|
|            6|{birthdate, snaps...|                        0.0|
|            5|{null, snapshot, ...|                       null|
|            5|{null, snapshot, ...|                       11.0|
|            5|{_delivery_date, ...|                        0.0|
|            5|{id, snapshot, ch...|                        0.0|
|            5|{id, snaps