# Soda scan delta table

## Setup

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import datetime
from pathlib import Path

import pandas as pd
from ipywidgets import Output
from soda.scan import Scan
from soda.common.json_helper import JsonHelper

from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, split, regexp_extract, to_timestamp

from utils.data_generation import FakerProfileDataSnapshot
from utils.soda_utils import build_scan_results, save_scan_results_with_spark_as_json

In [3]:
# Spark session with the Delta Lake SQL extension and catalog registered.
builder = (
    SparkSession.builder
    .appName("data-pipeline")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip comes from the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Object for data generation; its working directory is ./data, resolved to
# an absolute path.
datagen = FakerProfileDataSnapshot(
    data_dir=str(Path('./data').resolve())
)
# Clean up before the run so the notebook starts from an empty state.
# NOTE(review): judging by the helper names these remove the landed batches,
# the bronze table directory and earlier scan results — confirm in
# utils.data_generation.
datagen.delete_batches_in_landing_dir()
datagen.delete_bronze_dir()
datagen.delete_bronze_scan_results_dir()

Directory does ot exist.
/home/jovyan/work/data/bronze


## Parameters

In [5]:
# Options handed to the Spark CSV reader (all values are strings).
read_csv_options = dict(
    header='true',
    inferSchema='true',
    delimiter=',',
    quote='"',
    multiLine='true',
)

In [6]:
location_landing = datagen.landing_dir
location_bronze = datagen.bronze_dir
location_bronze_scan_results = datagen.bronze_scan_results_dir

In [7]:
# SodaCL definition that is later handed to the scan as a string. It declares
#   * checks on the `snapshot` table: freshness of `_delivery_date` relative
#     to the FAKE_NOW variable, a schema check, row count, duplicate,
#     missing-value and validity checks,
#   * table discovery for `snapshot`,
#   * column profiling for every column of `snapshot`.
# In the schema check a missing `id`, `mail` or `birthdate` column is a
# failure, whereas a missing `_last_modified_timestamp` or `_delivery_date`
# column only raises a warning.
sodacl_yaml_str = '''
checks for snapshot:
  - freshness(_delivery_date, FAKE_NOW) < 8d
  - schema:
      fail:
        when required column missing:
          - id
          - mail
          - birthdate
      warn:
        when required column missing: [_last_modified_timestamp, _delivery_date]
  - row_count > 0
  - duplicate_count(id) = 0
  - duplicate_count(mail) = 0
  - missing_count(id) = 0
  - invalid_count(mail) = 0:
      valid format: email
  - invalid_percent(birthdate) = 0:
      valid length: 10
discover tables:
  tables:
    - include snapshot
profile columns:
  columns:
    - snapshot.%
'''

## Processing

### Generate data - CSV

In [8]:
batch_id = 7

path_csv = datagen.land_batch(batch_id)
path_csv

'/home/jovyan/work/data/landing/snapshot_2022-09-30.csv'

### Create delta table 

In [9]:
# Read the landed CSV batch and derive `_delivery_date` from the file name,
# e.g. '.../snapshot_2022-09-30.csv' -> 2022-09-30 (parsed as a timestamp).
# The pattern is a raw string with an escaped dot so that only a literal
# '.csv' suffix matches; an unescaped '.' would accept any character there.
df = (
    spark.read.format('csv')
    .options(**read_csv_options)
    .load(str(path_csv))
    .withColumn(
        "_delivery_date",
        to_timestamp(
            regexp_extract(input_file_name(), r'.*_(.*)\.csv$', 1)
        ),
    )
)
df.toPandas()

Unnamed: 0,id,job,company,ssn,website,username,name,address,mail,birthdate,_delivery_date
0,0,Contractor,"Boone, Gallagher and Scott",508-98-7365,"['https://hall.info/', 'https://www.monroe-haw...",paula86,Misty Phillips,"4978 Chapman Bypass\nSanchezfurt, TN 23177",ana51@yahoo.com,1996-12-21,2022-09-30
1,1,Rural practice surveyor,Osborne PLC,861-51-6071,"['https://kelley.net/', 'http://www.herring-ca...",andreawhite,Alfred Hall,"11070 Wright Creek Apt. 541\nEast Jonathan, TN...",shawn57@yahoo.com,1922-03-06,2022-09-30
2,3,Newspaper journalist,Gordon-Smith,627-52-4610,['http://www.jackson.com/'],matthew14,Kyle Randall,"98150 Jones Way Apt. 251\nJonesside, OK 11215",michael49@hotmail.com,1922-03-12,2022-09-30
3,4,"Programmer, systems",Rodriguez-Williams,303-02-0239,"['https://www.pearson.com/', 'http://carlson.c...",kdavis,Sally Davis,"4052 Sparks Prairie\nJohnville, RI 76389",sanchezandrew@gmail.com,1961-04-26,2022-09-30
4,6,"Surveyor, rural practice",Jackson-Davis,498-01-2910,"['https://evans.info/', 'http://www.choi.com/']",simpsonphillip,Keith Todd,"280 Lindsey Road Apt. 412\nSouth Melissabury, ...",umartin@yahoo.com,1923-10-30,2022-09-30
5,9,"Surveyor, commercial/residential",Kelly-Singh,402-72-6549,"['http://www.english.com/', 'https://wilson.co...",rpeterson,Kelsey Ramsey,"36377 Christensen Fort\nPort Justin, OR 51565",shannonscott@yahoo.com,1987-06-05,2022-09-30
6,12,Secondary school teacher,Hernandez Ltd,334-42-9390,['http://walker.biz/'],harriscarl,Veronica Maldonado,"32860 Yoder Oval\nJeffreyborough, NM 31331",xchavez@hotmail.com,1932-05-02,2022-09-30
7,13,Race relations officer,"Jones, Randall and Cherry",418-26-0136,"['https://turner-hunter.org/', 'http://www.fie...",hbryant,Samuel Mack,"2024 Hodge Radial\nStefaniechester, NY 38154",bvazquez@yahoo.com,2012-05-20,2022-09-30
8,14,"Radiographer, diagnostic",Baldwin Group,412-92-6966,"['https://shah.net/', 'https://www.cohen-white...",gphillips,Anita Price,"97126 Gary Glens Suite 847\nPort Katherine, NM...",toddjessica@yahoo.com,2003-02-07,2022-09-30
9,15,"Geologist, engineering",Fernandez Group,308-98-1651,"['https://morgan.com/', 'https://www.gordon.co...",robert00,Courtney Gonzalez,"34549 Johnson Isle\nDonaldview, WI 19280",reedchristopher@gmail.com,1947-06-19,2022-09-30


In [10]:
df.write.format('delta').option('overwriteSchema', 'true').mode('overwrite').save(location_bronze)

In [11]:
spark.sql('DROP TABLE IF EXISTS snapshot')
spark.sql(f'CREATE TABLE IF NOT EXISTS snapshot USING DELTA LOCATION "{location_bronze}"')

DataFrame[]

In [12]:
# Show the Delta transaction log of the table (no interpolation needed, so a
# plain string literal is used instead of an f-string).
spark.sql('DESCRIBE HISTORY snapshot').toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2022-08-21 16:36:46.840,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '14', 'numOutputBytes': '626...",,Apache-Spark/3.2.1 Delta-Lake/2.0.0


### Run soda scan

... with checks, discovery, profiling.

In [13]:
# Configure a programmatic Soda scan on the Spark session. The data source
# is registered under the name 'snapshot'.
scan = Scan()
scan.disable_telemetry()
scan.set_data_source_name('snapshot')
scan.add_spark_session(spark, 'snapshot')

# Pin "now" for the freshness check via the FAKE_NOW variable referenced in
# the SodaCL string. 2022-10-01 is one day after the delivery date of the
# landed batch (2022-09-30), so `freshness(_delivery_date, FAKE_NOW) < 8d`
# passes; pick a date 8 or more days after the delivery date to see it fail.
fake_now = datetime.date.fromisoformat('2022-10-01')
scan.add_variables({"FAKE_NOW": f"{fake_now} 00:00:00"})

scan.add_sodacl_yaml_str(sodacl_yaml_str=sodacl_yaml_str)

# Execute inside an ipywidgets Output context so that anything the scan
# prints is captured by the widget instead of being rendered below the cell.
output = Output()
with output:
    exit_code = scan.execute()
exit_code

1

## Soda scan results

### Create soda cloud scan result data structure 

Source of the code below:
* discussion in PR 1406 (https://github.com/sodadata/soda-core/pull/1406)
* implementation: https://github.com/sodadata/soda-core/blob/b4f3d692fd912cc4fc4964c293debd09deb09551/soda/core/soda/scan.py

Be aware of the caveat (cited from PR discussion):

*Exposing this data is a good idea. But it should be done in a dedicated data structure. I see you have recycled the soda cloud data structure, which is not suitable for this use case.*

[...]

*It's not intended for external usage. We cannot give any stability guarantees on it.*

[...]

*The better long term solution is introducing stable scan results data classes. That's something that we'll put on our roadmap.*


In [14]:
scan_results = build_scan_results(scan)
scan_results.keys()



### Investigate scan results

#### Investigate non-list entries 

In [15]:
# Print every top-level entry of the scan result whose value is not a list.
# isinstance() is the idiomatic type test (and also covers list subclasses).
for key, value in scan_results.items():
    if not isinstance(value, list):
        print('*' * 40)
        print(key)
        print('-' * 20)
        print(value)


****************************************
definitionName
--------------------
None
****************************************
defaultDataSource
--------------------
snapshot
****************************************
dataTimestamp
--------------------
2022-08-21T16:36:57+00:00
****************************************
scanStartTimestamp
--------------------
2022-08-21T16:36:57+00:00
****************************************
scanEndTimestamp
--------------------
2022-08-21T16:37:40+00:00
****************************************
hasErrors
--------------------
False
****************************************
hasWarnings
--------------------
True
****************************************
hasFailures
--------------------
False


#### Investigate list entries 

In [16]:
# Keys of the scan result whose values are lists (inspected one by one below).
[key for key, value in scan_results.items() if isinstance(value, list)]

['metrics',
 'checks',
 'automatedMonitoringChecks',
 'profiling',
 'metadata',
 'logs']

#### metrics

In [17]:
pd.DataFrame(scan_results['metrics'])

Unnamed: 0,identity,metricName,value,dataSourceName,tableName,partitionName,columnName
0,metric-snapshot-snapshot-row_count,row_count,14,,,,
1,metric-snapshot-snapshot-schema,schema,"[{'columnName': 'id', 'sourceDataType': 'int'}...",snapshot,snapshot,,
2,metric-snapshot-snapshot-id-duplicate_count,duplicate_count,0,,,,
3,metric-snapshot-snapshot-birthdate-invalid_cou...,invalid_count,0,,,,
4,metric-snapshot-snapshot-mail-duplicate_count,duplicate_count,0,,,,
5,metric-snapshot-snapshot-mail-invalid_count-61...,invalid_count,0,,,,
6,metric-snapshot-snapshot-birthdate-invalid_per...,invalid_percent,0.0,,,,
7,metric-snapshot-snapshot-_delivery_date-max,max,2022-09-30T00:00:00+00:00,,,,
8,metric-snapshot-snapshot-id-missing_count,missing_count,0,,,,


#### checks

In [18]:
pd.DataFrame(scan_results['checks'])

Unnamed: 0,identity,name,type,definition,location,dataSource,table,column,metrics,outcome,diagnostics
0,2ed46382,Schema Check,schema,checks for snapshot:\n - schema:\n fail:...,"{'filePath': 'sodacl_string.yml', 'line': 4, '...",snapshot,snapshot,,[metric-snapshot-snapshot-schema],warn,"{'schema': [{'columnName': 'id', 'sourceDataTy..."
1,0c97eed4,row_count > 0,metricThreshold,checks for snapshot:\n row_count > 0,"{'filePath': 'sodacl_string.yml', 'line': 12, ...",snapshot,snapshot,,[metric-snapshot-snapshot-row_count],pass,"{'value': 14, 'fail': {'lessThanOrEqual': 0.0}}"
2,7dbf4d67,"freshness(_delivery_date, FAKE_NOW) < 8d",metricThreshold,checks for snapshot:\n freshness(_delivery_da...,"{'filePath': 'sodacl_string.yml', 'line': 3, '...",snapshot,snapshot,_delivery_date,[metric-snapshot-snapshot-_delivery_date-max],pass,"{'value': 0, 'maxColumnTimestamp': '2022-09-30..."
3,6a2f657f,duplicate_count(id) = 0,metricThreshold,checks for snapshot:\n duplicate_count(id) = 0,"{'filePath': 'sodacl_string.yml', 'line': 13, ...",snapshot,snapshot,id,[metric-snapshot-snapshot-id-duplicate_count],pass,"{'value': 0, 'fail': {'greaterThan': 0.0, 'les..."
4,33f7df58,missing_count(id) = 0,metricThreshold,checks for snapshot:\n missing_count(id) = 0,"{'filePath': 'sodacl_string.yml', 'line': 15, ...",snapshot,snapshot,id,[metric-snapshot-snapshot-id-missing_count],pass,"{'value': 0, 'fail': {'greaterThan': 0.0, 'les..."
5,bd933496,duplicate_count(mail) = 0,metricThreshold,checks for snapshot:\n duplicate_count(mail) = 0,"{'filePath': 'sodacl_string.yml', 'line': 14, ...",snapshot,snapshot,mail,[metric-snapshot-snapshot-mail-duplicate_count],pass,"{'value': 0, 'fail': {'greaterThan': 0.0, 'les..."
6,55ab369a,invalid_count(mail) = 0,metricThreshold,checks for snapshot:\n - invalid_count(mail) ...,"{'filePath': 'sodacl_string.yml', 'line': 16, ...",snapshot,snapshot,mail,[metric-snapshot-snapshot-mail-invalid_count-6...,pass,"{'value': 0, 'fail': {'greaterThan': 0.0, 'les..."
7,450977ba,invalid_percent(birthdate) = 0,metricThreshold,checks for snapshot:\n - invalid_percent(birt...,"{'filePath': 'sodacl_string.yml', 'line': 18, ...",snapshot,snapshot,birthdate,[metric-snapshot-snapshot-birthdate-invalid_pe...,pass,"{'value': 0.0, 'fail': {'greaterThan': 0.0, 'l..."


In [19]:
scan_results['checks'][0]['location']

{'filePath': 'sodacl_string.yml', 'line': 4, 'col': 5}

In [20]:
pd.json_normalize(pd.DataFrame(scan_results['checks'])['location'])

Unnamed: 0,filePath,line,col
0,sodacl_string.yml,4,5
1,sodacl_string.yml,12,5
2,sodacl_string.yml,3,5
3,sodacl_string.yml,13,5
4,sodacl_string.yml,15,5
5,sodacl_string.yml,14,5
6,sodacl_string.yml,16,5
7,sodacl_string.yml,18,5


In [21]:
scan_results['checks'][0]['diagnostics']

{'schema': [{'columnName': 'id', 'sourceDataType': 'int'},
  {'columnName': 'job', 'sourceDataType': 'string'},
  {'columnName': 'company', 'sourceDataType': 'string'},
  {'columnName': 'ssn', 'sourceDataType': 'string'},
  {'columnName': 'website', 'sourceDataType': 'string'},
  {'columnName': 'username', 'sourceDataType': 'string'},
  {'columnName': 'name', 'sourceDataType': 'string'},
  {'columnName': 'address', 'sourceDataType': 'string'},
  {'columnName': 'mail', 'sourceDataType': 'string'},
  {'columnName': 'birthdate', 'sourceDataType': 'string'},
  {'columnName': '_delivery_date', 'sourceDataType': 'timestamp'},
  {'columnName': '', 'sourceDataType': ''},
  {'columnName': '# Partitioning', 'sourceDataType': ''},
  {'columnName': 'Not partitioned', 'sourceDataType': ''}]}

In [22]:
pd.json_normalize(pd.DataFrame(scan_results['checks'])['diagnostics'])

Unnamed: 0,schema,value,fail.lessThanOrEqual,maxColumnTimestamp,maxColumnTimestampUtc,nowVariableName,nowTimestamp,nowTimestampUtc,freshness,fail.greaterThan,fail.lessThan
0,"[{'columnName': 'id', 'sourceDataType': 'int'}...",,,,,,,,,,
1,,14.0,0.0,,,,,,,,
2,,0.0,,2022-09-30 00:00:00,2022-09-30 00:00:00+00:00,FAKE_NOW,2022-10-01 00:00:00,2022-10-01 00:00:00+00:00,"1 day, 0:00:00",,
3,,0.0,,,,,,,,0.0,0.0
4,,0.0,,,,,,,,0.0,0.0
5,,0.0,,,,,,,,0.0,0.0
6,,0.0,,,,,,,,0.0,0.0
7,,0.0,,,,,,,,0.0,0.0


#### automatedMonitoringChecks

In [23]:
pd.DataFrame(scan_results['automatedMonitoringChecks'])

#### profiling

In [24]:
pd.DataFrame(scan_results['profiling'])

Unnamed: 0,table,dataSource,columnProfiles
0,snapshot,snapshot,"[{'columnName': 'id', 'profile': {'mins': [0.0..."


In [25]:
pd.DataFrame(scan_results['profiling'][0]['columnProfiles'])

Unnamed: 0,columnName,profile
0,id,"{'mins': [0.0, 1.0, 3.0, 4.0, 6.0], 'maxs': [1..."
1,job,"{'mins': None, 'maxs': None, 'min': None, 'max..."
2,company,"{'mins': None, 'maxs': None, 'min': None, 'max..."
3,ssn,"{'mins': None, 'maxs': None, 'min': None, 'max..."
4,website,"{'mins': None, 'maxs': None, 'min': None, 'max..."
5,username,"{'mins': None, 'maxs': None, 'min': None, 'max..."
6,name,"{'mins': None, 'maxs': None, 'min': None, 'max..."
7,address,"{'mins': None, 'maxs': None, 'min': None, 'max..."
8,mail,"{'mins': None, 'maxs': None, 'min': None, 'max..."
9,birthdate,"{'mins': None, 'maxs': None, 'min': None, 'max..."


In [26]:
def long_dataframe_from_profiling_results(profiling_list: list) -> pd.DataFrame:
    """Flatten profiling results into one long DataFrame.

    Each element of ``profiling_list`` is a dict describing one table. Its
    ``'columnProfiles'`` entry is a list of dicts with a ``'columnName'`` and
    a (possibly nested) ``'profile'`` dict. The result has one row per
    (table, column, metric): all table-level keys except ``'columnProfiles'``,
    followed by ``columnName``, ``metricName`` and ``value``. Nested profile
    entries are flattened with dotted names (e.g. ``histogram.boundaries``).
    The row index is the position of the column within its table.

    Args:
        profiling_list: List of per-table profiling dicts.

    Returns:
        The long-format DataFrame. When there is nothing to flatten (empty
        list, or no table has column profiles) an empty DataFrame with the
        expected columns is returned instead of raising. Tables without
        column profiles contribute no rows.
    """
    metric_columns = ['columnName', 'metricName', 'value']
    df_tables = pd.DataFrame(profiling_list)

    per_table_frames = []
    # enumerate() yields the same 0..n-1 ids as the default index of df_tables.
    for table_id, table_profile in enumerate(profiling_list):
        column_profiles = table_profile.get('columnProfiles')
        if not column_profiles:
            # Nothing profiled for this table; building a frame would fail.
            continue
        df_col = pd.DataFrame(column_profiles)
        # One row per (column, metric); ignore_index=False keeps the column
        # position as index so it can be joined back to the column names.
        df_met = pd.json_normalize(df_col['profile'].tolist()).melt(
            var_name='metricName', ignore_index=False
        )
        per_table_frames.append(
            pd.merge(
                df_col.drop(columns='profile').assign(table_id=table_id),
                df_met,
                left_index=True, right_index=True,
            )
        )

    if not per_table_frames:
        table_columns = [
            name for name in df_tables.columns if name != 'columnProfiles'
        ] or ['table', 'dataSource']
        return pd.DataFrame(columns=[*table_columns, *metric_columns])

    df_all_metrics = pd.concat(per_table_frames)
    # Attach the table-level attributes via the temporary table_id key.
    return pd.merge(
        df_tables.drop(columns='columnProfiles'),
        df_all_metrics,
        left_index=True, right_on='table_id',
    ).drop(columns='table_id')
    
df_profiling = long_dataframe_from_profiling_results(scan_results['profiling'])
print(df_profiling['metricName'].unique())
df_profiling

['mins' 'maxs' 'min' 'max' 'frequent_values' 'avg' 'sum' 'stddev'
 'variance' 'distinct' 'missing_count' 'avg_length' 'min_length'
 'max_length' 'histogram.boundaries' 'histogram.frequencies' 'histogram']


Unnamed: 0,table,dataSource,columnName,metricName,value
0,snapshot,snapshot,id,mins,"[0.0, 1.0, 3.0, 4.0, 6.0]"
0,snapshot,snapshot,id,maxs,"[19.0, 18.0, 17.0, 16.0, 15.0]"
0,snapshot,snapshot,id,min,0.0
0,snapshot,snapshot,id,max,19.0
0,snapshot,snapshot,id,frequent_values,"[{'value': '12', 'frequency': 1}, {'value': '1..."
...,...,...,...,...,...
9,snapshot,snapshot,birthdate,min_length,10.0
9,snapshot,snapshot,birthdate,max_length,10.0
9,snapshot,snapshot,birthdate,histogram.boundaries,
9,snapshot,snapshot,birthdate,histogram.frequencies,


In [27]:
df_profiling.query('metricName == "missing_count"')

Unnamed: 0,table,dataSource,columnName,metricName,value
0,snapshot,snapshot,id,missing_count,0
1,snapshot,snapshot,job,missing_count,0
2,snapshot,snapshot,company,missing_count,0
3,snapshot,snapshot,ssn,missing_count,0
4,snapshot,snapshot,website,missing_count,0
5,snapshot,snapshot,username,missing_count,0
6,snapshot,snapshot,name,missing_count,0
7,snapshot,snapshot,address,missing_count,0
8,snapshot,snapshot,mail,missing_count,0
9,snapshot,snapshot,birthdate,missing_count,0


#### metadata

In [28]:
pd.DataFrame(scan_results['metadata'])

Unnamed: 0,table,dataSource,rowCount,schema
0,snapshot,snapshot,14,"[{'columnName': 'id', 'sourceDataType': 'int'}..."


In [29]:
scan_results['metadata'][0]['schema']

[{'columnName': 'id', 'sourceDataType': 'int'},
 {'columnName': 'job', 'sourceDataType': 'string'},
 {'columnName': 'company', 'sourceDataType': 'string'},
 {'columnName': 'ssn', 'sourceDataType': 'string'},
 {'columnName': 'website', 'sourceDataType': 'string'},
 {'columnName': 'username', 'sourceDataType': 'string'},
 {'columnName': 'name', 'sourceDataType': 'string'},
 {'columnName': 'address', 'sourceDataType': 'string'},
 {'columnName': 'mail', 'sourceDataType': 'string'},
 {'columnName': 'birthdate', 'sourceDataType': 'string'},
 {'columnName': '_delivery_date', 'sourceDataType': 'timestamp'},
 {'columnName': '', 'sourceDataType': ''},
 {'columnName': '# Partitioning', 'sourceDataType': ''},
 {'columnName': 'Not partitioned', 'sourceDataType': ''}]

In [30]:
pd.DataFrame(scan_results['metadata'][0]['schema'])

Unnamed: 0,columnName,sourceDataType
0,id,int
1,job,string
2,company,string
3,ssn,string
4,website,string
5,username,string
6,name,string
7,address,string
8,mail,string
9,birthdate,string


#### logs

In [31]:
pd.DataFrame(scan_results['logs'])

Unnamed: 0,level,message,timestamp,index
0,info,Soda Core 3.0.4,2022-08-21T16:36:57+00:00,1
1,info,Running discover datasets for data source: sna...,2022-08-21T16:37:07+00:00,2
2,info,Discovering the following tables:,2022-08-21T16:37:07+00:00,3
3,info,- snapshot,2022-08-21T16:37:07+00:00,4
4,info,Running column profiling for data source: snap...,2022-08-21T16:37:08+00:00,5
5,info,Profiling columns for the following tables:,2022-08-21T16:37:08+00:00,6
6,info,- snapshot,2022-08-21T16:37:08+00:00,7
7,info,Scan summary:,2022-08-21T16:37:40+00:00,8
8,info,7/8 checks PASSED:,2022-08-21T16:37:40+00:00,9
9,info,snapshot in snapshot,2022-08-21T16:37:40+00:00,10


### Save and read with Spark

#### Save to json with Spark

In [36]:
# also implemented in utils.soda_utils.save_scan_results_with_spark_as_json
# Serialize the complete scan result dict to a single JSON string, wrap it in
# a one-element RDD and write it as text below the scan-results location,
# using the batch id in the target name.
# NOTE(review): saveAsTextFile is expected to create a directory with that
# name (containing part files) and to fail if it already exists — confirm
# against the Spark documentation before re-running for the same batch_id.
rdd = spark.sparkContext.parallelize([JsonHelper.to_json(scan_results)])
rdd.coalesce(1).saveAsTextFile(location_bronze_scan_results + f'/scan_result_{batch_id}.json')

#### Read json with Spark

In [37]:
sdf_sr = spark.read.json(location_bronze_scan_results + f'/scan_result_{batch_id}.json')
sdf_sr.printSchema()

root
 |-- automatedMonitoringChecks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- checks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- column: string (nullable = true)
 |    |    |-- dataSource: string (nullable = true)
 |    |    |-- definition: string (nullable = true)
 |    |    |-- diagnostics: struct (nullable = true)
 |    |    |    |-- fail: struct (nullable = true)
 |    |    |    |    |-- greaterThan: double (nullable = true)
 |    |    |    |    |-- lessThan: double (nullable = true)
 |    |    |    |    |-- lessThanOrEqual: double (nullable = true)
 |    |    |    |-- freshness: string (nullable = true)
 |    |    |    |-- maxColumnTimestamp: string (nullable = true)
 |    |    |    |-- maxColumnTimestampUtc: string (nullable = true)
 |    |    |    |-- nowTimestamp: string (nullable = true)
 |    |    |    |-- nowTimestampUtc: string (nullable = true)
 |    |    |    |-- nowVariableName: string (nulla

### Print complete scan result dictionary

In [38]:
print(JsonHelper.to_json_pretty(scan_results))

{
  "definitionName": null,
  "defaultDataSource": "snapshot",
  "dataTimestamp": "2022-08-21T16:36:57+00:00",
  "scanStartTimestamp": "2022-08-21T16:36:57+00:00",
  "scanEndTimestamp": "2022-08-21T16:37:40+00:00",
  "hasErrors": false,
  "hasFailures": false,
  "metrics": [
    {
      "identity": "metric-snapshot-snapshot-row_count",
      "metricName": "row_count",
      "value": 14
    },
    {
      "identity": "metric-snapshot-snapshot-schema",
      "metricName": "schema",
      "dataSourceName": "snapshot",
      "tableName": "snapshot",
      "partitionName": null,
      "columnName": null,
      "value": [
        {
          "columnName": "id",
          "sourceDataType": "int"
        },
        {
          "columnName": "job",
          "sourceDataType": "string"
        },
        {
          "columnName": "company",
          "sourceDataType": "string"
        },
        {
          "columnName": "ssn",
          "sourceDataType": "string"
        },
        {
          