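# Test Delta Lake and Soda

Smoke tests that write a Delta table with Spark and then run data-quality scans with soda-spark and soda-core-spark-df.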

## Test delta

In [1]:
from pyspark.sql import SparkSession
# Import the one helper this notebook uses by name rather than `from delta import *`,
# so it is clear where the name comes from and the kernel namespace stays clean.
from delta import configure_spark_with_delta_pip

# Target location of the smoke-test Delta table; it is overwritten on every run.
DELTA_TABLE_PATH = "/home/jovyan/work/temp/delta-table"

# The two settings below are only needed when they are not already set up in
# the Dockerfile; if so, chain them onto the builder:
#   .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
#   .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
builder = SparkSession.builder.appName("MyApp")

# `spark` is the session shared by every later cell in this notebook.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Write a tiny range DataFrame in Delta format to prove the Delta writer works.
data = spark.range(0, 5)
data.write.format("delta").mode("overwrite").save(DELTA_TABLE_PATH)


## Test soda-spark

In [2]:
from sodaspark import scan


# Both demo rows share the same UUID value. The variable is called `record_id`
# (not `id`) so the builtin `id()` is not shadowed for the rest of the session.
record_id = "a76824f0-50c0-11eb-8be8-88e9fe6293fd"
df = spark.createDataFrame([
    {"id": record_id, "name": "Paula Landry", "size": 3006},
    {"id": record_id, "name": "Kevin Crawford", "size": 7243},
])

# Scan definition in YAML: table-level metrics and a row-count test, plus a
# UUID format check on the `id` column that must yield zero invalid values.
scan_definition = """
    table_name: demodata
    metrics:
    - row_count
    - max
    - min_length
    tests:
    - row_count > 0
    columns:
      id:
        valid_format: uuid
        tests:
        - invalid_percentage == 0
    """

# Run the scan against the in-memory DataFrame.
scan_result = scan.execute(scan_definition, df)

# Show the collected measurements as the cell output.
scan_result.measurements

[Measurement(metric='schema', column_name=None, value=[{'name': 'id', 'type': 'string', 'dataType': 'string', 'nullable': True, 'logicalType': 'text', 'semanticType': 'text'}, {'name': 'name', 'type': 'string', 'dataType': 'string', 'nullable': True, 'logicalType': 'text', 'semanticType': 'text'}, {'name': 'size', 'type': 'bigint', 'dataType': 'bigint', 'nullable': True, 'logicalType': 'number', 'semanticType': 'number'}], group_values=None),
 Measurement(metric='row_count', column_name=None, value=2, group_values=None),
 Measurement(metric='values_count', column_name='id', value=2, group_values=None),
 Measurement(metric='valid_count', column_name='id', value=2, group_values=None),
 Measurement(metric='min_length', column_name='id', value=36, group_values=None),
 Measurement(metric='min_length', column_name='name', value=12, group_values=None),
 Measurement(metric='max', column_name='size', value=7243, group_values=None),
 Measurement(metric='missing_percentage', column_name='id', val

In [3]:
# List the tables known to the current Spark session and render the result
# as a pandas DataFrame for notebook display.
(
    spark
    .sql('show tables')
    .toPandas()
)

Unnamed: 0,namespace,tableName,isTemporary
0,,demodata,True


In [4]:
# Read back every row of the `demodata` table and render it as a pandas
# DataFrame for notebook display.
(
    spark
    .sql('select * from demodata')
    .toPandas()
)

Unnamed: 0,id,name,size
0,a76824f0-50c0-11eb-8be8-88e9fe6293fd,Paula Landry,3006
1,a76824f0-50c0-11eb-8be8-88e9fe6293fd,Kevin Crawford,7243


## Test soda-core-spark-df

In [5]:
from soda.scan import Scan

Overriding of current TracerProvider is not allowed


In [6]:
# Name shared by the Soda data source and the Spark session registration.
data_source_name = "demodata"

# SodaCL checks to evaluate: the `demodata` dataset must contain at least one row.
sodacl_checks = """
checks for demodata:
  - row_count > 0
"""

# Build and configure the scan, keeping the original call order.
scan = Scan()
scan.disable_telemetry()
scan.set_data_source_name(data_source_name)
scan.add_spark_session(spark, data_source_name)
scan.add_sodacl_yaml_str(sodacl_checks)

# Execute the scan and show its exit code as the cell output.
exit_code = scan.execute()
exit_code

root
 |-- count(1): long (nullable = false)

+--------+
|count(1)|
+--------+
|       2|
+--------+



0