### Single Column Profiling Example

In [None]:
# Initial configuration before running Spark

In [1]:
import os
# Environment variables set before Spark / PyDeequ are imported.
# SPARK_VERSION is presumably what PyDeequ reads to choose the matching Deequ
# artifact (the resolver log shows deequ 2.0.3-spark-3.3) -- confirm against
# the PyDeequ docs. JAVA_HOME points at the Java 11 JDK on this image.
os.environ.update(
    {
        "SPARK_VERSION": "3.3",
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64/",
    }
)

In [2]:
pip install pydeequ==1.2.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install sagemaker_pyspark

Note: you may need to restart the kernel to use updated packages.


In [5]:
### Profiling example

In [9]:
import pydeequ

import sagemaker_pyspark
from pyspark.sql import SparkSession, Row
from pydeequ.profiles import *

In [6]:
# Colon-separated classpath built from the jars reported by sagemaker_pyspark
# (AWS-specific jars).
classpath = ":".join(sagemaker_pyspark.classpath_jars())

# Builder settings, applied in the order listed: driver classpath, the Deequ
# Maven coordinate to fetch, and the coordinate to exclude from resolution.
spark_settings = {
    "spark.driver.extraClassPath": classpath,
    "spark.jars.packages": pydeequ.deequ_maven_coord,
    "spark.jars.excludes": pydeequ.f2j_maven_coord,
}

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Returns the active session if one already exists, otherwise creates it.
spark = session_builder.getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/sagemaker-user/.ivy2/cache
The jars for the packages stored in: /home/sagemaker-user/.ivy2/jars
com.amazon.deequ#deequ added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3f2bfb93-39f0-4c33-81c6-5fa5eaec88fd;1.0
	confs: [default]
	found com.amazon.deequ#deequ;2.0.3-spark-3.3 in central
	found org.scala-lang#scala-reflect;2.12.10 in central
	found org.scalanlp#breeze_2.12;0.13.2 in central
	found org.scalanlp#breeze-macros_2.12;0.13.2 in central
	found com.github.fommil.netlib#core;1.1.2 in central
	found net.sf.opencsv#opencsv;2.3 in central
	found com.github.rwl#jtransforms;2.4.0 in central
	found junit#junit;4.8.2 in central
	found org.apache.commons#commons-math3;3.2 in central
	found org.spire-math#spire_2.12;0.13.0 in central
	found org.spire-math#spire-macros_2.12;0.13.0 in central
	found org.typelevel#machinist_2.12;0.6.1 in central
	found com.chuusai#shapeless_2.12;2.3.2 in central
	found org.typelevel#macro-co

24/09/09 02:08:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/09/09 02:08:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/09 02:08:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [7]:
# Tiny in-memory sample used for profiling. Every value is a string or None,
# so the DataFrame schema is all-string (see the printed schema) and the
# profiler has to infer the real datatypes.
sample_columns = ("productName", "totalNumber", "status", "valuable")
sample_records = [
    ("thingA", "13.0", "IN_TRANSIT", "true"),
    ("thingA", "5", "DELAYED", "false"),
    ("thingB", None, "DELAYED", None),
    ("thingC", None, "IN_TRANSIT", "false"),
    ("thingD", "1.0", "DELAYED", "true"),
    ("thingC", "7.0", "UNKNOWN", None),
    ("thingC", "20", "UNKNOWN", None),
    ("thingE", "20", "DELAYED", "false"),
]

# Build one Row per record (keyword fields in column order) and convert the
# resulting RDD to a DataFrame.
df = spark.sparkContext.parallelize(
    [Row(**dict(zip(sample_columns, record))) for record in sample_records]
).toDF()

                                                                                

In [8]:
# printSchema() writes the schema tree to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.printSchema()

root
 |-- productName: string (nullable = true)
 |-- totalNumber: string (nullable = true)
 |-- status: string (nullable = true)
 |-- valuable: string (nullable = true)

None


In [10]:
# Configure the column profiler on the sample DataFrame, then execute it.
# The returned object exposes per-column profiles via `result.profiles`.
column_profiler = ColumnProfilerRunner(spark).onData(df)
result = column_profiler.run()

24/09/09 02:09:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [11]:
# (label shown in the output, attribute read from the column profile)
reported_fields = (
    ("completeness", "completeness"),
    ("approximate number of distinct values", "approximateNumDistinctValues"),
    ("datatype", "dataType"),
)

# Print a short summary for every profiled column.
for column_name, column_profile in result.profiles.items():
    print(f"Column '{column_name}'")
    for label, attribute in reported_fields:
        print('\t', f"{label}: {getattr(column_profile, attribute)}")

Column 'productName'
	 completeness: 1.0
	 approximate number of distinct values: 5
	 datatype: String
Column 'totalNumber'
	 completeness: 0.75
	 approximate number of distinct values: 5
	 datatype: Fractional
Column 'status'
	 completeness: 1.0
	 approximate number of distinct values: 3
	 datatype: String
Column 'valuable'
	 completeness: 0.625
	 approximate number of distinct values: 2
	 datatype: Boolean


In [12]:
# Numeric statistics for the 'totalNumber' column profile.
totalNumber_profile = result.profiles['totalNumber']

# (label shown in the output, attribute read from the profile)
numeric_statistics = (
    ("minimum", "minimum"),
    ("maximum", "maximum"),
    ("mean", "mean"),
    ("standard deviation", "stdDev"),
)

print("Statistics of 'totalNumber':")
for label, attribute in numeric_statistics:
    print('\t', f"{label}: {getattr(totalNumber_profile, attribute)}")

Statistics of 'totalNumber':
	 minimum: 1.0
	 maximum: 20.0
	 mean: 11.0
	 standard deviation: 7.280109889280518


In [13]:
# Value distribution for the 'status' column: one line per histogram entry
# with its value, absolute count and ratio.
status_profile = result.profiles['status']

print("Value distribution in 'status':")
for bucket in status_profile.histogram:
    entry_value, occurrences, share = bucket.value, bucket.count, bucket.ratio
    print('\t', f"{entry_value} occurred {occurrences} times (ratio is {share})")

Value distribution in 'status':
	 IN_TRANSIT occurred 2 times (ratio is 0.25)
	 UNKNOWN occurred 2 times (ratio is 0.25)
	 DELAYED occurred 4 times (ratio is 0.5)
