# dqResearch

In [1]:
!pip install dataquality_bnr

Looking in indexes: http://artifactory.produbanbr.corp/artifactory/api/pypi/pypi-all/simple
Collecting dataquality_bnr
  Downloading http://artifactory.produbanbr.corp/artifactory/api/pypi/pypi-all/dataquality-bnr/0.3.5/dataquality_bnr-0.3.5-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 25.4 MB/s eta 0:00:01
Installing collected packages: dataquality-bnr
Successfully installed dataquality-bnr-0.3.5
You should consider upgrading via the '/opt/miniconda/bin/python3 -m pip install --upgrade pip' command.[0m


### Set up a PySpark session
A biblioteca foi construída para ser utilizada com __PySpark__ e possibilitar '*testes unitários dos dados*', executando validações qualitativas em datasets de larga escala.<br>
A integração entre a sessão Spark e a biblioteca depende apenas de duas configurações adicionais:


In [2]:
from pyspark.sql import functions as F

In [10]:
# Import only the name that is inspected here; a wildcard import would pull
# every public name of pydeequ.verification into the notebook namespace.
from pydeequ.verification import VerificationSuite
VerificationSuite

pydeequ.verification.VerificationSuite

In [3]:
from pyspark.sql import SparkSession, Row
from dataquality_bnr.dqSupport import main as dqSup

# Create (or reuse) the Spark session, adding the two extra settings supplied
# by dataquality_bnr: the Deequ jar path and the jar excludes.
spark = (
    SparkSession.builder
    .config("spark.jars", dqSup.getDeequJar_path())
    .config("spark.jars.excludes", dqSup.getDeequJar_excludes())
    .getOrCreate()
)

## Load Dataset

In [74]:
# Query text: nine columns of table th.thbpd381, restricted to the rows whose
# dat_ref_carga equals '2022-01-03'.
sql_query = """
select
    i1c_renda_final,
    i1c_lim_pre_ap_preventivo,
    i1c_rating_riscos,
    i1d_idade,
    i1d_sexo,
    i1c_cli_possui_conta,
    i1c_soc_cd_segm_empr1,
    i1c_soc_cd_ramo_atvd1,
    dat_ref_carga
from th.thbpd381 where dat_ref_carga='2022-01-03'
"""
# Run the query on the Spark session; df_input is the DataFrame researched in
# the rest of the notebook.
df_input = spark.sql(sql_query)

In [75]:
# Print the column names, types and nullability of the loaded DataFrame.
df_input.printSchema()

root
 |-- i1c_renda_final: integer (nullable = true)
 |-- i1c_lim_pre_ap_preventivo: integer (nullable = true)
 |-- i1c_rating_riscos: integer (nullable = true)
 |-- i1d_idade: integer (nullable = true)
 |-- i1d_sexo: string (nullable = true)
 |-- i1c_cli_possui_conta: string (nullable = true)
 |-- i1c_soc_cd_segm_empr1: integer (nullable = true)
 |-- i1c_soc_cd_ramo_atvd1: integer (nullable = true)
 |-- dat_ref_carga: string (nullable = true)



## Research on data
with __dqRes.runProfile()__ and __dqRes.runConstraintSuggestion()__

In [64]:
from dataquality_bnr.dqResearch import main as dqRes

### runProfile()

O processo interno de Profiling depende da passagem de um diretório para execução e do nome de um arquivo .json que será gerado como saída. Os resultados em sua forma "bruta" serão armazenados temporariamente nesse json, e a lib dataquality_bnr trará uma abstração acessível desses resultados em um dataframe de saída.

In [68]:
# Path (directory + .json file name) handed to runProfile for its raw results.
# NOTE(review): this value looks like a placeholder path — confirm it points to
# a location that is usable in your environment before running.
metrics_file = "/tmp/arquivo/para/processamento/interno/file.json"

In [69]:
# Profile df_input; the result is bound to profile_df, and metrics_file is the
# .json path passed in for the profiler's raw output.
profile_df = dqRes.runProfile(spark, df_input, metrics_file)

In [70]:
# Preview the profiling result: one row per (column, metric) pair with its
# value and the research date, as the output below shows.
profile_df.show()

+--------------------+--------------------+-------------------+-------------+
|              column|              metric|              value|research_date|
+--------------------+--------------------+-------------------+-------------+
|i1c_lim_pre_ap_pr...|        Completeness|                1.0|   2022-02-02|
|i1c_lim_pre_ap_pr...| ApproxCountDistinct|                2.0|   2022-02-02|
|i1c_lim_pre_ap_pr...|             Minimum|                0.0|   2022-02-02|
|i1c_lim_pre_ap_pr...|             Maximum|                2.0|   2022-02-02|
|i1c_lim_pre_ap_pr...|                Mean|0.01650938167414294|   2022-02-02|
|i1c_lim_pre_ap_pr...|   StandardDeviation|0.18095912153031535|   2022-02-02|
|i1c_lim_pre_ap_pr...|                 Sum|             8936.0|   2022-02-02|
|           i1d_idade|        Completeness|                1.0|   2022-02-02|
|           i1d_idade| ApproxCountDistinct|              125.0|   2022-02-02|
|           i1d_idade|             Minimum|                0.0| 

##### Save as .xlsx

In [47]:
# Target .xlsx file for the profiling result (relative path).
# NOTE(review): presumably the excelFiles/ directory must already exist — confirm.
file_name="excelFiles/profileExcel_df.xlsx"
# Export profile_df to that Excel file through the dqSupport helper.
dqSup.convertToExcel(spark, profile_df, excel_writer=file_name)

##### Save as .parquet

In [48]:
# Build the parquet destination for the profiling result: a fixed base
# directory plus the dataset name. Note that file_name is rebound here.
file_name="profile_df"
profile_parquet_path="/user/x266727/casoDeUso/dqResearch/thbpd381/"+file_name

In [49]:
# Persist the profiling result as parquet, overwriting anything already
# stored at profile_parquet_path.
profile_df.write.mode("overwrite").parquet(profile_parquet_path)

### runConstraintSuggestion()

In [50]:
# Run the constraint-suggestion step on df_input; the result is bound to
# suggestions_df.
suggestions_df = dqRes.runConstraintSuggestion(spark, df_input)

In [51]:
# Show only the suggested constraint code, without truncating long values.
(
    suggestions_df
    .select("code_for_constraint")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------+
|code_for_constraint                                                                                            |
+---------------------------------------------------------------------------------------------------------------+
|.isComplete("i1d_idade")                                                                                       |
|.isNonNegative("i1d_idade")                                                                                    |
|.isContainedIn("i1c_lim_pre_ap_preventivo", ["0", "2"])                                                        |
|.isComplete("i1c_lim_pre_ap_preventivo")                                                                       |
|.isContainedIn("i1c_lim_pre_ap_preventivo", ["0"], lambda x: x >= 0.99, "It should be above 0.99!")            |
|.isNonNegative("i1c_lim_pre_ap_preventivo")                                            

##### Save as .xlsx

In [54]:
# Target .xlsx file for the constraint suggestions (relative path); rebinds
# file_name.
file_name="excelFiles/suggestionsExcel_df.xlsx"

In [55]:
# Export suggestions_df to the Excel file named by file_name through the
# dqSupport helper.
dqSup.convertToExcel(spark, suggestions_df, excel_writer=file_name)

##### Save as .parquet

In [4]:
# Build the parquet destination for the constraint suggestions: the same base
# directory used for the profile plus the dataset name. Rebinds file_name.
file_name="suggestions_df"
sugg_parquet_path="/user/x266727/casoDeUso/dqResearch/thbpd381/"+file_name

In [57]:
# Persist the suggestions as parquet, overwriting anything already stored at
# sugg_parquet_path.
suggestions_df.write.mode("overwrite").parquet(sugg_parquet_path)

## Diving deep...
Load researched data back from storage and analyze it closely

In [5]:
# Reload both research results from their parquet paths; profile_df and
# suggestions_df are rebound to the stored copies from here on.
profile_df = spark.read.parquet(profile_parquet_path)
suggestions_df = spark.read.parquet(sugg_parquet_path)

In [79]:
# Print the input schema again as a reference for the per-column analysis
# that follows.
df_input.printSchema()

root
 |-- i1c_renda_final: integer (nullable = true)
 |-- i1c_lim_pre_ap_preventivo: integer (nullable = true)
 |-- i1c_rating_riscos: integer (nullable = true)
 |-- i1d_idade: integer (nullable = true)
 |-- i1d_sexo: string (nullable = true)
 |-- i1c_cli_possui_conta: string (nullable = true)
 |-- i1c_soc_cd_segm_empr1: integer (nullable = true)
 |-- i1c_soc_cd_ramo_atvd1: integer (nullable = true)
 |-- dat_ref_carga: string (nullable = true)



##### root

In [80]:
# Metrics stored under the "*" column entry (the output shows the dataset Size).
(
    profile_df
    .where(F.col("column") == "*")
    .show(truncate=False)
)

+------+------+--------+-------------+
|column|metric|value   |research_date|
+------+------+--------+-------------+
|*     |Size  |541268.0|2022-02-02   |
+------+------+--------+-------------+



##### |-- i1c_renda_final: integer (nullable = true)

In [81]:
# Profiling metrics computed for i1c_renda_final.
(
    profile_df
    .where(F.col("column") == "i1c_renda_final")
    .show(truncate=False)
)

+---------------+-------------------+-----------------+-------------+
|column         |metric             |value            |research_date|
+---------------+-------------------+-----------------+-------------+
|i1c_renda_final|Completeness       |1.0              |2022-02-02   |
|i1c_renda_final|ApproxCountDistinct|23551.0          |2022-02-02   |
|i1c_renda_final|Minimum            |0.0              |2022-02-02   |
|i1c_renda_final|Maximum            |9.9999999E7      |2022-02-02   |
|i1c_renda_final|Mean               |9662.858364063643|2022-02-02   |
|i1c_renda_final|StandardDeviation  |725924.6431629129|2022-02-02   |
|i1c_renda_final|Sum                |5.230196021E9    |2022-02-02   |
+---------------+-------------------+-----------------+-------------+



In [82]:
# Constraint suggestions generated for i1c_renda_final.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_renda_final")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------+---------------+
|code_for_constraint              |column_name    |
+---------------------------------+---------------+
|.isComplete("i1c_renda_final")   |i1c_renda_final|
|.isNonNegative("i1c_renda_final")|i1c_renda_final|
+---------------------------------+---------------+



##### |-- i1c_lim_pre_ap_preventivo: integer (nullable = true)

In [83]:
# Profiling metrics computed for i1c_lim_pre_ap_preventivo.
(
    profile_df
    .where(F.col("column") == "i1c_lim_pre_ap_preventivo")
    .show(truncate=False)
)

+-------------------------+-------------------+-------------------+-------------+
|column                   |metric             |value              |research_date|
+-------------------------+-------------------+-------------------+-------------+
|i1c_lim_pre_ap_preventivo|Completeness       |1.0                |2022-02-02   |
|i1c_lim_pre_ap_preventivo|ApproxCountDistinct|2.0                |2022-02-02   |
|i1c_lim_pre_ap_preventivo|Minimum            |0.0                |2022-02-02   |
|i1c_lim_pre_ap_preventivo|Maximum            |2.0                |2022-02-02   |
|i1c_lim_pre_ap_preventivo|Mean               |0.01650938167414294|2022-02-02   |
|i1c_lim_pre_ap_preventivo|StandardDeviation  |0.18095912153031535|2022-02-02   |
|i1c_lim_pre_ap_preventivo|Sum                |8936.0             |2022-02-02   |
|i1c_lim_pre_ap_preventivo|Histogram.bins     |2.0                |2022-02-02   |
|i1c_lim_pre_ap_preventivo|Histogram.abs.2    |4468.0             |2022-02-02   |
|i1c_lim_pre_ap_

In [84]:
# Constraint suggestions generated for i1c_lim_pre_ap_preventivo.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_lim_pre_ap_preventivo")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------+-------------------------+
|code_for_constraint                                                                                |column_name              |
+---------------------------------------------------------------------------------------------------+-------------------------+
|.isContainedIn("i1c_lim_pre_ap_preventivo", ["0", "2"])                                            |i1c_lim_pre_ap_preventivo|
|.isComplete("i1c_lim_pre_ap_preventivo")                                                           |i1c_lim_pre_ap_preventivo|
|.isContainedIn("i1c_lim_pre_ap_preventivo", ["0"], lambda x: x >= 0.99, "It should be above 0.99!")|i1c_lim_pre_ap_preventivo|
|.isNonNegative("i1c_lim_pre_ap_preventivo")                                                        |i1c_lim_pre_ap_preventivo|
+---------------------------------------------------------------------------------------------------+---

##### |-- i1c_rating_riscos: integer (nullable = true)

In [85]:
# Profiling metrics computed for i1c_rating_riscos.
(
    profile_df
    .where(F.col("column") == "i1c_rating_riscos")
    .show(truncate=False)
)

+-----------------+-----------------+--------------------+-------------+
|column           |metric           |value               |research_date|
+-----------------+-----------------+--------------------+-------------+
|i1c_rating_riscos|Histogram.bins   |9.0                 |2022-02-02   |
|i1c_rating_riscos|Histogram.abs.8  |15510.0             |2022-02-02   |
|i1c_rating_riscos|Histogram.ratio.8|0.028654936186879698|2022-02-02   |
|i1c_rating_riscos|Histogram.abs.4  |7235.0              |2022-02-02   |
|i1c_rating_riscos|Histogram.ratio.4|0.013366761013028666|2022-02-02   |
|i1c_rating_riscos|Histogram.abs.9  |19293.0             |2022-02-02   |
|i1c_rating_riscos|Histogram.ratio.9|0.03564408019687105 |2022-02-02   |
|i1c_rating_riscos|Histogram.abs.5  |3.0                 |2022-02-02   |
|i1c_rating_riscos|Histogram.ratio.5|5.542540848526054E-6|2022-02-02   |
|i1c_rating_riscos|Histogram.abs.6  |12866.0             |2022-02-02   |
|i1c_rating_riscos|Histogram.ratio.6|0.023770110185

In [86]:
# Constraint suggestions generated for i1c_rating_riscos.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_rating_riscos")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------+-----------------+
|code_for_constraint                                                                                            |column_name      |
+---------------------------------------------------------------------------------------------------------------+-----------------+
|.isContainedIn("i1c_rating_riscos", ["0", "1", "2", "7", "9", "8", "6", "4", "5"])                             |i1c_rating_riscos|
|.isComplete("i1c_rating_riscos")                                                                               |i1c_rating_riscos|
|.isContainedIn("i1c_rating_riscos", ["0", "1", "2", "7", "9"], lambda x: x >= 0.93, "It should be above 0.93!")|i1c_rating_riscos|
|.isNonNegative("i1c_rating_riscos")                                                                            |i1c_rating_riscos|
+---------------------------------------------------------------------------

##### |-- i1d_idade: integer (nullable = true)

In [87]:
# Profiling metrics computed for i1d_idade.
(
    profile_df
    .where(F.col("column") == "i1d_idade")
    .show(truncate=False)
)

+---------+-------------------+------------------+-------------+
|column   |metric             |value             |research_date|
+---------+-------------------+------------------+-------------+
|i1d_idade|Completeness       |1.0               |2022-02-02   |
|i1d_idade|ApproxCountDistinct|125.0             |2022-02-02   |
|i1d_idade|Minimum            |0.0               |2022-02-02   |
|i1d_idade|Maximum            |124.0             |2022-02-02   |
|i1d_idade|Mean               |53.9417829984407  |2022-02-02   |
|i1d_idade|StandardDeviation  |14.860442431353526|2022-02-02   |
|i1d_idade|Sum                |2.9196961E7       |2022-02-02   |
+---------+-------------------+------------------+-------------+



In [88]:
# Constraint suggestions generated for i1d_idade.
(
    suggestions_df
    .where(F.col("column_name") == "i1d_idade")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------+-----------+
|code_for_constraint        |column_name|
+---------------------------+-----------+
|.isComplete("i1d_idade")   |i1d_idade  |
|.isNonNegative("i1d_idade")|i1d_idade  |
+---------------------------+-----------+



##### |-- i1d_sexo: string (nullable = true)

In [89]:
# Profiling metrics computed for i1d_sexo.
(
    profile_df
    .where(F.col("column") == "i1d_sexo")
    .show(truncate=False)
)

+--------+--------------------------+---------------------+-------------+
|column  |metric                    |value                |research_date|
+--------+--------------------------+---------------------+-------------+
|i1d_sexo|Histogram.bins            |3.0                  |2022-02-02   |
|i1d_sexo|Histogram.abs.M           |329963.0             |2022-02-02   |
|i1d_sexo|Histogram.ratio.M         |0.6096111353340674   |2022-02-02   |
|i1d_sexo|Histogram.abs.            |856.0                |2022-02-02   |
|i1d_sexo|Histogram.ratio.          |0.0015814716554461007|2022-02-02   |
|i1d_sexo|Histogram.abs.F           |210449.0             |2022-02-02   |
|i1d_sexo|Histogram.ratio.F         |0.3888073930104865   |2022-02-02   |
|i1d_sexo|Completeness              |1.0                  |2022-02-02   |
|i1d_sexo|ApproxCountDistinct       |3.0                  |2022-02-02   |
|i1d_sexo|Histogram.bins            |5.0                  |2022-02-02   |
|i1d_sexo|Histogram.abs.Boolean     |0

In [90]:
# Constraint suggestions generated for i1d_sexo.
(
    suggestions_df
    .where(F.col("column_name") == "i1d_sexo")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------+-----------+
|code_for_constraint                                                                    |column_name|
+---------------------------------------------------------------------------------------+-----------+
|.isContainedIn("i1d_sexo", ["M", "F", " "])                                            |i1d_sexo   |
|.isComplete("i1d_sexo")                                                                |i1d_sexo   |
|.isContainedIn("i1d_sexo", ["M", "F"], lambda x: x >= 0.99, "It should be above 0.99!")|i1d_sexo   |
+---------------------------------------------------------------------------------------+-----------+



##### |-- i1c_cli_possui_conta: string (nullable = true)

In [91]:
# Profiling metrics computed for i1c_cli_possui_conta.
(
    profile_df
    .where(F.col("column") == "i1c_cli_possui_conta")
    .show(truncate=False)
)

+--------------------+--------------------------+-------------------+-------------+
|column              |metric                    |value              |research_date|
+--------------------+--------------------------+-------------------+-------------+
|i1c_cli_possui_conta|Histogram.bins            |2.0                |2022-02-02   |
|i1c_cli_possui_conta|Histogram.abs.1           |335955.0           |2022-02-02   |
|i1c_cli_possui_conta|Histogram.ratio.1         |0.6206814369221901 |2022-02-02   |
|i1c_cli_possui_conta|Histogram.abs.0           |205313.0           |2022-02-02   |
|i1c_cli_possui_conta|Histogram.ratio.0         |0.37931856307780987|2022-02-02   |
|i1c_cli_possui_conta|Completeness              |1.0                |2022-02-02   |
|i1c_cli_possui_conta|ApproxCountDistinct       |2.0                |2022-02-02   |
|i1c_cli_possui_conta|Histogram.bins            |5.0                |2022-02-02   |
|i1c_cli_possui_conta|Histogram.abs.Boolean     |0.0                |2022-02

In [92]:
# Constraint suggestions generated for i1c_cli_possui_conta.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_cli_possui_conta")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------------------------------------------+--------------------+
|code_for_constraint                                                  |column_name         |
+---------------------------------------------------------------------+--------------------+
|.isContainedIn("i1c_cli_possui_conta", ["1", "0"])                   |i1c_cli_possui_conta|
|.isComplete("i1c_cli_possui_conta")                                  |i1c_cli_possui_conta|
|.isNonNegative("i1c_cli_possui_conta")                               |i1c_cli_possui_conta|
|.hasDataType("i1c_cli_possui_conta", ConstrainableDataTypes.Integral)|i1c_cli_possui_conta|
+---------------------------------------------------------------------+--------------------+



##### |-- i1c_soc_cd_segm_empr1: integer (nullable = true)

In [93]:
# Profiling metrics computed for i1c_soc_cd_segm_empr1.
(
    profile_df
    .where(F.col("column") == "i1c_soc_cd_segm_empr1")
    .show(truncate=False)
)

+---------------------+-------------------+---------------------+-------------+
|column               |metric             |value                |research_date|
+---------------------+-------------------+---------------------+-------------+
|i1c_soc_cd_segm_empr1|Histogram.ratio.274|3.6950272323507024E-6|2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.abs.21   |171.0                |2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.ratio.21 |3.1592482836598505E-4|2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.abs.228  |1211.0               |2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.ratio.228|0.0022373389891883503|2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.abs.138  |3.0                  |2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.ratio.138|5.542540848526054E-6 |2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.abs.213  |2.0                  |2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.ratio.213|3.6950272323507024E-6|2022-02-02   |
|i1c_soc_cd_segm_empr1|Histogram.abs.137

In [94]:
# Constraint suggestions generated for i1c_soc_cd_segm_empr1.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_soc_cd_segm_empr1")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|code_for_constraint                                                                                                                                                                                                                                                                                                             |column_name          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### |-- i1c_soc_cd_ramo_atvd1: integer (nullable = true)

In [95]:
# Profiling metrics computed for i1c_soc_cd_ramo_atvd1.
(
    profile_df
    .where(F.col("column") == "i1c_soc_cd_ramo_atvd1")
    .show(truncate=False)
)

+---------------------+-------------------+-------------------+-------------+
|column               |metric             |value              |research_date|
+---------------------+-------------------+-------------------+-------------+
|i1c_soc_cd_ramo_atvd1|Completeness       |0.07817938618207616|2022-02-02   |
|i1c_soc_cd_ramo_atvd1|ApproxCountDistinct|1081.0             |2022-02-02   |
|i1c_soc_cd_ramo_atvd1|Minimum            |0.0                |2022-02-02   |
|i1c_soc_cd_ramo_atvd1|Maximum            |4420055.0          |2022-02-02   |
|i1c_soc_cd_ramo_atvd1|Mean               |2532201.787385386  |2022-02-02   |
|i1c_soc_cd_ramo_atvd1|StandardDeviation  |1393790.1626147425 |2022-02-02   |
|i1c_soc_cd_ramo_atvd1|Sum                |1.07152650835E11   |2022-02-02   |
+---------------------+-------------------+-------------------+-------------+



In [96]:
# Constraint suggestions generated for i1c_soc_cd_ramo_atvd1.
(
    suggestions_df
    .where(F.col("column_name") == "i1c_soc_cd_ramo_atvd1")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+---------------------------------------+---------------------+
|code_for_constraint                    |column_name          |
+---------------------------------------+---------------------+
|.isNonNegative("i1c_soc_cd_ramo_atvd1")|i1c_soc_cd_ramo_atvd1|
+---------------------------------------+---------------------+



##### |-- dat_ref_carga: string (nullable = true)

In [97]:
# Profiling metrics computed for dat_ref_carga.
(
    profile_df
    .where(F.col("column") == "dat_ref_carga")
    .show(truncate=False)
)

+-------------+--------------------------+--------+-------------+
|column       |metric                    |value   |research_date|
+-------------+--------------------------+--------+-------------+
|dat_ref_carga|Completeness              |1.0     |2022-02-02   |
|dat_ref_carga|ApproxCountDistinct       |1.0     |2022-02-02   |
|dat_ref_carga|Histogram.bins            |5.0     |2022-02-02   |
|dat_ref_carga|Histogram.abs.Boolean     |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.ratio.Boolean   |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.abs.Fractional  |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.ratio.Fractional|0.0     |2022-02-02   |
|dat_ref_carga|Histogram.abs.Integral    |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.ratio.Integral  |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.abs.Unknown     |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.ratio.Unknown   |0.0     |2022-02-02   |
|dat_ref_carga|Histogram.abs.String      |541268.0|2022-02-02   |
|dat_ref_c

In [98]:
# Constraint suggestions generated for dat_ref_carga.
(
    suggestions_df
    .where(F.col("column_name") == "dat_ref_carga")
    .select("code_for_constraint", "column_name")
    .show(truncate=False)
)

+-----------------------------------------------+-------------+
|code_for_constraint                            |column_name  |
+-----------------------------------------------+-------------+
|.isContainedIn("dat_ref_carga", ["2022-01-03"])|dat_ref_carga|
|.isComplete("dat_ref_carga")                   |dat_ref_carga|
+-----------------------------------------------+-------------+



## Build yaml...
All modules within dqResearch are used to analyze and understand the input data and to define which metrics are going to be used in the running processes.<br>
dqRunning modules run upon YAML files, and the YAML files are built upon dqResearch results.<br>
It's possible to __simulate the verificationSuite.yaml__ already with the resulting suggestions from dqResearch.

In [7]:
from dataquality_bnr.yamlHandler import verificationSuite as vsYaml

In [8]:
# Print a verificationSuite template (Check, ResultKey, Constraints) whose
# Constraints entries come from the rows of suggestions_df, as the output shows.
vsYaml.printTemplate(suggestions_df)


Check: {
    level: Error,
    description: 'CheckObject by yaml File'
}

ResultKey: {
    key_tags: {
        tag1: 1,
        tag2: 2,
        tag3: 3
    } 
}

Constraints: [
    addConstraint: 'isContainedIn("dat_ref_carga", ["2022-01-03"])',
    addConstraint: 'isComplete("dat_ref_carga")',
    addConstraint: 'isContainedIn("i1c_cli_possui_conta", ["1", "0"])',
    addConstraint: 'isComplete("i1c_cli_possui_conta")',
    addConstraint: 'isNonNegative("i1c_cli_possui_conta")',
    addConstraint: 'hasDataType("i1c_cli_possui_conta", ConstrainableDataTypes.Integral)',
    addConstraint: 'isContainedIn("i1c_lim_pre_ap_preventivo", ["0", "2"])',
    addConstraint: 'isComplete("i1c_lim_pre_ap_preventivo")',
    addConstraint: 'isContainedIn("i1c_lim_pre_ap_preventivo", ["0"], lambda x: x >= 0.99, "It should be above 0.99!")',
    addConstraint: 'isNonNegative("i1c_lim_pre_ap_preventivo")',
    addConstraint: 'isContainedIn("i1c_rating_riscos", ["0", "1", "2", "7", "9", "8", "6", "4", "

#### pydeequ shutdown_callback_server()
#### spark.stop()
__Importante!__
Após a execução dos jobs, garanta que a sessão __spark__ juntamente com o __callback_server__ sejam encerrados, evitando que qualquer processo "fantasma" fique pendurado.<br>
Leia mais sobre __Pydeequ__ e __callback_server__ em: https://github.com/awslabs/python-deequ

In [None]:
# Shut down the py4j callback server, then stop the Spark session.
# try/finally guarantees spark.stop() still runs if the callback-server
# shutdown raises, so no "ghost" Spark process is left hanging.
try:
    spark.sparkContext._gateway.shutdown_callback_server()
finally:
    spark.stop()