In [8]:
import great_expectations as ge
import pandas as pd
import yaml
from great_expectations.core.expectation_configuration import ExpectationConfiguration

# INIT
# Create the Great Expectations data context used by the cells below.
context = ge.get_context()

# LOAD CONFIG
# Read the suite parameters from ge_config.yaml.
# NOTE(review): yaml.load with FullLoader can build some Python-specific
# objects; if the file only holds plain scalars/lists/dicts, yaml.safe_load
# is the safer choice.
with open("ge_config.yaml", "r") as yamlfile:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    conf_data = yaml.load(yamlfile, Loader=yaml.FullLoader)

# conf_data[0] holds one section per suite, looked up by suite key.
scrapping_suit_conf = conf_data[0]['scrapping_suit']
ean_suit_conf       = conf_data[0]['ean_suit']

In [None]:
# Dataset: SCRAPPING
# Load the example CSV into pandas, then register a pandas datasource and a
# dataframe asset for it in the Great Expectations context.
# (Brand segmentation of this frame happens in the warning-suite cell.)
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
scrapping_csv_path = 'C:\\Users\\Wilber\\Downloads\\datasetCrystal06102023.csv'
pand = pd.read_csv(
    scrapping_csv_path,
    sep=',',
    encoding='latin1',
)
scrapping_ds = context.sources.add_or_update_pandas(name='scrapping')
scrapping_src_asset = scrapping_ds.add_dataframe_asset("scrapping_df")


In [None]:
# EXPECTATION SUITE: EXCEPT - NOTIFY validation
scrapping_suite_except = context.add_or_update_expectation_suite(
    expectation_suite_name="scrapping_suit_exception"
)

# Column names the scrapping table is compared against (exact match).
scrapping_expected_columns = [
    "date", "canal", "category", "subcategory", "subcategory2",
    "subcategory3", "marca", "modelo", "sku", "upc", "item",
    "item characteristics", "url sku", "image", "price", "sale price",
    "shipment cost", "sales flag", "store id", "store name",
    "store address", "stock", "upc wm", "final price", "upc wm2", "comp",
    "composition", "homogenized_clothing", "homogenized_subcategory",
    "homogenized_category", "homogenized_color", "made_in",
]

# Column names: the table columns must match the expected set exactly.
scrapping_column_names = ExpectationConfiguration(
    expectation_type="expect_table_columns_to_match_set",
    kwargs=dict(
        column_set=scrapping_expected_columns,
        exact_match=True,
        result_format="SUMMARY",
    ),
    meta=dict(
        notes=dict(
            format="markdown",
            content="Las columnas del archivo de Scrapping no concuerdan con las esperadas",
        )
    ),
)

# Price: no null values allowed in the 'price' column.
scrapping_price_notnull = ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs=dict(
        column="price",
        result_format="SUMMARY",
    ),
    meta=dict(
        notes=dict(
            format="markdown",
            content="Algunos precios del archivo de Scrapping no están presentes",
        )
    ),
)

# Register both expectations (same order as defined) and persist the suite.
for scrapping_except_expectation in (scrapping_column_names, scrapping_price_notnull):
    scrapping_suite_except.add_expectation(expectation_configuration=scrapping_except_expectation)
context.save_expectation_suite(scrapping_suite_except)

In [None]:
# EXPECTATION SUITE: WARNING - CONTINUE
scrapping_suite_warn = context.add_or_update_expectation_suite(expectation_suite_name="scrapping_suit_warning")


def _scrapping_not_null(column):
    """Build a not-null expectation for one scrapping column.

    Args:
        column: Name of the column whose values must not be null.

    Returns:
        An ExpectationConfiguration of type
        ``expect_column_values_to_not_be_null`` with SUMMARY result format and
        a markdown note naming the column.
    """
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column":        column,
            "result_format": "SUMMARY"
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": f"Algunos elementos de la columna '{column}' no están presentes"
            }
        }
    )


# homogenized_clothing NOT NULL
# (column name as listed in the scrapping column-set expectation; there is no
# plain "clothing" column in that set)
scrapping_clothing_notnull = _scrapping_not_null("homogenized_clothing")

# homogenized_category NOT NULL
scrapping_homogenized_category_notnull = _scrapping_not_null("homogenized_category")

# homogenized_subcategory NOT NULL
scrapping_homogenized_subcategory_notnull = _scrapping_not_null("homogenized_subcategory")

# homogenized_color NOT NULL
scrapping_homogenized_color_notnull = _scrapping_not_null("homogenized_color")

# marca: the number of distinct brands must fall inside the configured range
# (bounds are read from scrapping_suit_conf).
scrapping_marca_unique = ExpectationConfiguration(
    expectation_type="expect_column_unique_value_count_to_be_between",
    kwargs={
        "column":        "marca",
        "min_value":     scrapping_suit_conf['marca_column_number_min'],
        "max_value":     scrapping_suit_conf['marca_column_number_max'],
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "La cantidad de marcas distintas en la columna 'marca' no se encuentra en el rango esperado"
        }
    }
)

# Final price must lie inside a brand-dependent range.
# Zara: 12,000 - 4,000,000
scrapping_zara_price_notnull = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={
        "column":        "final price",
        "min_value":     12000,
        "max_value":     4000000,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "Para la marca Zara, algunos precios no se encuentran en el rango esperado."
        }
    }
)

# Other brands: 12,000 - 1,300,000
scrapping_otras_price_notnull = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={
        "column":        "final price",
        "min_value":     12000,
        "max_value":     1300000,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "Algunos precios no se encuentran en el rango esperado."
        }
    }
)

# Brand segmentation: one batch with the Zara rows, one with every other row.
# A dataframe asset receives its data through `dataframe=`; batch `options`
# are not supported for this asset type.
# NOTE(review): the filter column 'user' is not part of the expected scrapping
# column set -- confirm it exists in the file (or filter on 'marca' instead).
scrapping_zara_batch = scrapping_src_asset.build_batch_request(dataframe=pand[pand['user'] == 'zara'])
scrapping_norm_batch = scrapping_src_asset.build_batch_request(dataframe=pand[pand['user'] != 'zara'])

# Add expectations and persist the suite.
# scrapping_zara_price_notnull is not added to this suite.
# NOTE(review): it presumably belongs to a validation run on scrapping_zara_batch only -- confirm.
for scrapping_warn_expectation in (
    scrapping_clothing_notnull,
    scrapping_homogenized_category_notnull,
    scrapping_homogenized_subcategory_notnull,
    scrapping_homogenized_color_notnull,
    scrapping_marca_unique,
    scrapping_otras_price_notnull,
):
    scrapping_suite_warn.add_expectation(scrapping_warn_expectation)
context.save_expectation_suite(scrapping_suite_warn)

In [None]:
# Dataset: Marcas propias (own brands)
# Load the example CSV, then register a pandas datasource and dataframe asset.
# The frame is stored under its own name rather than the generic `pand`, so the
# scrapping frame that the brand-segmentation batches are built from is not
# overwritten when this cell runs.
marcprop_pand      = pd.read_csv('C:\\Users\\Wilber\\Downloads\\marcprop.csv', sep=',', encoding='latin1')
marcprop_ds        = context.sources.add_or_update_pandas(name='marcprop')
marcprop_src_asset = marcprop_ds.add_dataframe_asset("marcprop_df")



In [None]:
# WARNING - CONTINUE
marcprop_suite_warn = context.add_or_update_expectation_suite(
    expectation_suite_name="marcprop_suit_warning"
)

# Column name -> note attached to that column's not-null expectation.
# NOTE(review): 'prendasGenerales' was marked as uncertain ("??????") for the
# garment-type check -- confirm the column name.
marcprop_notnull_notes = {
    "categoria":        "Algunos elementos de la columna 'categoria' no están presentes",
    "use":              "Algunos elementos de la columna 'use' no están presentes",
    "prendasGenerales": "Algunos elementos de la columna 'prendasGenerales' no están presentes",
}

# Category / use / garment type NOT NULL, built in dictionary order.
(
    marcprop_categoria_notnull,
    marcprop_use_notnull,
    marcprop_tipo_prenda_notnull,
) = (
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": column_name, "result_format": "SUMMARY"},
        meta={"notes": {"format": "markdown", "content": note_text}},
    )
    for column_name, note_text in marcprop_notnull_notes.items()
)

# Register the three expectations and persist the suite.
for marcprop_expectation in (
    marcprop_categoria_notnull,
    marcprop_use_notnull,
    marcprop_tipo_prenda_notnull,
):
    marcprop_suite_warn.add_expectation(marcprop_expectation)
context.save_expectation_suite(marcprop_suite_warn)


In [None]:
# Dataset: Equivalencias (homologaciones)
# Load the example CSV, then register a pandas datasource and dataframe asset.
homologacion_pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\homologaciones.csv',
    sep=',',
    encoding='latin1',
)
homologacion_ds = context.sources.add_or_update_pandas(name='homologacion')
homologacion_src_asset = homologacion_ds.add_dataframe_asset("homologacion_df")


In [None]:
# WARNING - CONTINUE
homologacion_suite_warn   = context.add_or_update_expectation_suite(expectation_suite_name = "homologacion_suit_warning")

# Marca: the number of distinct brands must fall inside the configured range.
# NOTE(review): the bounds are read from scrapping_suit_conf (the scrapping
# section of the configuration); confirm that sharing them with this dataset
# is intended.
homologacion_marca_join = ExpectationConfiguration(
    expectation_type="expect_column_unique_value_count_to_be_between",
    kwargs={
        "column":        "Marca",
        "min_value":     scrapping_suit_conf['marca_column_number_min'],
        "max_value":     scrapping_suit_conf['marca_column_number_max'],
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            # The note describes the check actually performed (distinct-count range).
            "content": "La cantidad de marcas distintas en la columna 'Marca' no se encuentra en el rango esperado"
        }
    }
)
homologacion_suite_warn.add_expectation(homologacion_marca_join)
context.save_expectation_suite(homologacion_suite_warn)

In [None]:
# Dataset: Orden Tallas (size ordering)
# Load the CSV, then register a pandas datasource and dataframe asset.
ordentallas_pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\OrdenTallasScrapping.csv',
    sep=',',
    encoding='latin1',
)
ordentallas_ds = context.sources.add_or_update_pandas(name='ordentallas')
ordentallas_src_asset = ordentallas_ds.add_dataframe_asset("ordentallas_df")

# Auxiliary set: distinct values of the 'Marca' column of the homologaciones frame.
marcas_set = homologacion_pand["Marca"].unique()


In [None]:
# WARNING - CONTINUE
# JOIN: Marca (Orden Tallas) against Marca (homologaciones)

ordentallas_suite_warn   = context.add_or_update_expectation_suite(expectation_suite_name = "ordentallas_suit_warning")

# Every 'Marca' value must be one of the brands collected in marcas_set.
ordentallas_marca_join = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={
        "column":        "Marca",
        # The expectation's parameter is spelled `value_set`.
        "value_set":     marcas_set,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "Algunos elementos de la columna 'marca' no están presentes en el archivo de Orden Tallas"
        }
    }
)
ordentallas_suite_warn.add_expectation(ordentallas_marca_join)
context.save_expectation_suite(ordentallas_suite_warn)

In [None]:
# Dataset: EAN
# Load the CSV, then register a pandas datasource and dataframe asset.
ean_pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\ean.csv',
    sep=',',
    encoding='latin1',
)
ean_ds = context.sources.add_or_update_pandas(name='ean')
ean_src_asset = ean_ds.add_dataframe_asset("ean_df")

# Auxiliary set: distinct values of the 'EAN' column.
ean_set = ean_pand["EAN"].unique()


In [None]:
# WARNING - CONTINUE
# Expect 3000-4000 lines.
# NOTE(review): both bounds below use the same 'current_ean' setting, so the
# distinct-EAN count must equal it exactly -- confirm against the
# 3000-4000 range mentioned above.
ean_suite_warn = context.add_or_update_expectation_suite(
    expectation_suite_name="ean_suit_warning"
)

# Distinct count of 'EAN' bounded on both sides by the configured value.
ean_unique = ExpectationConfiguration(
    expectation_type="expect_column_unique_value_count_to_be_between",
    kwargs=dict(
        column="EAN",
        min_value=ean_suit_conf['current_ean'],
        max_value=ean_suit_conf['current_ean'],
        result_format="SUMMARY",
    ),
    meta=dict(
        notes=dict(
            format="markdown",
            content="La cantidad de productos presentes excede la esperada",
        )
    ),
)
ean_suite_warn.add_expectation(ean_unique)
context.save_expectation_suite(ean_suite_warn)

In [None]:
# Dataset: Atributos
# Load the CSV, then register a pandas datasource and dataframe asset.
# Note: `atributos_df` holds the datasource returned by the context, not a DataFrame.
atributos_pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\atributos.csv',
    sep=',',
    encoding='latin1',
)
atributos_df = context.sources.add_or_update_pandas(name='atributos')
atributos_src_asset = atributos_df.add_dataframe_asset("atributos_df")



In [None]:
# WARNING - CONTINUE
# JOIN: REFERENCIA (Atributos) against EAN
atributos_suite_warn   = context.add_or_update_expectation_suite(expectation_suite_name = "atributos_suite_warn")

######################################
# THIS VALIDATION HAS A KNOWN PROBLEM: REVIEW IN THE WORKFLOW HOW THE JOIN IS DONE
######################################
# NOTE(review): the values are checked against ean_set (distinct 'EAN' values
# of the EAN file), in line with the join described above and with the note
# text; confirm that REFERENCIA is really comparable to the EAN column.

atributos_join = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={
        "column":        "REFERENCIA",
        # The expectation's parameter is spelled `value_set`.
        "value_set":     ean_set,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "Algunos registros del archivo de Atributos no cruzan con el archivo EAN"
        }
    }
)
atributos_suite_warn.add_expectation(atributos_join)
context.save_expectation_suite(atributos_suite_warn)

In [None]:
# Dataset: Tallas Agotadas (sold-out sizes)
# Load the CSV, then register a pandas datasource and dataframe asset.
# Note: `agotados_df` holds the datasource returned by the context, not a DataFrame.
agotados_pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\tallas_agotadas.csv',
    sep=',',
    encoding='latin1',
)
agotados_df = context.sources.add_or_update_pandas(name='agotados')
agotados_src_asset = agotados_df.add_dataframe_asset("agotados_df")


In [None]:
# WARNING - CONTINUE
# JOIN: PARTNUMBER (Tallas Agotadas) against EAN
agotados_suite_warn   = context.add_or_update_expectation_suite(expectation_suite_name = "agotados_suite_warn")

# Every 'PARTNUMBER' value must be one of the EANs collected in ean_set.
agotados_join = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={
        "column":        "PARTNUMBER",
        # The expectation's parameter is spelled `value_set`.
        "value_set":     ean_set,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "El archivo agotados posee EANs que no cruzan con archivo EAN"
        }
    }
)
agotados_suite_warn.add_expectation(agotados_join)
context.save_expectation_suite(agotados_suite_warn)

In [None]:
# Dataset: Precios (prices)
# Load the CSV, then register a pandas datasource and dataframe asset.
# The datasource is registered under its own name, 'precios', so that
# add_or_update does not replace the 'agotados' datasource.
# Note: `precios_df` holds the datasource returned by the context, not a DataFrame.
precios_pand      = pd.read_csv('C:\\Users\\Wilber\\Downloads\\precios.csv', sep=',', encoding='latin1')
precios_df        = context.sources.add_or_update_pandas(name='precios')
precios_src_asset = precios_df.add_dataframe_asset("precios_df")


In [None]:
# WARNING - CONTINUE
# JOIN: EAN (Precios) against EAN
precios_suite_warn   = context.add_or_update_expectation_suite(expectation_suite_name = "precios_suite_warn")

# Every 'EAN' value must be one of the EANs collected in ean_set.
precios_join = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={
        "column":        "EAN",
        # The expectation's parameter is spelled `value_set`.
        "value_set":     ean_set,
        "result_format": "SUMMARY"
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "El archivo precios posee EANs que no cruzan con archivo EAN"
        }
    }
)
precios_suite_warn.add_expectation(precios_join)
context.save_expectation_suite(precios_suite_warn)

In [None]:
#DataSet: Homologaciones Color
#LOAD DATASET

In [None]:
#WARNING - CONTINUE
#JOIN Color - Atributos(Color)

In [None]:
# Dataset: EAN Complementos
# Load the CSV, then register a pandas datasource and dataframe asset.
# The datasource is registered under its own name, 'ean_comp', so that
# add_or_update does not replace the 'ean' datasource.
ean_comp_pand      = pd.read_csv('C:\\Users\\Wilber\\Downloads\\complemento.csv', sep=',', encoding='latin1')
ean_comp_ds        = context.sources.add_or_update_pandas(name='ean_comp')
ean_comp_src_asset = ean_comp_ds.add_dataframe_asset("ean_comp_df")


In [None]:
#WARNING - CONTINUE
#JOIN EAN COMPLEMENTOS

In [None]:
# TEST CELL
# Run the homologacion warning suite against the homologaciones frame.
# The batch is built from homologacion_pand (the frame loaded for this asset),
# passed explicitly as `dataframe=`.
homologacion_batch = homologacion_src_asset.build_batch_request(dataframe=homologacion_pand)

# Validator bound to the same batch and suite, kept for interactive inspection.
validator = context.get_validator(
    batch_request=homologacion_batch,
    expectation_suite_name="homologacion_suit_warning",
)

# Checkpoint that validates the batch against the suite.
homologacion_checkpoint = context.add_or_update_checkpoint(
    name="homologacion_checkpoint",
    validations=[
        {
            "batch_request": homologacion_batch,
            "expectation_suite_name": "homologacion_suit_warning",
        },
    ],
)
homologacion_checkpoint_result = homologacion_checkpoint.run()
# Legacy alias: the result was previously stored under this name.
scrapping_checkpoint_result = homologacion_checkpoint_result
