In [None]:
# Notebook setup: imports, logging, environment and shared configuration.
# Statement order is kept as in the original cell because the path tweak and
# load_dotenv() run before the project imports.
import logging
import sys

# Adds the parent directory to the import path.
# NOTE(review): presumably so `centraal_dataframework` resolves from a local
# checkout -- confirm.
sys.path.append('../')

# Root logger at INFO so the tasks' logger.info(...) calls are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from dotenv import load_dotenv
load_dotenv()
import yaml

from centraal_dataframework.resources import datalake
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from centraal_dataframework.tasks import task_dq

# Shared bounds for the number of distinct brands, read by
# scrapping_validate_column_contents and homologaciones_validar_cantidad.
# It was referenced but never defined, so a fresh-kernel run raised NameError.
# Values come from the in-notebook note "marca 9 values (up to 11)".
# TODO: load these from the YAML configuration instead of hard-coding them.
scrapping_suit_conf = {
    "marca_column_number_min": 9,
    "marca_column_number_max": 11,
}


In [None]:
@task_dq
def scrapping_validate_prerequisites(datalake, gx_toolkit, logger):
    """Check the Scrapping file's column set and that ``price`` has no nulls.

    Reads ``cleansed-zone/scrapping.csv`` (``|``-separated) through
    ``datalake``, logs its first row, and submits two expectations to
    ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as the second
    argument. The value returned by that call is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    scrapping_df = datalake.read_csv("cleansed-zone/scrapping.csv", sep="|")
    logger.info(scrapping_df.head(1))
    logger.info("Validando prerrequisitos del archivo Scrapping...")

    # Exact set of column names the file must contain.
    expected_columns = [
        "date", "canal", "category", "subcategory", "subcategory2",
        "subcategory3", "marca", "modelo", "sku", "upc", "item",
        "item characteristics", "url sku", "image", "price", "sale price",
        "shipment cost", "sales flag", "store id", "store name",
        "store address", "stock", "upc wm", "final price", "upc wm2", "comp",
        "composition", "homogenized_clothing", "homogenized_subcategory",
        "homogenized_category", "homogenized_color", "made_in",
    ]

    # Expectation 1: the table columns match the expected set exactly.
    columns_kwargs = {
        "column_set": expected_columns,
        "exact_match": True,
        "result_format": "SUMMARY",
    }
    columns_meta = {
        "notes": {
            "format": "markdown",
            "content": "Las columnas del archivo de Scrapping no concuerdan con las esperadas",
        }
    }
    columns_expectation = ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_set",
        kwargs=columns_kwargs,
        meta=columns_meta,
    )

    # Expectation 2: every row has a price.
    price_kwargs = {"column": "price", "result_format": "SUMMARY"}
    price_meta = {
        "notes": {
            "format": "markdown",
            "content": "Algunos precios del archivo de Scrapping no están presentes",
        }
    }
    price_expectation = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs=price_kwargs,
        meta=price_meta,
    )

    # Run both expectations and print whatever the toolkit returns.
    report = gx_toolkit.run_expectations_on_df(
        scrapping_df, "test", [columns_expectation, price_expectation]
    )
    print("reporte de expectativas", report)

In [None]:
@task_dq
def scrapping_validate_column_contents(datalake, gx_toolkit, logger):
    """Validate the content of the homogenized columns and the brand count.

    Reads ``cleansed-zone/scrapping.csv`` (``|``-separated) through
    ``datalake``, logs its first row, and runs five expectations via
    ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as the second
    argument: four not-null checks on the ``homogenized_*`` columns and one
    distinct-value-count check on ``marca``. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.

    Note:
        The brand-count bounds are read from the module-level
        ``scrapping_suit_conf`` mapping (keys ``marca_column_number_min`` and
        ``marca_column_number_max``); it must be defined before the task runs.
    """
    source = datalake.read_csv("cleansed-zone/scrapping.csv", sep="|")
    logger.info(source.head(1))
    logger.info("Validando contenido de columnas del archivo Scrapping...")

    def not_null(column):
        """Build a not-null expectation for ``column`` with the standard note."""
        return ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={
                "column": column,
                "result_format": "SUMMARY",
            },
            meta={
                "notes": {
                    "format": "markdown",
                    "content": f"Algunos elementos de la columna '{column}' no están presentes",
                }
            },
        )

    # The first check used to target "clothing", which is not part of the
    # column set this notebook expects for scrapping.csv; the matching column
    # there is "homogenized_clothing".
    not_null_columns = (
        "homogenized_clothing",
        "homogenized_category",
        "homogenized_subcategory",
        "homogenized_color",
    )
    expectations = [not_null(column) for column in not_null_columns]

    # Number of distinct brands must fall inside the configured range.
    # The note previously repeated the 'homogenized_color' not-null message.
    scrapping_marca_unique = ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs={
            "column": "marca",
            "min_value": scrapping_suit_conf['marca_column_number_min'],
            "max_value": scrapping_suit_conf['marca_column_number_max'],
            "result_format": "SUMMARY",
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "La cantidad de valores distintos de la columna 'marca' no está en el rango esperado",
            }
        },
    )
    expectations.append(scrapping_marca_unique)

    url = gx_toolkit.run_expectations_on_df(source, "test", expectations)
    print("reporte de expectativas", url)

In [None]:
@task_dq
def scrapping_validate_prices(datalake, gx_toolkit, logger):
    """Validate that ``final price`` falls in the expected range per brand group.

    Reads ``cleansed-zone/scrapping.csv`` (``|``-separated) through
    ``datalake``, logs its first row, splits the rows into Zara and non-Zara
    groups, and runs one range expectation on each group via
    ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as the second
    argument. Each returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/scrapping.csv", sep="|")
    logger.info(source.head(1))
    logger.info("Validando precios del archivo Scrapping...")

    def price_range(min_value, max_value, message):
        """Build a 'final price' between-range expectation with ``message`` as note."""
        return ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": "final price",
                "min_value": min_value,
                "max_value": max_value,
                "result_format": "SUMMARY",
            },
            meta={
                "notes": {
                    "format": "markdown",
                    "content": message,
                }
            },
        )

    # Zara: 12,000 - 4,000,000
    scrapping_zara_price_range = price_range(
        12000,
        4000000,
        "Para la marca Zara, algunos precios no se encuentran en el rango esperado.",
    )

    # Every other brand: 12,000 - 1,300,000
    scrapping_otras_price_range = price_range(
        12000,
        1300000,
        "Algunos precios no se encuentran en el rango esperado.",
    )

    # NOTE(review): the filter uses a 'user' column, but the column set this
    # notebook expects for scrapping.csv has no 'user' (it has 'marca').
    # Confirm which column identifies the brand and what casing the values use.
    zara_rows = source[source['user'] == 'zara']
    other_rows = source[source['user'] != 'zara']

    url = gx_toolkit.run_expectations_on_df(zara_rows, "test", [scrapping_zara_price_range])
    print("reporte de expectativas", url)
    url = gx_toolkit.run_expectations_on_df(other_rows, "test", [scrapping_otras_price_range])
    print("reporte de expectativas", url)
    

In [None]:
@task_dq
def marcaspropias_validate_column_contents(datalake, gx_toolkit, logger):
    """Validate that key columns of the Marcas Propias file have no nulls.

    Reads ``cleansed-zone/marcas_propias.csv`` (``|``-separated) through
    ``datalake``, logs its first row, and runs not-null expectations on the
    ``categoria``, ``use`` and ``prendasGenerales`` columns via
    ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as the second
    argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/marcas_propias.csv", sep="|")
    logger.info(source.head(1))
    logger.info("Validando contenido de columnas del archivo Marcas Propias...")

    def not_null(column):
        """Build a not-null expectation for ``column`` with the standard note."""
        return ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={
                "column": column,
                "result_format": "SUMMARY",
            },
            meta={
                "notes": {
                    "format": "markdown",
                    "content": f"Algunos elementos de la columna '{column}' no están presentes",
                }
            },
        )

    # Category NOT NULL
    marcprop_categoria_notnull = not_null("categoria")

    # Use NOT NULL
    marcprop_use_notnull = not_null("use")

    # Garment type NOT NULL
    # NOTE(review): the original author marked 'prendasGenerales' with
    # question marks -- confirm this is the right column for garment type.
    marcprop_tipo_prenda_notnull = not_null("prendasGenerales")

    # The list was missing a comma between its last two elements, which made
    # the whole cell a SyntaxError.
    url = gx_toolkit.run_expectations_on_df(
        source,
        "test",
        [
            marcprop_categoria_notnull,
            marcprop_use_notnull,
            marcprop_tipo_prenda_notnull,
        ],
    )
    print("reporte de expectativas", url)


In [None]:
@task_dq
def homologaciones_validar_cantidad(datalake, gx_toolkit, logger):
    """Check the number of distinct ``Marca`` values in the homologaciones file.

    Reads ``cleansed-zone/homologaciones.csv`` (``|``-separated) through
    ``datalake``, logs its first row, and runs a single distinct-value-count
    expectation via ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as
    the second argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.

    Note:
        The bounds are read from the module-level ``scrapping_suit_conf``
        mapping (keys ``marca_column_number_min`` and
        ``marca_column_number_max``); it must be defined before the task runs.
    """
    homologaciones_df = datalake.read_csv("cleansed-zone/homologaciones.csv", sep="|")
    logger.info(homologaciones_df.head(1))
    logger.info("Validando cantidad de registros en homologaciones..")

    # Bounds for the distinct-brand count come from the shared configuration.
    count_kwargs = {
        "column": "Marca",
        "min_value": scrapping_suit_conf['marca_column_number_min'],
        "max_value": scrapping_suit_conf['marca_column_number_max'],
        "result_format": "SUMMARY",
    }
    failure_note = {
        "format": "markdown",
        "content": "Algunos elementos de la columna 'marca' no están presentes",
    }
    brand_count_expectation = ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs=count_kwargs,
        meta={"notes": failure_note},
    )

    report = gx_toolkit.run_expectations_on_df(
        homologaciones_df, "test", [brand_count_expectation]
    )
    print("reporte de expectativas", report)
    

In [None]:
@task_dq
def ordentallas_validar_join(datalake, gx_toolkit, logger):
    """Check that every ``Marca`` in Orden Tallas exists in homologaciones.

    Reads ``cleansed-zone/orden_tallas.csv`` and
    ``cleansed-zone/homologaciones.csv`` (both ``|``-separated) through
    ``datalake``, logs the first row of Orden Tallas, and runs an in-set
    expectation on its ``Marca`` column against the distinct ``Marca`` values
    of homologaciones, via ``gx_toolkit.run_expectations_on_df`` with
    ``"test"`` as the second argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/orden_tallas.csv", sep="|")
    validation_set = datalake.read_csv("cleansed-zone/homologaciones.csv", sep="|")
    # Plain Python list rather than the array returned by unique(), so the
    # expectation kwargs hold only built-in types.
    marcas_set = validation_set.Marca.unique().tolist()
    logger.info(source.head(1))
    logger.info("Validando JOIN Orden Tallas...")

    ordentallas_marca_join = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "Marca",
            # The expectation's parameter is `value_set`; the previous key
            # `valueset` did not supply it.
            "value_set": marcas_set,
            "result_format": "SUMMARY",
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "Algunos elementos de la columna 'marca' no están presentes en el archivo de Orden Tallas",
            }
        },
    )
    url = gx_toolkit.run_expectations_on_df(source, "test", [ordentallas_marca_join])
    print("reporte de expectativas", url)



In [None]:
# NOTE(review): this cell defines `ordentallas_validar_join` a second time,
# identical to the previous cell. The later definition shadows the earlier
# name; confirm whether this cell was meant to be a different task or should
# be removed.
@task_dq
def ordentallas_validar_join(datalake, gx_toolkit, logger):
    """Check that every ``Marca`` in Orden Tallas exists in homologaciones.

    Reads ``cleansed-zone/orden_tallas.csv`` and
    ``cleansed-zone/homologaciones.csv`` (both ``|``-separated) through
    ``datalake``, logs the first row of Orden Tallas, and runs an in-set
    expectation on its ``Marca`` column against the distinct ``Marca`` values
    of homologaciones, via ``gx_toolkit.run_expectations_on_df`` with
    ``"test"`` as the second argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/orden_tallas.csv", sep="|")
    validation_set = datalake.read_csv("cleansed-zone/homologaciones.csv", sep="|")
    # Plain Python list rather than the array returned by unique(), so the
    # expectation kwargs hold only built-in types.
    marcas_set = validation_set.Marca.unique().tolist()
    logger.info(source.head(1))
    logger.info("Validando JOIN Orden Tallas...")

    ordentallas_marca_join = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "Marca",
            # The expectation's parameter is `value_set`; the previous key
            # `valueset` did not supply it.
            "value_set": marcas_set,
            "result_format": "SUMMARY",
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "Algunos elementos de la columna 'marca' no están presentes en el archivo de Orden Tallas",
            }
        },
    )
    url = gx_toolkit.run_expectations_on_df(source, "test", [ordentallas_marca_join])
    print("reporte de expectativas", url)

In [None]:
# NOTE(review): unlike the other tasks in this notebook, this function is not
# decorated with @task_dq. Confirm whether that is intentional before adding it.
def ean_validar_cantidad_registros(datalake, gx_toolkit, logger):
    """Check that the number of distinct ``EAN`` values is between 3000 and 4000.

    Reads ``cleansed-zone/ean.csv`` (``|``-separated) through ``datalake``,
    logs its first row, and runs a distinct-value-count expectation on the
    ``EAN`` column via ``gx_toolkit.run_expectations_on_df`` with ``"test"``
    as the second argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/ean.csv", sep="|")
    logger.info(source.head(1))
    logger.info("Validando cantidad de registros en EAN...")
    ean_unique = ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs={
            "column": "EAN",
            "min_value": 3000,
            "max_value": 4000,
            "result_format": "SUMMARY",
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "La cantidad de productos presentes excede la esperada",
            }
        },
    )
    # Run the expectation built above. The call previously passed
    # `ordentallas_marca_join`, a name that does not exist in this function.
    url = gx_toolkit.run_expectations_on_df(source, "test", [ean_unique])
    print("reporte de expectativas", url)

In [None]:
@task_dq
def tallasagotadas_validar_join(datalake, gx_toolkit, logger):
    """Check Tallas Agotadas values against the distinct ``EAN`` values of ean.csv.

    Reads ``cleansed-zone/tallas_agotadas.csv`` and ``cleansed-zone/ean.csv``
    (both ``|``-separated) through ``datalake``, logs the first row of Tallas
    Agotadas, and runs an in-set expectation via
    ``gx_toolkit.run_expectations_on_df`` with ``"test"`` as the second
    argument. The returned value is printed.

    Args:
        datalake: object exposing ``read_csv(path, sep=...)``.
        gx_toolkit: object exposing
            ``run_expectations_on_df(df, name, expectations)``.
        logger: logger used for progress messages.
    """
    source = datalake.read_csv("cleansed-zone/tallas_agotadas.csv", sep="|")
    validation_set = datalake.read_csv("cleansed-zone/ean.csv", sep="|")
    # Plain Python list rather than the array returned by unique(), so the
    # expectation kwargs hold only built-in types.
    marcas_set = validation_set.EAN.unique().tolist()
    logger.info(source.head(1))
    logger.info("Validando JOIN Tallas Agotadas...")

    # NOTE(review): the allowed values are EAN codes, but the checked column
    # is 'Marca' and the note text mentions Orden Tallas. This looks carried
    # over from the Orden Tallas task. Confirm which column of
    # tallas_agotadas.csv should be compared, then update the column and note.
    tallasagotadas_marca_join = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "Marca",
            # The expectation's parameter is `value_set`; the previous key
            # `valueset` did not supply it.
            "value_set": marcas_set,
            "result_format": "SUMMARY",
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "Algunos elementos de la columna 'marca' no están presentes en el archivo de Orden Tallas",
            }
        },
    )
    url = gx_toolkit.run_expectations_on_df(source, "test", [tallasagotadas_marca_join])
    print("reporte de expectativas", url)