In [1]:
#Imports generales
import pysftp
import logging
import sys
import yaml
import os
import pandas as pd

#Cargar directorio CDF y Configuraciones
sys.path.append('../')
from dotenv                                            import load_dotenv
load_dotenv()

#Importar CDF
from centraal_dataframework.resources                  import datalake
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from centraal_dataframework.tasks                      import task_dq, task

# Environment setup: configure the root logger to emit INFO-level records and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
# Fetch the newest scraping file from the SFTP server
@task
def get_scrapping_file(datalake, logger):
    """Copy the most recently modified SFTP file into the datalake as CSV.

    Connects to the server described by the ``sftp_servidor``, ``sftp_port``,
    ``sftp_usuario`` and ``sftp_clave`` environment variables, changes into
    ``sftp_raiz``, picks the non-directory entry with the greatest modification
    time, parses it as a comma-separated file and writes it to
    ``<datalake_workdir>/<scrapping_workdir>/scrapping.csv`` with ``|`` as the
    separator.

    Args:
        datalake: object exposing ``write_csv(df, path, **kwargs)``.
        logger: logger used to report which remote file was selected.

    Raises:
        KeyError: if one of the required environment variables is missing.
        FileNotFoundError: if the remote directory contains no regular file.
    """
    # Resolve the destination first so a missing variable fails before connecting.
    csv_output_dir = os.environ['datalake_workdir'] + '/' + os.environ['scrapping_workdir'] + '/'

    # SECURITY NOTE(review): hostkeys = None disables SSH host-key verification,
    # which leaves the connection open to man-in-the-middle attacks. Kept as in
    # the original; consider loading a known_hosts file instead.
    cnopts = pysftp.CnOpts()
    cnopts.hostkeys = None

    # The context manager closes the connection even when listing or parsing fails.
    with pysftp.Connection(host     = os.environ['sftp_servidor'],
                           port     = int(os.environ['sftp_port']),
                           username = os.environ['sftp_usuario'],
                           password = os.environ['sftp_clave'],
                           cnopts   = cnopts) as sftp:
        sftp.cwd(os.environ['sftp_raiz'])

        # Directories are recognised by the leading 'd' of the ls-style long name.
        candidates = [entry for entry in sftp.listdir_attr()
                      if not entry.longname.startswith('d')]
        if not candidates:
            raise FileNotFoundError(
                "No se encontraron archivos en el directorio SFTP '" + os.environ['sftp_raiz'] + "'"
            )

        # Use the modification time: the access time changes whenever the file is
        # merely read, so it does not identify the newest upload.
        newest = max(candidates, key=lambda entry: entry.st_mtime or 0)
        logger.info("Archivo de scrapping seleccionado: %s", newest.filename)

        # NOTE(review): recorded outputs of later cells show mojibake such as
        # 'MarrÃ³n', which suggests the source may be UTF-8 rather than latin1 —
        # confirm the real encoding with the data provider.
        with sftp.open(newest.filename) as sfile:
            scrapping_file = pd.read_csv(sfile, sep = ',', encoding = 'latin1')

    # Write the DataFrame into the raw zone.
    datalake.write_csv(scrapping_file, csv_output_dir+'scrapping.csv', sep='|', index = False, encoding='latin1')

In [None]:
@task_dq
def scrapping_validate_prerequisites(datalake, gx_toolkit, logger):
    """Check the basic prerequisites of the Scrapping file.

    Reads ``scrapping.csv`` from ``<datalake_workdir>/<scrapping_workdir>/`` and
    runs the ``SCRAPPING_MANDATORY`` suite, which requires the column names to
    match the expected set exactly and the ``price`` column to contain no nulls.
    The value returned by the toolkit is printed.
    """
    base_dir = f"{os.environ['datalake_workdir']}/{os.environ['scrapping_workdir']}/"
    scrapping_df = datalake.read_csv(base_dir + "scrapping.csv", sep="|", encoding='latin1')
    logger.info(scrapping_df.head(1))
    logger.info("Validando prerrequisitos del archivo Scrapping...")

    # Full set of column names the file must carry.
    expected_columns = [
        "date", "canal", "category", "subcategory", "subcategory2",
        "subcategory3", "marca", "modelo", "sku", "upc", "item",
        "item characteristics", "url sku", "image", "price", "sale price",
        "shipment cost", "sales flag", "store id", "store name",
        "store address", "stock", "upc wm", "final price", "upc wm2", "comp",
        "composition", "homogenized_clothing", "homogenized_subcategory",
        "homogenized_category", "homogenized_color", "made_in",
    ]

    # Column names must match the expected set exactly.
    columns_match = ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_set",
        kwargs={
            "column_set": expected_columns,
            "exact_match": True,
            "result_format": "SUMMARY",
        },
        meta={"notes": {
            "format": "markdown",
            "content": "Las columnas del archivo de Scrapping no concuerdan con las esperadas",
        }},
    )

    # Every row must carry a price.
    price_present = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "price", "result_format": "SUMMARY"},
        meta={"notes": {
            "format": "markdown",
            "content": "Algunos precios del archivo de Scrapping no están presentes",
        }},
    )

    # Run the suite and print the value returned by the toolkit.
    suite = [columns_match, price_present]
    url = gx_toolkit.run_expectations_on_df(scrapping_df, "SCRAPPING_MANDATORY", suite)
    print("reporte de expectativas", url)

In [None]:
# Run the prerequisite checks. Called with no arguments; presumably the
# @task_dq decorator supplies datalake, gx_toolkit and logger — TODO confirm.
scrapping_validate_prerequisites()

In [2]:
@task_dq
def scrapping_validate_column_contents(datalake, gx_toolkit, logger):
    """Detect inconsistencies in the column contents of the Scrapping file.

    Reads ``scrapping.csv`` from ``<datalake_workdir>/<scrapping_workdir>/`` and
    runs the ``SCRAPPING_CONSISTENCE`` suite:

    * the four ``homogenized_*`` columns (clothing, category, subcategory,
      color) must not contain nulls;
    * ``marca`` must have between 9 and 11 distinct values.

    The value returned by the toolkit is printed.
    """
    csv_input_dir = os.environ['datalake_workdir'] + '/' + os.environ['scrapping_workdir'] + '/'
    source = datalake.read_csv(csv_input_dir + "scrapping.csv", sep="|", encoding='latin1')
    logger.info(source.head(1))
    logger.info("Validando contenido de columnas del archivo Scrapping...")

    def _not_null(column):
        """Build a not-null expectation for ``column`` with its report note."""
        return ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={
                "column":        column,
                "result_format": "SUMMARY"
            },
            meta={
                "notes": {
                    "format": "markdown",
                    "content": f"Algunos elementos de la columna '{column}' no están presentes"
                }
            }
        )

    # The file has no 'clothing' column (the prerequisite check enforces
    # 'homogenized_clothing'), so the homogenized name is validated here.
    not_null_columns = [
        "homogenized_clothing",
        "homogenized_category",
        "homogenized_subcategory",
        "homogenized_color",
    ]
    expectations = [_not_null(column) for column in not_null_columns]

    # Expected number of distinct brands: 9, tolerating up to 11.
    # TODO: promote these bounds to configuration (flagged as a parameter by the author).
    marca_min_unique = 9
    marca_max_unique = 11
    expectations.append(ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs={
            "column":        "marca",
            "min_value":     marca_min_unique,
            "max_value":     marca_max_unique,
            "result_format": "SUMMARY"
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "La cantidad de valores distintos de la columna 'marca' no está en el rango esperado"
            }
        }
    ))

    url = gx_toolkit.run_expectations_on_df(source, "SCRAPPING_CONSISTENCE", expectations)
    print("reporte de expectativas", url)

In [3]:
# Run the column-content checks (no arguments are passed at the call site).
# The recorded run below stopped with NameError: 'container_path' is not defined.
scrapping_validate_column_contents()

TAREA: scrapping_validate_column_contents--2023-11-25 07:40:51,979-INFO-         date          canal category subcategory   subcategory2 subcategory3  \
0  05/10/2023  Zara Colombia   Hombre      JERSEY  F. Jersey M/C          NaN   

  marca   modelo        sku                   upc  ...     upc wm final price  \
0  Zara  MarrÃ³n  275572806  275572806_MarrÃ³n_XL  ...  275572806    249000.0   

     upc wm2 comp                                        composition  \
0  275572806  NaN  Exterior: 100% poliÃ©ster: que contiene al men...   

   homogenized_clothing homogenized_subcategory homogenized_category  \
0             Chaquetas           Ropa exterior               Hombre   

  homogenized_color made_in  
0             CafÃ©   China  

[1 rows x 32 columns]
TAREA: scrapping_validate_column_contents--2023-11-25 07:40:51,979-INFO-Validando contenido de columnas del archivo Scrapping...


Calculating Metrics:   0%|          | 0/24 [00:00<?, ?it/s]

NameError: name 'container_path' is not defined

In [2]:
@task_dq
def scrapping_validate_prices(datalake, gx_toolkit, logger):
    """Validate the ``final price`` ranges of the Scrapping file.

    Reads ``scrapping.csv`` from ``<datalake_workdir>/<scrapping_workdir>/`` and
    runs two suites on disjoint subsets of the rows:

    * ``SCRAPPING_PRECIOS_ZARA`` — rows whose ``canal`` is ``'Zara Colombia'``,
      expected range 12,000 to 4,000,000;
    * ``SCRAPPING_PRECIOS`` — all other rows, expected range 12,000 to 1,300,000.

    The value returned by the toolkit for each suite is printed.
    """
    csv_input_dir = os.environ['datalake_workdir'] + '/' + os.environ['scrapping_workdir'] + '/'
    source = datalake.read_csv(csv_input_dir + "scrapping.csv", sep="|", encoding='latin1')
    logger.info(source.head(1))
    # The original message was copied from the column-content task; report this task.
    logger.info("Validando rangos de precios del archivo Scrapping...")

    def _final_price_between(min_value, max_value, note):
        """Build a range expectation on 'final price' with the given report note."""
        return ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column":        "final price",
                "min_value":     min_value,
                "max_value":     max_value,
                "result_format": "SUMMARY"
            },
            meta={
                "notes": {
                    "format": "markdown",
                    "content": note
                }
            }
        )

    # Zara: 12,000 - 4,000,000
    scrapping_zara_price_range = _final_price_between(
        12000, 4000000,
        "Para la marca Zara, algunos precios no se encuentran en el rango esperado."
    )
    # Other channels: 12,000 - 1,300,000
    scrapping_otras_price_range = _final_price_between(
        12000, 1300000,
        "Algunos precios no se encuentran en el rango esperado."
    )

    # Split the rows by sales channel so each subset is checked against its own range.
    is_zara = source['canal'] == 'Zara Colombia'
    url = gx_toolkit.run_expectations_on_df(source[is_zara], "SCRAPPING_PRECIOS_ZARA", [scrapping_zara_price_range])
    print("reporte de expectativas", url)
    url = gx_toolkit.run_expectations_on_df(source[~is_zara], "SCRAPPING_PRECIOS", [scrapping_otras_price_range])
    print("reporte de expectativas", url)
    

In [3]:
# Run the price-range checks (no arguments are passed at the call site).
# The recorded run below stopped with NameError: 'container_path' is not defined.
scrapping_validate_prices()

TAREA: scrapping_validate_prices--2023-11-25 07:45:55,378-INFO-         date          canal category subcategory   subcategory2 subcategory3  \
0  05/10/2023  Zara Colombia   Hombre      JERSEY  F. Jersey M/C          NaN   

  marca   modelo        sku                   upc  ...     upc wm final price  \
0  Zara  MarrÃ³n  275572806  275572806_MarrÃ³n_XL  ...  275572806    249000.0   

     upc wm2 comp                                        composition  \
0  275572806  NaN  Exterior: 100% poliÃ©ster: que contiene al men...   

   homogenized_clothing homogenized_subcategory homogenized_category  \
0             Chaquetas           Ropa exterior               Hombre   

  homogenized_color made_in  
0             CafÃ©   China  

[1 rows x 32 columns]
TAREA: scrapping_validate_prices--2023-11-25 07:45:55,394-INFO-Validando contenido de columnas del archivo Scrapping...


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

NameError: name 'container_path' is not defined

In [3]:
@task_dq
def marcaspropias_validate_column_contents(datalake, gx_toolkit, logger):
    """Validate the column contents of the own-brands (Marcas Propias) file.

    Reads ``marcas_propias.csv`` from ``<datalake_workdir>/<datasvcs_workdir>/``
    and runs the ``MARCAS_PROPIAS_CONSISTENCY`` suite, which requires the
    ``categoria``, ``use`` and ``prendasGenerales`` columns to contain no nulls.
    The value returned by the toolkit is printed.
    """
    base_dir = f"{os.environ['datalake_workdir']}/{os.environ['datasvcs_workdir']}/"
    marcas_df = datalake.read_csv(base_dir + "marcas_propias.csv", sep=",")
    logger.info(marcas_df.head(1))
    logger.info("Validando contenido de columnas del archivo Marcas Propias...")

    # Columns that must be fully populated: category, use and garment type.
    # NOTE(review): the original author marked 'prendasGenerales' with "??????",
    # i.e. unsure it is the right garment-type column — confirm.
    required_columns = ("categoria", "use", "prendasGenerales")

    # One not-null expectation per required column, in the order listed above.
    suite = [
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": column, "result_format": "SUMMARY"},
            meta={"notes": {
                "format": "markdown",
                "content": f"Algunos elementos de la columna '{column}' no están presentes",
            }},
        )
        for column in required_columns
    ]

    url = gx_toolkit.run_expectations_on_df(marcas_df, "MARCAS_PROPIAS_CONSISTENCY", suite)
    print("reporte de expectativas", url)


In [6]:
# Run the Marcas Propias column-content checks (no arguments are passed at the call site).
marcaspropias_validate_column_contents()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,part_number,imageName,base64,year,quarter,origin,user,prendasGenerales,...,porcentaje5,porcentaje6,porcentaje7,tag,discontinued,enlaceImagen,fecha_consulta,createdAt,updatedAt,cargado_mongo
0,0,33105,730889_015212,CAMISETA LUWA ARENA XL,https://www.gef.co/dx/api/dam/custom/2023/gef/...,2023,2,Gef,Gef,Camisetas,...,,,,Descuento,True,https://www.gef.co/dx/api/dam/custom/2023/gef/...,2023-06-15,2023-05-06,2023-09-27,1
1,1,18784,707733_016608,POLO MATT CAFÉ 8,https://www.gef.co/dx/api/dam/custom/2022/gef/...,2023,3,Gef,Gef,Polo,...,,,,Descuento,False,https://www.gef.co/dx/api/dam/custom/2022/gef/...,2023-08-30,2023-05-06,2023-09-27,1
2,2,168902,722475_008000,Medias Aisha X 3 Jr Surtido 8-10,https://www.galax.co/dx/api/dam/custom/reempla...,2023,3,Galax,Galax,Calcetines,...,,,,Nuevo,False,https://www.galax.co/dx/api/dam/custom/reempla...,2023-08-02,2023-05-06,2023-09-27,1
3,3,147443,725177_034050,Pantalon Merri Rosado 9-12M,https://www.babyfresh.co/dx/api/dam/custom/202...,2023,3,Baby fresh,Baby fresh,Otros accesorios,...,,,,Descuento,False,https://www.babyfresh.co/dx/api/dam/custom/202...,2023-09-06,2023-05-06,2023-09-27,1
4,4,73269,723379_000900,CONJUNTO LOJU KD BLANCO 8,https://www.gef.co/dx/api/dam/custom/2022/gef/...,2023,2,Gef,Gef,Conjunto,...,,,,Descuento,False,https://www.gef.co/dx/api/dam/custom/2022/gef/...,2023-05-12,2023-05-06,2023-09-27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,62655,117314_009909,4 Pares De Medias Frima Surtido 9-11,https://www.gef.co/dx/api/dam/custom/reemplaza...,2023,2,Gef,Gef,Calcetines,...,,,,Descuento,True,https://www.gef.co/dx/api/dam/custom/reemplaza...,2023-05-22,2023-05-06,2023-09-27,1
1996,1996,152162,727760_015520,Camiseta Carmela Rosada 9-12M,https://www.babyfresh.co/dx/api/dam/custom/202...,2023,3,Baby fresh,Baby fresh,Camisetas,...,,,,,False,https://www.babyfresh.co/dx/api/dam/custom/202...,2023-07-19,2023-05-06,2023-09-27,1
1997,1997,140217,725849_000718,Drei Tenis Suela Gris 26,https://www.babyfresh.co/dx/api/dam/custom/ree...,2023,2,Baby fresh,Baby fresh,Tenis,...,,,,Descuento,False,https://www.babyfresh.co/dx/api/dam/custom/ree...,2023-05-26,2023-05-06,2023-09-27,1
1998,1998,3499,721638_007221,Medias Masa Gris Jaspe 10-12,https://www.gef.co/dx/api/dam/custom/2020/GEF/...,2023,3,Gef,Gef,Calcetines,...,,,,Descuento,False,https://www.gef.co/dx/api/dam/custom/2020/GEF/...,2023-08-09,2023-08-02,2023-09-27,1


In [10]:
@task_dq
def homologaciones_validar_cantidad(datalake, gx_toolkit, logger):
    """Validate how many distinct brands exist in the homologation file.

    Reads ``homologaciones.csv`` from ``<datalake_workdir>/<homologa_workdir>/``
    and runs the ``CANTIDAD_MARCAS_HOMOLOGACION`` suite, which requires the
    ``Marca`` column to have between 9 and 11 distinct values. The value
    returned by the toolkit is printed.
    """
    csv_input_dir = os.environ['datalake_workdir'] + '/' + os.environ['homologa_workdir'] + '/'
    source = datalake.read_csv(csv_input_dir + "homologaciones.csv", sep=",")
    logger.info(source.head(1))
    logger.info("Validando cantidad de registros en homologaciones..")
    homologacion_marca_join = ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs={
            "column":        "Marca",
            "min_value":     9,
            "max_value":     11,
            "result_format": "SUMMARY"
        },
        meta={
            "notes": {
                "format": "markdown",
                # The note now describes the check performed (a distinct-value
                # count range); the previous text described a not-null failure.
                "content": "La cantidad de marcas distintas en la columna 'Marca' no está en el rango esperado"
            }
        }
    )
    url = gx_toolkit.run_expectations_on_df(source, "CANTIDAD_MARCAS_HOMOLOGACION", [homologacion_marca_join])
    print("reporte de expectativas", url)
    

In [11]:
# Run the brand-count check (no arguments are passed at the call site).
# The recorded run below stopped with NameError: 'container_path' is not defined.
homologaciones_validar_cantidad()

TAREA: homologaciones_validar_cantidad--2023-11-25 10:51:22,410-INFO-   Unnamed: 0 Marca Canal
0           0   H&M   H&M
TAREA: homologaciones_validar_cantidad--2023-11-25 10:51:22,410-INFO-Validando cantidad de registros en homologaciones..


Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

NameError: name 'container_path' is not defined

In [2]:
@task_dq
def ordentallas_validar_join(datalake, gx_toolkit, logger):
    """Validate that the size-order (Orden Tallas) file can be joined by brand.

    Reads ``orden_tallas.csv`` and ``homologaciones.csv`` from
    ``<datalake_workdir>/<homologa_workdir>/`` and runs the
    ``ORDENTALLA_MARCAS_JOIN`` suite, which requires every ``Marca`` value in
    ``orden_tallas.csv`` to be among the distinct ``Marca`` values of
    ``homologaciones.csv``. The value returned by the toolkit is printed.
    """
    base_dir = f"{os.environ['datalake_workdir']}/{os.environ['homologa_workdir']}/"
    tallas_df = datalake.read_csv(base_dir + "orden_tallas.csv", sep=",")
    homologaciones_df = datalake.read_csv(base_dir + "homologaciones.csv", sep=",")

    # Distinct brands of the homologation file act as the allowed set.
    allowed_brands = homologaciones_df.Marca.unique().tolist()

    logger.info(tallas_df.head(1))
    logger.info("Validando JOIN Orden Tallas...")

    expectation_kwargs = {
        "column": "Marca",
        "value_set": allowed_brands,
        "result_format": "SUMMARY",
    }
    report_notes = {
        "format": "markdown",
        "content": "Algunos elementos de la columna 'marca' no están presentes en el archivo de Orden Tallas",
    }
    brand_in_set = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs=expectation_kwargs,
        meta={"notes": report_notes},
    )

    url = gx_toolkit.run_expectations_on_df(tallas_df, "ORDENTALLA_MARCAS_JOIN", [brand_in_set])
    print("reporte de expectativas", url)



In [3]:
# Run the Orden Tallas join check (no arguments are passed at the call site).
# The recorded run below stopped with NameError: 'container_path' is not defined.
ordentallas_validar_join()

    

TAREA: ordentallas_validar_join--2023-11-25 11:13:53,747-INFO-   Unnamed: 0.1  Unnamed: 0         Marca Talla Equivalencia  Orden
0             0           0  Arturo Calle     2            2      1
TAREA: ordentallas_validar_join--2023-11-25 11:13:53,747-INFO-Validando JOIN Orden Tallas...


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

NameError: name 'container_path' is not defined

In [None]:
# Decorated like every other validation task in this notebook; the decorator
# was missing here, so the function was not wrapped as a data-quality task.
@task_dq
def ean_validar_cantidad_registros(datalake, gx_toolkit, logger):
    """Validate the number of distinct EAN codes.

    Reads ``ean.csv`` (``|``-separated) from
    ``<datalake_workdir>/<homologa_workdir>/`` and runs the ``CANTIDAD_EAN``
    suite, which requires the ``EAN`` column to have between 3000 and 4000
    distinct values. The value returned by the toolkit is printed.
    """
    csv_input_dir = os.environ['datalake_workdir'] + '/' + os.environ['homologa_workdir'] + '/'
    source = datalake.read_csv(csv_input_dir + "ean.csv", sep="|")
    logger.info(source.head(1))
    logger.info("Validando Cantidad EAN...")
    ean_unique = ExpectationConfiguration(
        expectation_type="expect_column_unique_value_count_to_be_between",
        kwargs={
            "column":        "EAN",
            "min_value":     3000,
            "max_value":     4000,
            "result_format": "SUMMARY"
        },
        meta={
            "notes": {
                "format": "markdown",
                # The check is two-sided, so the note covers counts below the
                # minimum as well as above the maximum.
                "content": "La cantidad de EAN distintos no está en el rango esperado (3000 a 4000)"
            }
        }
    )
    url = gx_toolkit.run_expectations_on_df(source, "CANTIDAD_EAN", [ean_unique])
    print("reporte de expectativas", url)

In [4]:
# Ad-hoc inspection: read the EAN complements CSV through the module-level
# `datalake` import and show the resulting object as the cell output.
# The recorded run of this cell ended in KeyboardInterrupt.
ean = datalake.read_csv('raw-zone/blackBox/scrapingEquivalencias/EAN_COMPLEMENTOS.csv')
ean
# Leftover commented-out attempt to read from a local Windows path (incomplete):
#ean = pd.read_csv('C:\\Users\\Wilber\\Download\\')

KeyboardInterrupt: 

In [None]:
@task_dq
def tallasagotadas_validar_join(datalake, gx_toolkit, logger):
    """Validate that the sold-out sizes (Tallas Agotadas) file joins against the EAN file.

    Reads ``cleansed-zone/tallas_agotadas.csv`` and ``cleansed-zone/ean.csv``
    (both ``|``-separated) and runs a suite requiring every value of the
    ``Marca`` column of the first file to be among the distinct ``EAN`` values
    of the second. The value returned by the toolkit is printed.
    """
    source = datalake.read_csv("cleansed-zone/tallas_agotadas.csv", sep="|")
    validation_set = datalake.read_csv("cleansed-zone/ean.csv", sep="|")
    # Convert to a plain list, as ordentallas_validar_join does, so that a
    # numpy array is not embedded in the expectation configuration.
    ean_set = validation_set.EAN.unique().tolist()
    logger.info(source.head(1))
    logger.info("Validando JOIN Tallas Agotadas...")

    # NOTE(review): the 'Marca' column is compared against EAN codes; the
    # intended join column is probably an EAN column — confirm against the
    # tallas_agotadas schema.
    tallasagotadas_marca_join = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column":        "Marca",
            # The expectation's parameter is 'value_set'; the previous key
            # 'valueset' did not match it.
            "value_set":     ean_set,
            "result_format": "SUMMARY"
        },
        meta={
            "notes": {
                "format": "markdown",
                "content": "Algunos elementos de la columna 'Marca' de Tallas Agotadas no están presentes en el archivo de EAN"
            }
        }
    )
    # TODO: replace the placeholder suite name "test" with a descriptive one.
    url = gx_toolkit.run_expectations_on_df(source, "test", [tallasagotadas_marca_join])
    print("reporte de expectativas", url)