In [None]:
# Data handling and filesystem paths.
import pandas as pd
from pathlib import Path
# Display configuration for the whole notebook: no row/column/cell-width
# truncation, wide console output, and floats rendered with two decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_info_columns', 10000)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

# NOTE(review): seaborn is not referenced anywhere in the visible cells.
import seaborn as sns
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
# Regular expressions, used further down to normalise whitespace in date strings.
import re


In [None]:
def cut_decimal_part(df, column):
    """
    Replace a numeric column with the text of its integer part.

    The column is parsed with ``pd.to_numeric(..., errors='coerce')``; every
    finite value is then truncated toward zero with ``int()`` and rendered as
    ``str`` (13.5 -> "13", 12.0 -> "12", -7.9 -> "-7"). Values that have no
    integer representation -- non-numeric text, nulls and +/-infinity --
    become the empty string.

    Parameters:
        df (pd.DataFrame): Frame holding the column. It is modified in place.
        column (str): Name of the column to convert.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``column`` holding strings.
    """
    infinities = (float('inf'), float('-inf'))

    def _integer_text(value):
        # int() raises ValueError on NaN and OverflowError on infinity; both
        # are mapped to '' so one bad cell cannot abort the whole column.
        if pd.isnull(value) or value in infinities:
            return ''
        return str(int(value))

    numeric = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric.apply(_integer_text)

    return df


In [None]:

def handle_null_values(df, fill_str="", fill_float=0.0, fill_datetime=""):
    """
    Fill null values in DataFrame columns based on data type.

    The frame is modified in place and also returned. Only three dtype groups
    are touched; any other column (e.g. integer or boolean) is left unchanged.

    Parameters:
        df (pd.DataFrame): The input DataFrame (mutated in place).
        fill_str (str): Replacement for nulls in ``object`` columns. After the
            fill every value of those columns is cast to ``str``, so non-string
            objects stored in them become text too. Default is "".
        fill_float (float): Replacement for nulls in ``float64`` columns.
            Default is 0.0.
        fill_datetime: Replacement for nulls in ``datetime64[ns]`` columns.
            Default is "", but you can also pass a default datetime.
            NOTE(review): what filling a datetime column with "" produces
            (an object column, or nulls left as NaT) appears to depend on the
            installed pandas version -- confirm, or pass an explicit timestamp.

    Returns:
        pd.DataFrame: The same ``df`` object with nulls handled.
    """
    # Text columns: fill first, then normalise every value to ``str``.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].fillna(fill_str).astype(str)

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].fillna(fill_float)

    # Iterate the column labels explicitly (``.columns``), like the two loops
    # above, instead of relying on DataFrame iteration yielding labels.
    for col in df.select_dtypes(include=['datetime64[ns]']).columns:
        df[col] = df[col].fillna(fill_datetime)

    return df

In [None]:
import pandas as pd

def get_dataframe_summary(df):
    """
    Build a per-column profile of ``df``.

    The returned frame has one row per column of ``df`` and these columns:
      - Data Type: dtype of the column
      - Non Null Count: number of non-null values
      - Null Count: number of null values
      - Null Percentage: nulls as a percentage of the row count (2 decimals)
      - Unique Values: number of distinct non-null values

    Side effect: each call also resets three global pandas display options
    (max_rows, max_columns, width).
    """
    for option, value in (
        ('display.max_rows', None),
        ('display.max_columns', None),
        ('display.width', 1000),
    ):
        pd.set_option(option, value)

    null_counts = df.isna().sum()
    null_share = (null_counts / len(df) * 100).round(2)
    distinct_counts = [df[name].nunique() for name in df.columns]

    profile = {
        'Data Type': df.dtypes,
        'Non Null Count': df.count(),
        'Null Count': null_counts,
        'Null Percentage': null_share,
        'Unique Values': distinct_counts,
    }
    return pd.DataFrame(profile)

In [None]:
# Project root: eight directory levels above the notebook's working directory.
# NOTE(review): this depends on where the kernel is started -- confirm the depth.
BASE_DIR = (
    Path.cwd()
    .parent.parent.parent.parent
    .parent.parent.parent.parent
)

# Input workbooks, all under media/minpub/validator_report/extract/.
SAVE_DIR_EXTRACT_EXCEL = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "excel",
    "CORTE 3 _ 10-03-25 AL 16-03-25_20250328_182204.xlsx",
)
SAVE_DIR_EXTRACT_SGA_335 = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "sga_335",
    "sga_reporte_10-03-2025_16-03-2025_20250402_112253.xlsx",
)
CID_CUISMP_PATH = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "sharepoint_cid_cuismp",
    "MINPU - CID-CUISMP - AB.xlsx",
)


In [None]:
# Load the "corte" workbook; skipfooter=2 drops the last two rows of the sheet.
df_corte_excel = pd.read_excel(SAVE_DIR_EXTRACT_EXCEL, skipfooter=2, engine="openpyxl")

In [None]:
# The summary is taken BEFORE CUISMP is converted, so the table displayed below
# describes the column as it was loaded, not the string version produced next.
info_df_excel2 = get_dataframe_summary(df_corte_excel)
# Turn CUISMP into integer-like text (blank when missing or non-numeric).
df_corte_excel = cut_decimal_part(df_corte_excel, 'CUISMP')

info_df_excel2

In [None]:
# Fill remaining nulls by dtype, then re-profile the frame to check the result.
df_corte_excel = handle_null_values(df_corte_excel)
info_df_excel2 = get_dataframe_summary(df_corte_excel)
info_df_excel2


In [None]:
# Preview the first row of the cleaned corte frame.
df_corte_excel.head(1)

In [None]:

# Load the SGA 335 report and profile it as loaded (before any cleaning).
df_sga_dinamico_335 = pd.read_excel(SAVE_DIR_EXTRACT_SGA_335)
info_df_335 = get_dataframe_summary(df_sga_dinamico_335)
info_df_335

In [None]:
# Preview the first row of the raw SGA report.
df_sga_dinamico_335.head(1)

In [None]:

# Work on the string form of both interruption columns: trim the ends and
# collapse every run of whitespace into a single space.
for date_col in ('interrupcion_inicio', 'interrupcion_fin'):
    df_sga_dinamico_335[date_col] = (
        df_sga_dinamico_335[date_col]
        .astype(str)
        .apply(lambda text: re.sub(r"\s+", " ", text.strip()))
    )

In [None]:
# Parse both interruption columns as day-first dates; values that cannot be
# parsed become NaT instead of raising.
for date_col in ('interrupcion_inicio', 'interrupcion_fin'):
    df_sga_dinamico_335[date_col] = pd.to_datetime(
        df_sga_dinamico_335[date_col],
        errors='coerce',
        dayfirst=True,
    )

In [None]:
# Fill nulls by dtype (datetime columns receive the default fill ""), then
# re-profile the frame to inspect the outcome.
df_sga_dinamico_335 = handle_null_values(df_sga_dinamico_335)
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335


In [None]:
# Rows whose interruption start is still null after parsing and null handling;
# the shape shows how many there are.
null_rows = df_sga_dinamico_335[df_sga_dinamico_335['interrupcion_inicio'].isna()]
null_rows.shape

In [None]:
import numpy as np

# Expected start: rows flagged masivo == "Si" take fecha_generacion, every other
# row keeps interrupcion_inicio.
# NOTE(review): np.where returns a plain ndarray; if the two source columns do
# not share a dtype the result may be an object column -- confirm the dtype.
df_sga_dinamico_335['Expected_Inicio'] = np.where(df_sga_dinamico_335['masivo'] == "Si",
                                     df_sga_dinamico_335['fecha_generacion'],
                                     df_sga_dinamico_335['interrupcion_inicio'])
df_sga_dinamico_335.head(1)


In [None]:
# Re-profile the SGA frame now that Expected_Inicio has been added.
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:

# Load the SharePoint CID -> CUISMP workbook and profile it as loaded.
df_sharepoint_cid_cuismp = pd.read_excel(CID_CUISMP_PATH)
info_df_cuismp = get_dataframe_summary(df_sharepoint_cid_cuismp)
info_df_cuismp

In [None]:
# Convert CUISMP to integer text BEFORE filling nulls, the same order used for
# df_corte_excel. In the reverse order a float CUISMP column has its nulls
# replaced by 0.0 first, and those would then be rendered as the bogus id "0"
# instead of a blank, breaking the later CUISMP comparison.
df_sharepoint_cid_cuismp = cut_decimal_part(df_sharepoint_cid_cuismp, 'CUISMP')
df_sharepoint_cid_cuismp = handle_null_values(df_sharepoint_cid_cuismp)
info_df_cuismp = get_dataframe_summary(df_sharepoint_cid_cuismp)
info_df_cuismp

In [None]:
# Preview the first row of the cleaned SharePoint mapping.
df_sharepoint_cid_cuismp.head(1)

Validación de las columnas “CUISMP” y “DF” de B1: a partir del código de incidencia (“nro_incidencia”) en B2 se obtiene el CID; con ese CID se obtienen en B4 el CUISMP y el DF, que se comparan con el CUISMP y el DF de B1, respectivamente. Ambos valores deben coincidir. Además, el CUISMP también debe figurar en la columna “MEDIDAS CORRECTIVAS Y/O PREVENTIVAS TOMADAS” de B1.

In [None]:
def merge_df_sharepoint_cid_cuimp_df_sga_dinamico(df_sga_dinamico_335, df_sharepoint_cid_cuismp):
    """
    Left-join the SGA report with the SharePoint CID/CUISMP mapping on ``cid``.

    The SharePoint column ``CID`` is renamed to ``cid`` and the key of both
    frames is normalised to text, with null keys turned into "". Neither input
    frame is modified.

    NOTE(review): blank keys on both sides still match each other in the join;
    confirm whether rows without a CID should be excluded from the mapping.

    Parameters:
        df_sga_dinamico_335 (pd.DataFrame): Left frame; must contain ``cid``.
        df_sharepoint_cid_cuismp (pd.DataFrame): Right frame; must contain ``CID``.

    Returns:
        pd.DataFrame: Every SGA row, plus the mapping columns where the key
        matches. Overlapping column names receive the suffixes
        ``_sga_dinamico_335`` / ``_sharepoint_cid_cuismp``.
    """
    def _as_text_key(keys):
        # Nulls must be blanked BEFORE the cast: astype(str) turns them into
        # the literal text "nan"/"None", after which fillna() finds nothing.
        keys = keys.astype(object)
        return keys.where(keys.notna(), "").astype(str)

    sharepoint = df_sharepoint_cid_cuismp.rename(columns={"CID": "cid"})
    sharepoint = sharepoint.assign(cid=_as_text_key(sharepoint["cid"]))
    # assign() returns a new frame, so the caller's SGA frame keeps its own cid.
    sga = df_sga_dinamico_335.assign(cid=_as_text_key(df_sga_dinamico_335["cid"]))

    merge_sga_dinamico_335_sharepoint_cid_cuismp = pd.merge(
        sga,
        sharepoint,
        on='cid',
        how='left',
        suffixes=('_sga_dinamico_335', '_sharepoint_cid_cuismp')
    )

    return merge_sga_dinamico_335_sharepoint_cid_cuismp

# SGA report enriched with the SharePoint CID -> CUISMP mapping.
df = merge_df_sharepoint_cid_cuimp_df_sga_dinamico(df_sga_dinamico_335, df_sharepoint_cid_cuismp)
df.head(1)


In [None]:
# Profile the SGA + SharePoint merge (null counts show the unmatched keys).
info_df_cuismp_335 = get_dataframe_summary(df)
info_df_cuismp_335

In [None]:
def merge_df_corte_excel_merge_sga_dinamico_335_sharepoint_cid_cuismp(df_corte_excel, merge_sga_dinamico_335_sharepoint_cid_cuismp):
    """
    Left-join the corte workbook with the SGA/SharePoint merge on the incident id.

    The corte column ``TICKET`` is renamed to ``nro_incidencia`` and used as the
    join key as-is (no dtype or whitespace normalisation). Every corte row is
    kept; overlapping column names receive the suffixes ``_corte_excel`` /
    ``_sga_dinamico_335_sharepoint_cid_cuismp``.

    Parameters:
        df_corte_excel (pd.DataFrame): Left frame; must contain ``TICKET``.
        merge_sga_dinamico_335_sharepoint_cid_cuismp (pd.DataFrame): Right
            frame; must contain ``nro_incidencia``.

    Returns:
        pd.DataFrame: The joined frame.
    """
    corte_by_incident = df_corte_excel.rename(columns={'TICKET': 'nro_incidencia'})
    return corte_by_incident.merge(
        merge_sga_dinamico_335_sharepoint_cid_cuismp,
        on='nro_incidencia',
        how='left',
        suffixes=('_corte_excel', '_sga_dinamico_335_sharepoint_cid_cuismp'),
    )

# One row per corte ticket, enriched with the SGA and SharePoint columns.
merged_all = merge_df_corte_excel_merge_sga_dinamico_335_sharepoint_cid_cuismp(df_corte_excel, df)
merged_all.head(1)

In [None]:
pd.set_option('display.max_colwidth', None)

# CUISMP from the corte workbook vs. the one brought in by the SGA/SharePoint merge.
merged_all['CUISMP_match'] = merged_all['CUISMP_corte_excel'] == merged_all['CUISMP_sga_dinamico_335_sharepoint_cid_cuismp']

# DF from the corte workbook vs. the "Distrito Fiscal" column of the merge.
merged_all['DF_match'] = merged_all['DF'] == merged_all['Distrito Fiscal']

medidas_col = 'MEDIDAS CORRECTIVAS Y/O PREVENTIVAS TOMADAS'


def _cuismp_cited_in_text(cuismp, medidas):
    """Return True when a non-blank CUISMP occurs inside the measures text."""
    if pd.isnull(cuismp) or pd.isnull(medidas):
        return False
    cuismp = str(cuismp)
    # '' is a substring of every string, so a blank CUISMP has to be rejected
    # explicitly; otherwise it would always be reported as "found".
    # NOTE(review): this is a plain substring test, so a CUISMP that is part of
    # a longer number in the text also counts as found -- confirm this is intended.
    return cuismp != '' and cuismp in str(medidas)


# Built as a list of Python bools so the column is always boolean, which the
# ``~`` negations below rely on (also when merged_all has no rows).
merged_all['CUISMP_in_medias_tomadas'] = [
    _cuismp_cited_in_text(cuismp, medidas)
    for cuismp, medidas in zip(merged_all['CUISMP_corte_excel'], merged_all[medidas_col])
]

# A row passes only when all three checks pass.
merged_all['Validation_OK'] = merged_all['CUISMP_match'] & merged_all['DF_match'] & merged_all['CUISMP_in_medias_tomadas']

# Number of failed checks per row (0 to 3).
merged_all['fail_count'] = (~merged_all['CUISMP_match']).astype(int) + \
                               (~merged_all['DF_match']).astype(int) + \
                               (~merged_all['CUISMP_in_medias_tomadas']).astype(int)

merged_all.head(1)



In [None]:
import numpy as np
# One message fragment per check; a fragment is "" when that check passed.
cuismp_detail = np.where(
    ~merged_all['CUISMP_match'],
    "CUISMP mismatch: expected " + merged_all['CUISMP_sga_dinamico_335_sharepoint_cid_cuismp'].astype(str)
    + " but got " + merged_all['CUISMP_corte_excel'].astype(str) + ". ",
    "",
)
df_detail = np.where(
    ~merged_all['DF_match'],
    "DF mismatch: expected " + merged_all['Distrito Fiscal'].astype(str)
    + " but got " + merged_all['DF'].astype(str) + ". ",
    "",
)
medidas_detail = np.where(
    ~merged_all['CUISMP_in_medias_tomadas'],
    "CUISMP not found in MEDIDAS CORRECTIVAS Y/O PREVENTIVAS TOMADAS.",
    "",
)

# Fully valid rows get the success text; the rest get the concatenated fragments.
mensaje = np.where(
    merged_all['Validation_OK'],
    "Validation successful",
    cuismp_detail + df_detail + medidas_detail,
)
merged_all['mensaje'] = mensaje
merged_all['objetivo'] = 1

# Report only the rows where at least one check failed.
df_failures = merged_all.loc[merged_all['fail_count'] > 0]
df_failures[['nro_incidencia', 'mensaje', 'objetivo']]