In [None]:
# Core dependencies: pandas for tabular data, pathlib for filesystem paths.
import pandas as pd
from pathlib import Path
# Display settings: no row/column/width truncation when frames are rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_info_columns', 10000)
pd.set_option('display.width', 1000)
# Render floats with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)


import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import re

In [None]:
def cut_decimal_part(df, column):
    """
    Convert a column of floats (or numeric strings) to decimal-free strings.

    Values are parsed with ``pd.to_numeric(errors='coerce')`` and truncated
    toward zero (13.5 -> "13", 12.0 -> "12"). Anything that is not a finite
    number (unparseable text, missing values, +/-infinity) becomes ''.

    Parameters:
        df (pd.DataFrame): Frame holding the column. It is modified in place.
        column (str): Name of the column to convert.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``column`` replaced.
    """
    numeric = pd.to_numeric(df[column], errors='coerce')

    def _truncate(value):
        # int() raises on NaN and on infinity, so both map to the empty string.
        if pd.isnull(value) or value in (float('inf'), float('-inf')):
            return ''
        return str(int(value))

    df[column] = numeric.apply(_truncate)

    return df

In [None]:
def float_to_hhmm(hours_float):
    """
    Format a duration given in decimal hours as an "H:MM" string.

    The value is rounded to whole minutes first and only then split into
    hours and minutes, so a value such as 1.999 yields "2:00" rather than
    "1:60". Negative durations get a single leading minus sign ("-1:30").

    Parameters:
        hours_float (float): Duration in hours.

    Returns:
        str: The duration as "H:MM" (hours are not zero-padded).

    Raises:
        ValueError: If ``hours_float`` is NaN.
        OverflowError: If ``hours_float`` is infinite.
    """
    total_minutes = int(round(abs(hours_float) * 60))
    hours, minutes = divmod(total_minutes, 60)
    # Suppress the sign when the value rounds to zero minutes ("-0:00").
    sign = "-" if hours_float < 0 and total_minutes else ""
    return f"{sign}{hours}:{minutes:02d}"

In [None]:
def seconds_to_hhmm(total_seconds):
    """
    Format a duration given in seconds as an "H:MM" string.

    Accepts ints and floats (e.g. the output of ``.dt.total_seconds()``);
    fractional seconds and leftover seconds below a full minute are dropped.
    Negative durations are formatted from their magnitude with a leading
    minus sign, e.g. -60 -> "-0:01".

    Parameters:
        total_seconds (int | float): Duration in seconds.

    Returns:
        str: The duration as "H:MM" (hours are not zero-padded).

    Raises:
        ValueError: If ``total_seconds`` is NaN.
        OverflowError: If ``total_seconds`` is infinite.
    """
    # Work on a whole, non-negative second count so the 'd' format code
    # always receives ints, even when the input is a float.
    whole_seconds = int(abs(total_seconds))
    hours, remainder = divmod(whole_seconds, 3600)
    minutes = remainder // 60
    sign = "-" if total_seconds < 0 and (hours or minutes) else ""
    return f"{sign}{hours}:{minutes:02d}"

In [None]:
def handle_null_values(df, fill_str="", fill_float=0.0, fill_datetime=""):
    """
    Replace missing values column by column, choosing the filler by dtype.

    Columns are processed in three passes, each selecting columns with
    ``select_dtypes`` at the moment the pass starts:
      1. ``object`` columns: nulls -> ``fill_str``, then the column is cast to str.
      2. ``float64`` columns: nulls -> ``fill_float``.
      3. ``datetime64[ns]`` columns: nulls -> ``fill_datetime``.

    Parameters:
        df (pd.DataFrame): Frame to clean. It is modified in place.
        fill_str (str): Filler for object columns. Default "".
        fill_float (float): Filler for float64 columns. Default 0.0.
        fill_datetime: Filler for datetime64[ns] columns. Default "";
                       a datetime value may be passed instead.

    Returns:
        pd.DataFrame: The same ``df`` object after filling.
    """
    # (dtype selector, transformation) pairs, applied in this order.
    passes = (
        ('object', lambda series: series.fillna(fill_str).astype(str)),
        ('float64', lambda series: series.fillna(fill_float)),
        ('datetime64[ns]', lambda series: series.fillna(fill_datetime)),
    )

    for dtype_name, fill_nulls in passes:
        for name in df.select_dtypes(include=[dtype_name]).columns:
            df[name] = fill_nulls(df[name])

    return df

In [None]:
import pandas as pd

def get_dataframe_summary(df):
    """
    Build a per-column overview of ``df``.

    The result is indexed by the column names of ``df`` and has these columns:
      - Data Type: dtype of the column
      - Non Null Count: number of non-missing values
      - Null Count: number of missing values
      - Null Percentage: missing values as a percentage of the row count,
        rounded to two decimals
      - Unique Values: number of distinct non-missing values

    Side effect: sets the global pandas display options max_rows,
    max_columns and width so the summary renders untruncated.
    """
    display_overrides = (
        ('display.max_rows', None),
        ('display.max_columns', None),
        ('display.width', 1000),
    )
    for option_name, option_value in display_overrides:
        pd.set_option(option_name, option_value)

    # Count missing values once and reuse the result for the percentage.
    null_counts = df.isna().sum()

    summary_columns = {
        'Data Type': df.dtypes,
        'Non Null Count': df.count(),
        'Null Count': null_counts,
        'Null Percentage': (null_counts / len(df) * 100).round(2),
        'Unique Values': df.nunique(),
    }
    return pd.DataFrame(summary_columns)

In [None]:
# Root directory: eight levels above the current working directory.
BASE_DIR = (
    Path.cwd()
    .parent.parent.parent.parent
    .parent.parent.parent.parent
)
# All input workbooks live below this extract folder.
EXTRACT_DIR = BASE_DIR / "media" / "minpub" / "validator_report" / "extract"

# Inputs currently in use (CORTE 2; SGA reports 30-03-2025 to 04-04-2025).
SAVE_DIR_EXTRACT_EXCEL = EXTRACT_DIR / "excel" / "CORTE 2_20250410_194426.xlsx"
SAVE_DIR_EXTRACT_SGA_335 = EXTRACT_DIR / "sga_335" / "sga_reporte_30-03-2025_04-04-2025_20250410_194858.xlsx"
CID_CUISMP_PATH = EXTRACT_DIR / "sharepoint_cid_cuismp" / "MINPU - CID-CUISMP - AB.xlsx"
DIR_PARADAS_RELOJ = EXTRACT_DIR / "pausa_cliente" / "sga_reporte_30-03-2025_04-04-2025_20250410_195338.xlsx"

# Previous input set (CORTE 1), kept for reference:
# SAVE_DIR_EXTRACT_EXCEL = EXTRACT_DIR / "excel" / "CORTE 1 - 23.03.25 AL 30.03.25_20250407_140635.xlsx"
# SAVE_DIR_EXTRACT_SGA_335 = EXTRACT_DIR / "sga_335" / "sga_reporte_10-03-2025_16-03-2025_20250402_112253.xlsx"

In [None]:
# Load the "corte" workbook; skipfooter=2 drops the last two rows of the sheet.
df_corte_excel = pd.read_excel(SAVE_DIR_EXTRACT_EXCEL, skipfooter=2, engine="openpyxl")

In [None]:
# The summary is computed BEFORE CUISMP is normalised, so the table shown
# below describes the column as it was read from the workbook.
info_df_excel2 = get_dataframe_summary(df_corte_excel)
# Turn CUISMP into a decimal-free string (non-numeric values become '').
df_corte_excel = cut_decimal_part(df_corte_excel, 'CUISMP')

info_df_excel2

In [None]:
# Fill missing values using the default fillers, then re-summarise to inspect the result.
df_corte_excel = handle_null_values(df_corte_excel)
info_df_excel2 = get_dataframe_summary(df_corte_excel)
info_df_excel2

In [None]:
# Spot-check one ticket in the corte data.
# NOTE(review): the comparison uses an int literal, so it only matches if
# TICKET is still numeric at this point — confirm against the column dtype.
row = df_corte_excel[df_corte_excel['TICKET'] == 21784197 ]
row

In [None]:
# Load the SGA 335 report and summarise its columns as read from the workbook.
df_sga_dinamico_335 = pd.read_excel(SAVE_DIR_EXTRACT_SGA_335)
info_df_335 = get_dataframe_summary(df_sga_dinamico_335)
info_df_335
# Disabled spot-check of a single incident (compares nro_incidencia to an int):
#row = df_sga_dinamico_335[df_sga_dinamico_335['nro_incidencia'] == 21786971]
#row

In [None]:
# Preview the first row of the SGA 335 report.
df_sga_dinamico_335.head(1)

In [None]:
# Column summary of the SGA 335 report, stored under the name that later cells reuse.
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:
# Columns of the SGA 335 report that hold day-first timestamps.
SGA_335_DATETIME_COLUMNS = [
    'interrupcion_inicio',
    'interrupcion_fin',
    'fecha_comunicacion_cliente',
    'fecha_generacion',
    'fg_padre',
    'hora_sistema',
]

# Parse each column in place; values that cannot be parsed become NaT.
for datetime_column in SGA_335_DATETIME_COLUMNS:
    df_sga_dinamico_335[datetime_column] = pd.to_datetime(
        df_sga_dinamico_335[datetime_column],
        errors='coerce',
        dayfirst=True,
    )

In [None]:
# Fill missing values using the default fillers, then re-summarise to inspect the result.
df_sga_dinamico_335 = handle_null_values(df_sga_dinamico_335)
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:
# Add an "H:MM" rendering of tipificacion_interrupcion.
# NOTE(review): float_to_hhmm treats its input as decimal hours — confirm the column's unit.
df_sga_dinamico_335['tipificacion_interrupcion_hhmm'] = df_sga_dinamico_335['tipificacion_interrupcion'].apply(float_to_hhmm)


In [None]:
import numpy as np
# Expected start: fecha_generacion when masivo == "Si", interrupcion_inicio otherwise.
df_sga_dinamico_335['Expected_Inicio'] = np.where(df_sga_dinamico_335['masivo'] == "Si",
                                     df_sga_dinamico_335['fecha_generacion'],
                                     df_sga_dinamico_335['interrupcion_inicio'])
df_sga_dinamico_335.head(1)

In [None]:
# Re-summarise after the derived columns were added.
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:
def merge_sga_335_corte_excel(
    df_corte_excel: pd.DataFrame, 
    df_sga_dinamico_335: pd.DataFrame, 
) -> pd.DataFrame:
    """
    Common merge function for Objective 1.

    Left-joins the corte-excel data with sga_dinamico_335 on 'nro_incidencia'.
    The corte-excel 'TICKET' column is renamed to 'nro_incidencia', and the
    key is cast to str on both sides so that numeric and text keys compare
    equal. Neither input DataFrame is modified; the casts are applied to
    copies.

    Parameters:
        df_corte_excel (pd.DataFrame): Corte data; must contain 'TICKET'.
        df_sga_dinamico_335 (pd.DataFrame): SGA 335 data; must contain
            'nro_incidencia'.

    Returns:
        pd.DataFrame: One row per corte-excel row (more if the SGA key is
        duplicated). Columns present in both inputs are suffixed with
        '_corte_excel' and '_sga_dinamico_335'.

    Raises:
        KeyError: If a required key column is missing.
    """
    # rename() and assign() both return new frames, so the callers' frames
    # keep their original key dtype.
    corte_keyed = df_corte_excel.rename(columns={'TICKET': 'nro_incidencia'})
    corte_keyed = corte_keyed.assign(
        nro_incidencia=corte_keyed['nro_incidencia'].astype(str)
    )
    sga_keyed = df_sga_dinamico_335.assign(
        nro_incidencia=df_sga_dinamico_335['nro_incidencia'].astype(str)
    )

    merged_sga335_excel = pd.merge(
        corte_keyed,
        sga_keyed,
        on='nro_incidencia',
        how='left',
        suffixes=('_corte_excel', '_sga_dinamico_335')
    )

    return merged_sga335_excel

# Build the merged frame used by the validations below, plus its column summary.
df_merge_sga_335_corte_excel = merge_sga_335_corte_excel(df_corte_excel, df_sga_dinamico_335)
#df_merge_sga_335_corte_excel
info = get_dataframe_summary(df_merge_sga_335_corte_excel)
#info

In [None]:
# Display the merged frame.
# NOTE(review): display.max_rows is None, so this renders every row; consider .head().
df_merge_sga_335_corte_excel

In [None]:
# Spot-check one incident in the merged frame (the key is a string after the merge).
row = df_merge_sga_335_corte_excel[df_merge_sga_335_corte_excel['nro_incidencia'] == "21784197" ]
row

In [None]:

def _trim_zero_seconds(value):
    """Drop a trailing ':00' seconds field from an 'H:MM:00' string; return anything else unchanged."""
    if isinstance(value, str) and value.count(':') == 2 and value.endswith(':00'):
        return value.rsplit(':', 1)[0]
    return value


def _hhmm_to_seconds(hhmm):
    """Convert 'H:MM' or 'H:MM:SS' text to a number of seconds; return NaN when it cannot be parsed."""
    if pd.isna(hhmm):
        return np.nan
    text = str(hhmm).strip()
    sign = -1 if text.startswith('-') else 1
    parts = text.lstrip('+-').split(':')
    if len(parts) not in (2, 3):
        return np.nan
    try:
        hours, minutes, *rest = (int(part) for part in parts)
    except ValueError:
        return np.nan
    seconds = rest[0] if rest else 0
    return sign * (hours * 3600 + minutes * 60 + seconds)


def validation_fin_inicio_HHMM(merged_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-check the interruption duration between SGA 335, the corte
    timestamps and the corte 'FIN-INICIO (HH:MM)' column.

    Works on a copy of ``merged_df`` and adds these columns:
      - start_335: fecha_generacion when masivo == "Si", else interrupcion_inicio
      - diff_335_sec: interrupcion_fin - start_335, in seconds
      - diff_corte_sec: 'FECHA Y HORA FIN' - 'FECHA Y HORA INICIO', in seconds
      - 'FIN-INICIO (HH:MM)_trimed': the HH:MM text without a ':00' seconds field
      - fin_inicio_hhmm_column_corte_to_seconds: that text converted to
        seconds (NaN when it cannot be parsed)
      - non_negative_*: each of the three durations is >= 0
      - match_335_corte, match_corte_fin_inicio_hhmm_column: the durations
        agree within a tolerance of 1e-9 seconds
      - Validation_OK: every check passed
      - fail_count: number of failed checks (0-5)

    A missing duration compares as False, so it counts as a failed check.
    All three durations are expressed in seconds, so they are directly
    comparable.

    Parameters:
        merged_df (pd.DataFrame): Output of the corte/SGA 335 merge. The
            four timestamp columns must be datetime-typed.

    Returns:
        pd.DataFrame: Copy of ``merged_df`` with the columns above added.
    """
    df = merged_df.copy()

    df['start_335'] = np.where(
        df['masivo'] == "Si",
        df['fecha_generacion'],
        df['interrupcion_inicio'],
    )

    df['diff_335_sec'] = (df['interrupcion_fin'] - df['start_335']).dt.total_seconds()
    df['diff_corte_sec'] = (df['FECHA Y HORA FIN'] - df['FECHA Y HORA INICIO']).dt.total_seconds()

    df['FIN-INICIO (HH:MM)_trimed'] = df['FIN-INICIO (HH:MM)'].apply(_trim_zero_seconds)
    df['fin_inicio_hhmm_column_corte_to_seconds'] = (
        df['FIN-INICIO (HH:MM)_trimed'].apply(_hhmm_to_seconds)
    )

    df['non_negative_335'] = df['diff_335_sec'] >= 0
    df['non_negative_corte'] = df['diff_corte_sec'] >= 0
    df['non_negative_fin_inicio_column_corte_hhmm_to_seconds'] = (
        df['fin_inicio_hhmm_column_corte_to_seconds'] >= 0
    )

    tolerance = 1e-9

    df['match_335_corte'] = (df['diff_335_sec'] - df['diff_corte_sec']).abs() < tolerance
    df['match_corte_fin_inicio_hhmm_column'] = (
        (df['diff_corte_sec'] - df['fin_inicio_hhmm_column_corte_to_seconds']).abs() < tolerance
    )

    checks = [
        'non_negative_335',
        'non_negative_corte',
        'non_negative_fin_inicio_column_corte_hhmm_to_seconds',
        'match_335_corte',
        'match_corte_fin_inicio_hhmm_column',
    ]
    passed = df[checks]

    df['Validation_OK'] = passed.all(axis=1)
    df['fail_count'] = (~passed).sum(axis=1)
    return df

# Run the duration validation on the merged frame.
df_validation_fecha_inicio_fin_HHMM = validation_fin_inicio_HHMM(df_merge_sga_335_corte_excel)
# Not the last expression of the cell, so this preview is computed but not displayed.
df_validation_fecha_inicio_fin_HHMM.head(1)
# Spot-check one incident in the validation output.
row = df_validation_fecha_inicio_fin_HHMM[df_validation_fecha_inicio_fin_HHMM['nro_incidencia'] == '21784197']
row
