In [None]:
import pandas as pd
from pathlib import Path
# Notebook-wide pandas display settings: no row/column/cell-width truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_info_columns', 10000)
pd.set_option('display.width', 1000)
# Floats are rendered with two decimals in displayed output.
pd.set_option('display.float_format', '{:.2f}'.format)


import seaborn as sns
%matplotlib inline
import re

In [None]:
def cut_decimal_part(df, column):
    """
    Replace a column with the integer part of its values, rendered as text.

    Values are first coerced with ``pd.to_numeric(errors='coerce')``; the
    fractional part is then truncated toward zero (13.5 -> "13", 12.0 -> "12").
    Anything that has no integer representation (non-numeric input, NaN,
    +/-infinity) becomes an empty string.

    Parameters:
        df (pd.DataFrame): Frame holding the column. It is modified in place.
        column: Label of the column to convert.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``column`` replaced.
    """
    import math  # local import so the cell does not depend on another cell's imports

    numeric = pd.to_numeric(df[column], errors='coerce')

    def _as_whole_number_text(value):
        # int() raises on NaN and on +/-inf, so both are mapped to '' up front.
        if pd.isnull(value) or not math.isfinite(value):
            return ''
        return str(int(value))

    df[column] = numeric.apply(_as_whole_number_text)

    return df

In [None]:
def float_to_hhmm(hours_float):
    """
    Format a duration given in fractional hours as "H:MM".

    The value is rounded to the nearest whole minute before being split into
    hours and minutes, so a fraction that rounds up to 60 minutes carries into
    the hour (1.999 -> "2:00", never "1:60"). Negative durations are rendered
    with a single leading minus sign ("-1:30").

    Parameters:
        hours_float: Duration in hours (int or float).

    Returns:
        str: The duration as "H:MM"; hours are not zero-padded.
    """
    total_minutes = int(round(abs(hours_float) * 60))
    hours, minutes = divmod(total_minutes, 60)
    # No sign when the value rounds to zero, to avoid printing "-0:00".
    sign = "-" if hours_float < 0 and total_minutes else ""
    return f"{sign}{hours}:{minutes:02d}"

In [None]:
def seconds_to_hhmm(total_seconds):
    """
    Format a duration given in seconds as "H:MM".

    Leftover seconds are dropped (not rounded). Float input such as the result
    of ``timedelta.total_seconds()`` is accepted: it is truncated to whole
    seconds first, because the ``:02d`` format only works on integers.
    Negative durations are rendered with a single leading minus sign.

    Parameters:
        total_seconds: Duration in seconds (int or float).

    Returns:
        str: The duration as "H:MM"; hours are not zero-padded.
    """
    whole_seconds = int(abs(total_seconds))
    hours, remainder = divmod(whole_seconds, 3600)
    minutes = remainder // 60
    # No sign when the value truncates to zero minutes, to avoid "-0:00".
    sign = "-" if total_seconds < 0 and (hours or minutes) else ""
    return f"{sign}{hours}:{minutes:02d}"

In [None]:
def handle_null_values(df, fill_str="", fill_float=0.0, fill_datetime=""):
    """
    Replace missing values column by column, picking the filler from the dtype.

    The frame is modified in place and the same object is returned.

    Parameters:
        df (pd.DataFrame): Frame to clean.
        fill_str (str): Filler for ``object`` columns; after filling, every
            value of those columns is cast to ``str``. Default "".
        fill_float (float): Filler for ``float64`` columns. Default 0.0.
        fill_datetime: Filler for ``datetime64[ns]`` columns. Default "";
            a datetime value may be passed instead.

    Returns:
        pd.DataFrame: ``df`` itself, with the nulls of the three dtype groups
        filled. Columns of any other dtype are left untouched.
    """
    # Text-like columns: fill, then force every value to str.
    for name in df.select_dtypes(include=['object']).columns:
        filled = df[name].fillna(fill_str)
        df[name] = filled.astype(str)

    # 64-bit float columns.
    for name in df.select_dtypes(include=['float64']).columns:
        df[name] = df[name].fillna(fill_float)

    # Nanosecond-resolution datetime columns.
    for name in df.select_dtypes(include=['datetime64[ns]']).columns:
        df[name] = df[name].fillna(fill_datetime)

    return df

In [None]:
import pandas as pd

def get_dataframe_summary(df):
    """
    Build a per-column overview of a DataFrame.

    The result has one row per column of ``df`` and these columns:
      - Data Type
      - Non Null Count
      - Null Count
      - Null Percentage (share of nulls over ``len(df)``, rounded to 2 decimals)
      - Unique Values (``nunique()`` of the column)

    Side effect: widens the global pandas display options so the summary is
    shown without truncation.
    """
    for option, value in (
        ('display.max_rows', None),
        ('display.max_columns', None),
        ('display.width', 1000),
    ):
        pd.set_option(option, value)

    null_counts = df.isna().sum()
    null_share = (null_counts / len(df) * 100).round(2)
    distinct_counts = [df[label].nunique() for label in df.columns]

    summary_columns = {
        'Data Type': df.dtypes,
        'Non Null Count': df.count(),
        'Null Count': null_counts,
        'Null Percentage': null_share,
        'Unique Values': distinct_counts,
    }

    return pd.DataFrame(summary_columns)

In [None]:
# Base directory: eight levels above the current working directory.
BASE_DIR = Path.cwd().parent.parent.parent.parent.parent.parent.parent.parent

# Input workbooks, all under media/minpub/validator_report/extract/<source>/.
SAVE_DIR_EXTRACT_EXCEL = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "excel",
    "CORTE 2_20250410_194426.xlsx",
)
SAVE_DIR_EXTRACT_SGA_335 = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "sga_335",
    "sga_reporte_30-03-2025_06-04-2025_20250410_173936.xlsx",
)
CID_CUISMP_PATH = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "sharepoint_cid_cuismp",
    "MINPU - CID-CUISMP - AB.xlsx",
)
DIR_PARADAS_RELOJ = BASE_DIR.joinpath(
    "media", "minpub", "validator_report", "extract", "pausa_cliente",
    "sga_reporte_30-03-2025_04-04-2025_20250410_195338.xlsx",
)

In [None]:
# Load the CORTE workbook; skipfooter=2 leaves out the last two rows of the sheet.
df_corte_excel = pd.read_excel(SAVE_DIR_EXTRACT_EXCEL, skipfooter=2, engine="openpyxl")

In [None]:
# Column summary of the raw CORTE data (kept in a variable, not displayed here).
info_df_excel2 = get_dataframe_summary(df_corte_excel)
# Turn CUISMP into text holding only the integer part of each value.
df_corte_excel = cut_decimal_part(df_corte_excel, 'CUISMP')

#info_df_excel2

In [None]:
# Fill nulls by dtype (handle_null_values works in place) and show the refreshed summary.
df_corte_excel = handle_null_values(df_corte_excel)
info_df_excel2 = get_dataframe_summary(df_corte_excel)
info_df_excel2
#df_corte_excel.shape

In [None]:
# Spot check: look up one ticket by its numeric id (TICKET is compared as a number here).
row = df_corte_excel[df_corte_excel['TICKET'] == 21784197 ]
row

In [None]:
# Load the SGA 335 report and build its column summary.
df_sga_dinamico_335 = pd.read_excel(SAVE_DIR_EXTRACT_SGA_335)
info_df_335 = get_dataframe_summary(df_sga_dinamico_335)
# Bare expression in the middle of a cell: evaluated, but Jupyter only renders the last one.
info_df_335
# Spot check one incident by numeric id; this is the value the cell displays.
row = df_sga_dinamico_335[df_sga_dinamico_335['nro_incidencia'] == 21789943]
row

In [None]:
#df_sga_dinamico_335.head(1)

In [None]:
# Column summary of the 335 report as loaded (before date parsing and null filling).
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:
# Parse the report's date columns as day-first datetimes; unparseable values become NaT.
for _date_column in (
    'interrupcion_inicio',
    'interrupcion_fin',
    'fecha_comunicacion_cliente',
    'fecha_generacion',
    'fg_padre',
    'hora_sistema',
):
    df_sga_dinamico_335[_date_column] = pd.to_datetime(
        df_sga_dinamico_335[_date_column], errors='coerce', dayfirst=True
    )


In [None]:
# Fill nulls by dtype; note that datetime columns receive the default "" filler.
df_sga_dinamico_335 = handle_null_values(df_sga_dinamico_335)

In [None]:
# Summary after date parsing and null filling.
info_sga_dinamico_335 = get_dataframe_summary(df_sga_dinamico_335)
info_sga_dinamico_335

In [None]:
#df_sga_dinamico_335['tipificacion_interrupcion_hhmm'] = df_sga_dinamico_335['tipificacion_interrupcion'].apply(float_to_hhmm)
# Spot check one incident by numeric id after cleaning.
row = df_sga_dinamico_335[df_sga_dinamico_335['nro_incidencia'] == 21789943 ]
row


In [None]:
# Load the clock-stop ("paradas de reloj") report and summarise its columns.
df_paradas_reloj = pd.read_excel(DIR_PARADAS_RELOJ)
info_parada_reloj = get_dataframe_summary(df_paradas_reloj)
info_parada_reloj


In [None]:
# Peek at the first rows before the date columns are parsed.
df_paradas_reloj.head(3)

In [None]:
# Parse the clock-stop boundaries as day-first datetimes; unparseable values become NaT.
for _date_column in ('startdate', 'enddate'):
    df_paradas_reloj[_date_column] = pd.to_datetime(
        df_paradas_reloj[_date_column], errors="coerce", dayfirst=True
    )

In [None]:
# Summary after parsing startdate/enddate.
info_parada_reloj = get_dataframe_summary(df_paradas_reloj)
info_parada_reloj

In [None]:
# Peek at the first rows after the date columns were parsed.
df_paradas_reloj.head(3)

In [None]:
from typing import List, Dict
from datetime import datetime
import sys
import os

# Make the directory eight levels above the notebook's working directory importable.
notebook_dir = os.getcwd()
project_root = os.path.join(notebook_dir, '..','..','..','..','..','..','..','..')
project_root_abs = os.path.abspath(project_root)
# Guarded so that re-running the cell does not stack duplicate sys.path entries.
if project_root_abs not in sys.path:
    sys.path.append(project_root_abs)


print("notebook dir:", notebook_dir)
print("project root:", project_root)
print("absolute project root:", project_root_abs)
# Label corrected: this line prints sys.path, not the notebook directory.
print("sys.path:", sys.path)

In [None]:
from typing import List, Dict
from datetime import datetime
import sys
import os

# Add the notebook's parent directory to the import path.
notebook_dir = os.getcwd()
project_root = os.path.join(notebook_dir, '..')
sys.path.append(os.path.abspath(project_root))

# NOTE(review): this appends an empty string to sys.path — looks like a placeholder
# for a path that was removed; confirm whether it is still needed.
sys.path.append(r"")

# Project-local import; presumably resolvable only through the sys.path entries added
# above — TODO confirm which entry provides the `utils` package.
from utils.logger_config import get_sga_logger
 
# Logger used by the clock-stop helpers defined in the following cells.
logger = get_sga_logger()

def resolve_clock_stop_overlaps(clock_stops: List[Dict]) -> List[Dict]:
    """
    Eliminate overlaps in clock stops (paradas de reloj), incident by incident.

    Stops are grouped by 'nro_incidencia' (stops without that key fall into an
    'unknown' group), sorted by 'start', and merged whenever a stop starts at or
    before the end of the previous merged stop. A stop with a missing 'end' is
    closed at the start of the next stop of the same incident; if there is no
    next stop it is logged and discarded.

    The input dictionaries are never modified: the function works on shallow
    copies, so the returned stops are new dictionaries.

    Args:
        clock_stops: List of dicts with 'start' and 'end' datetimes and
            'nro_incidencia'.

    Returns:
        List of non-overlapping clock stops, grouped by incident in order of
        first appearance and sorted by 'start' within each incident.
    """
    if not clock_stops:
        return []

    # Group shallow copies so neither the end-date back-fill nor the merge below
    # writes into the caller's dictionaries.
    incidents: Dict = {}
    for stop in clock_stops:
        nro_incidencia = stop.get('nro_incidencia', 'unknown')
        incidents.setdefault(nro_incidencia, []).append(dict(stop))

    resolved_all = []

    for nro_incidencia, incident_stops in incidents.items():
        sorted_stops = sorted(incident_stops, key=lambda x: x['start'])

        # Close open-ended stops at the start of the following stop, when there is one.
        for i, stop in enumerate(sorted_stops):
            if not pd.isna(stop['end']):
                continue
            has_next = i < len(sorted_stops) - 1
            if has_next and not pd.isna(sorted_stops[i + 1]['start']):
                stop['end'] = sorted_stops[i + 1]['start']
            else:
                logger.warning(f"Removing stop with missing end date for nro_incidencia {nro_incidencia}")

        # Stops that are still open-ended are the ones reported above; drop them.
        valid_stops = [stop for stop in sorted_stops if not pd.isna(stop['end'])]

        if not valid_stops:
            continue

        resolved_stops = [valid_stops[0]]

        for current_stop in valid_stops[1:]:
            last_resolved = resolved_stops[-1]

            if current_stop['start'] <= last_resolved['end']:
                # Overlapping or touching: extend the previous stop instead of adding a new one.
                last_resolved['end'] = max(last_resolved['end'], current_stop['end'])
            else:
                resolved_stops.append(current_stop)

        resolved_all.extend(resolved_stops)

    return resolved_all


In [None]:

def calculate_total_clock_stop_minutes(
    nro_incidencia:str,
    interruption_start: datetime,
    interruption_end: datetime,
    df_sga_paradas: pd.DataFrame
) -> float:
    """
    Calculate the total clock-stop minutes for a ticket, clipped to its interruption window.

    Each clock stop of the incident is clipped to
    [interruption_start, interruption_end]; stops that end up empty after
    clipping are ignored, overlapping stops are merged with
    resolve_clock_stop_overlaps, and the remaining durations are summed.

    Args:
        nro_incidencia: The ticket identifier. It is compared as text against
            the 'nro_incidencia' column of df_sga_paradas.
        interruption_start: Start time of the interruption from REPORTE DINAMICO 335
        interruption_end: End time of the interruption from REPORTE DINAMICO 335
        df_sga_paradas: Clock stops with 'nro_incidencia', 'startdate' and
            'enddate' columns. The frame is not modified.

    Returns:
        Total clock stop minutes (0.0 when the incident has no clock stops)
    """
    # Compare as text WITHOUT writing the converted column back: this function is
    # called once per ticket, and the caller's frame must keep its original dtype.
    incident_key = str(nro_incidencia)
    is_incident = df_sga_paradas['nro_incidencia'].astype(str) == incident_key
    nro_incidencia_stops = df_sga_paradas[is_incident]

    if nro_incidencia_stops.empty:
        logger.info(f"No clock stops found for incident {nro_incidencia}")
        return 0.0

    clock_stops = []

    for _, stop in nro_incidencia_stops.iterrows():
        start_date = stop.get('startdate')
        end_date = stop.get('enddate')

        if pd.isna(start_date):
            logger.warning(f"Skipping record with missing start date for incident {nro_incidencia}")
            continue

        # Clip the stop to the interruption window. Comparisons against a missing
        # (NaT) boundary are False, so such a boundary leaves the stop unclipped.
        if start_date < interruption_start:
            logger.info(f"Adjusting start time to interruption start for incident {nro_incidencia}")
            start_date = interruption_start

        if not pd.isna(end_date):
            if end_date > interruption_end:
                logger.info(f"Adjusting end time to interruption end for incident {nro_incidencia}")
                end_date = interruption_end

            if not start_date < end_date:
                # Nothing of this stop is left inside the window.
                continue

        # Open-ended stops (missing end) are kept; resolve_clock_stop_overlaps
        # either closes them at the next stop's start or discards them.
        clock_stops.append({
            'start': start_date,
            'end': end_date,
            'nro_incidencia': nro_incidencia
        })

    resolved_stops = resolve_clock_stop_overlaps(clock_stops)

    total_minutes = sum(
        (stop['end'] - stop['start']).total_seconds() / 60
        for stop in resolved_stops
        if not pd.isna(stop['end']) and not pd.isna(stop['start'])
    )
    return float(total_minutes)

In [None]:
def merge_objetivos_tiempo(
        df_corte_excel: pd.DataFrame,
        df_sga_dinamico_335: pd.DataFrame,
        df_sga_paradas: pd.DataFrame
) -> pd.DataFrame:
    """
    Build the frame used for the TIEMPO (HH:MM) validation.

    Steps:
        - CORTE-EXCEL ('TICKET' renamed to 'nro_incidencia') is left-merged with
          the SGA 335 report on 'nro_incidencia', compared as text.
        - For every merged row, the clock-stop ("paradas de reloj") minutes inside
          the row's interruption window are added as 'sum_paradas'.

    Neither df_corte_excel nor df_sga_dinamico_335 is modified here: the key
    columns are cast on copies, so their dtypes stay as the caller left them.

    Args:
        df_corte_excel: CORTE-EXCEL rows, with a 'TICKET' column.
        df_sga_dinamico_335: SGA 335 report, with 'nro_incidencia',
            'interrupcion_inicio' and 'interrupcion_fin' columns.
        df_sga_paradas: Clock stops passed through to
            calculate_total_clock_stop_minutes.

    Returns:
        The merged DataFrame, including the '_merge' indicator column
        ('both' / 'left_only') and 'sum_paradas'.
    """
    corte = df_corte_excel.rename(columns={'TICKET':'nro_incidencia'})
    corte = corte.assign(nro_incidencia=corte['nro_incidencia'].astype(str))

    # assign() returns a new frame, leaving the caller's 335 report untouched.
    sga_335 = df_sga_dinamico_335.assign(
        nro_incidencia=df_sga_dinamico_335['nro_incidencia'].astype(str)
    )

    merged_sga_335_excel = pd.merge(
        corte,
        sga_335,
        on='nro_incidencia',
        how='left',
        indicator=True,
        suffixes=('_corte_excel', '_sga_dinamico_335')
    )

    merged_sga_335_excel['sum_paradas'] = merged_sga_335_excel.apply(
        lambda r: calculate_total_clock_stop_minutes(
            nro_incidencia = r["nro_incidencia"],
            interruption_start = r["interrupcion_inicio"],
            interruption_end = r["interrupcion_fin"],
            df_sga_paradas = df_sga_paradas
        ),
        axis= 1
    )
    return merged_sga_335_excel

df_merge_sga_335_corte_excel_paradas = merge_objetivos_tiempo(df_corte_excel, df_sga_dinamico_335, df_paradas_reloj)
df_merge_sga_335_corte_excel_paradas
# With indicator=True on a left merge, CORTE rows that found no match in the 335 report
# are tagged 'left_only' ('both' marks the rows that DID match).
unmatched_rows = df_merge_sga_335_corte_excel_paradas[df_merge_sga_335_corte_excel_paradas['_merge'] == 'left_only']
unmatched_rows
# handle_null_values fills in place, so both names below refer to the same, already
# filled, frame.
df_merge_sga_335_corte_excel_paradas_handleded_null = handle_null_values(df_merge_sga_335_corte_excel_paradas)
df_merge_sga_335_corte_excel_paradas_handleded_null
unmatched_rows['codincidencepadre'].dtype
# Only this last expression is rendered by Jupyter.
df_sga_dinamico_335['codincidencepadre'].dtype

In [None]:
# Column summary of the merged, null-filled frame.
info_objetivo_14 = get_dataframe_summary(df_merge_sga_335_corte_excel_paradas_handleded_null)
info_objetivo_14

In [None]:

import numpy as np

def subvalidation_tiempo_HHMM_paradas_cliente(df_merged: pd.DataFrame) -> pd.DataFrame:
    """
    Validatess  the 'TIEMPO (HH:MM)' column CORTE-EXCEL by comparing:
    - (interruppcion_fin - interrupcion) - sum(paradas)
    vs.
    - The parsed minutes of 'TIEMPO(HH:MM)'.
    Returns a Dataframe with boolean flags and 'fail_count'. 
    """

    df = df_merged.copy()
    
    df['Expected_Inicio'] = np.where(df['masivo'] == "Si",
                                     df['fecha_generacion'],
                                     df['interrupcion_inicio'])
    
    df['Expected_Inicio_trimmed'] = df['Expected_Inicio'].apply(lambda x: x.replace(second=0))
    df['interrupcion_fin_trimmed'] = df['interrupcion_fin'].apply(lambda x: x.replace(second=0))

    df['diff_335_min'] = (
        (df['interrupcion_fin_trimmed'] - df['Expected_Inicio_trimmed'])
        .dt.total_seconds()/60
    )

    def parse_hhmm_to_minutes(hhmm_str):
        if pd.isna(hhmm_str):
            return np.nan
        try:
            h,m = str(hhmm_str).split(':')
            total_minutes = float(h)*60 + float(m)
            print(f"Converted {hhmm_str} to {total_minutes} minutes")
            return total_minutes
        except Exception as e:
            print(f"Error with {hhmm_str}: {e}")
            return np.nan
        
    df['TIEMPO (HH:MM)_trimed'] = df['TIEMPO (HH:MM)'].apply(
        lambda x: str(x)[:5] if isinstance(x, str) and x.endswith(":00") else x
    )
    
    
    df['tiempo_corte_min'] = df['TIEMPO (HH:MM)_trimed'].apply(parse_hhmm_to_minutes)
    
    df['effective_time_335'] = df['diff_335_min'] - df['sum_paradas']

    df['non_negative_335'] = df['diff_335_min'] >= 0
    df['non_negative_paradas'] = df['sum_paradas'] >= 0
    df['non_negative_effective'] = df['effective_time_335'] >= 0

    tolerance = 1e-9

    df['match_corte'] = (
        (df['tiempo_corte_min'] - df['effective_time_335'])
        .abs() < tolerance
    )

    df['Validation_OK'] = (
        df['non_negative_335'] &
        df['non_negative_paradas'] &
        df['non_negative_effective'] &
        df['match_corte']
    )

    df['fail_count'] = (
        (~df['non_negative_335']).astype(int) + 
        (~df['non_negative_paradas']).astype(int) + 
        (~df['non_negative_effective']).astype(int) +
        (~df['match_corte']).astype(int)
    
    )

    return df

# Run the TIEMPO (HH:MM) checks on the merged frame.
df_validation_paradas = subvalidation_tiempo_HHMM_paradas_cliente(df_merge_sga_335_corte_excel_paradas)
#df_validation_paradas.head(1)
# Spot check one incident; nro_incidencia is compared as text here.
row = df_validation_paradas[df_validation_paradas['nro_incidencia'] == "21793172" ]
row

In [None]:
# Column summary of the validation result (kept in a variable, not displayed).
info_valida_paradas = get_dataframe_summary(df_validation_paradas)
#info_valida_paradas


In [None]:

def buid_failure_messages_tiempo_HHMM_paradas_cliente(df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds a descriptive message for the 'TIEMPO (HH:MM)' validation.

    For every row the message is the concatenation of one sentence per failed
    check, or a success text when 'Validation_OK' is true. The messages are
    assembled with plain Python strings, so the function does not depend on
    numpy supporting '+' between string arrays.

    Args:
        df: Output of the TIEMPO (HH:MM) sub-validation. It must contain the
            flag columns, 'tiempo_corte_min', 'effective_time_335',
            'fail_count', 'nro_incidencia' and 'TIPO REPORTE'. Not modified.

    Returns:
        The rows that fail any check (fail_count > 0) with columns:
        - 'nro_incidencia'
        - 'mensaje'
        - 'TIPO REPORTE'
        - 'objetivo' (always 1.4)
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    corte_text = df['tiempo_corte_min'].astype(str)
    efectivo_text = df['effective_time_335'].astype(str)

    mensajes = []
    for ok, ok_335, ok_paradas, ok_effective, ok_match, corte, efectivo in zip(
        df['Validation_OK'],
        df['non_negative_335'],
        df['non_negative_paradas'],
        df['non_negative_effective'],
        df['match_corte'],
        corte_text,
        efectivo_text,
    ):
        if ok:
            mensajes.append("Validation de TIEMPO (HH:MM) exitosa")
            continue

        parts = []
        if not ok_335:
            parts.append("INTERRUPCION_FIN - INTERRUPCION_INICIO es negativo. ")
        if not ok_paradas:
            parts.append("Suma de paradas de reloj es negativa. ")
        if not ok_effective:
            parts.append("Tiempo efectivo (INTERRUPCION - paradas) es negativo.")
        if not ok_match:
            parts.append(
                "EL TIEMPO (HH:MM): " + corte +
                " de CORTE EXCEL  no coincide con el tiempo efectivo calculado SGA: " + efectivo
            )
        mensajes.append("".join(parts))

    df['mensaje'] = mensajes
    df['objetivo'] = 1.4

    df_failures = df[df['fail_count'] > 0 ]
    return df_failures[['nro_incidencia', 'mensaje', 'TIPO REPORTE','objetivo']]
    
# Failure messages for the rows that did not pass the TIEMPO (HH:MM) validation.
df_mensaje_paradas = buid_failure_messages_tiempo_HHMM_paradas_cliente(df_validation_paradas)
df_mensaje_paradas

In [None]:
#row = df_merge_sga_335_corte_excel[df_merge_sga_335_corte_excel['nro_incidencia'] == "21784197" ]

