In [None]:
import pandas as pd
import os

# Relative folder names used to build file paths in the cells below.
# NOTE(review): input_dir is not referenced by any visible cell — confirm it is still needed.
input_dir = "00_Input"
output_dir = "01_output"

In [None]:
# Read the calibration sample workbook into its own DataFrame.
# NOTE(review): the workbook is read from output_dir, not input_dir —
# presumably it is produced by an earlier step; confirm.
calib_path = os.path.join(output_dir, "NETECH_CAMPIONE_CALIB.xlsx")
df_CALIB = pd.read_excel(calib_path)

In [None]:
# Normalise the grouping key: cast to text, then trim surrounding whitespace.
# Note: astype(str) turns missing values into the literal string "nan".
causale_text = df_CALIB['CAUSALE ANALITICA'].astype(str)
df_CALIB['CAUSALE ANALITICA'] = causale_text.str.strip()


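Apply the perimeter filters: keep only scenarios in use (`flg_SCEN_IN_USO == 1`) whose `DESCRIZIONE` is "Operazioni con paese controparte ad altissimo rischio".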

In [None]:
# Rows are dropped when flg_SCEN_IN_USO != 1 (scenario not in use).
# The shape is printed after each step to show how many rows each filter removes.

print(df_CALIB.shape)

df_CALIB_ = df_CALIB[df_CALIB['flg_SCEN_IN_USO'] == 1]
print(df_CALIB_.shape)

# Keep only the scenario under analysis. The trailing .copy() makes df_CALIB_
# an independent frame: later cells assign into it (IMPORTO cast, flag_stato
# fill), and assigning into the result of boolean filtering triggers pandas'
# SettingWithCopyWarning.
df_CALIB_ = df_CALIB_[df_CALIB_["DESCRIZIONE"] == "Operazioni con paese controparte ad altissimo rischio"].copy()
print(df_CALIB_.shape)

Cast the `IMPORTO` column to float

In [None]:
# Cast IMPORTO to float. Rebinding df_CALIB_ to the frame returned by astype()
# avoids assigning a column into a frame that came out of boolean filtering,
# which triggers pandas' SettingWithCopyWarning.
df_CALIB_ = df_CALIB_.astype({"IMPORTO": float})

In [None]:
# Count the rows per flag_stato value; dropna=False lists missing values as their own row.
df_CALIB_["flag_stato"].value_counts(dropna=False)

Assume that a missing `flag_stato` means an open alert (set it to 1)

In [None]:
# Work on an explicit copy: df_CALIB_ was produced by boolean filtering, and
# assigning into such a frame (even through .loc) triggers pandas'
# SettingWithCopyWarning.
df_CALIB_ = df_CALIB_.copy()

# A missing flag_stato is treated as an open alert, i.e. set to 1.
df_CALIB_.loc[df_CALIB_["flag_stato"].isna(), "flag_stato"] = 1

# Re-display the distribution to confirm no missing values remain.
df_CALIB_["flag_stato"].value_counts(dropna=False)

Filter for false positives (`flag_stato == 0`)

In [None]:
# Keep only the rows whose flag_stato equals 0 and report how many remain.
is_false_positive = df_CALIB_['flag_stato'] == 0
df_CALIB_FP = df_CALIB_[is_false_positive]
print(df_CALIB_FP.shape)

In [None]:
from typing import List

def _pct_labels(percentiles: List[float]) -> List[str]:
    """Helper to build column labels like Percentile_00, Percentile_05, , Percentile_100."""
    return [f"Percentile_{int(round(q * 100)):02d}" for q in percentiles]

def compute_group_percentiles(
    df: pd.DataFrame,
    value_col: str,
    group_cols: List[str],
    percentiles: List[float],
) -> pd.DataFrame:
    """
    Compute specified percentiles of `value_col` within each group in `group_cols`.

    Parameters
    ----------
    df : input frame; it is not modified.
    value_col : column holding the values. It is coerced with
        `pd.to_numeric(errors="coerce")`, so unparseable entries become NaN.
    group_cols : list of column names that define the groups.
    percentiles : quantile levels as fractions (e.g. 0.05 for the 5th percentile).
        Repeated levels are computed once.

    Returns
    -------
    A DataFrame with one row per group: the group columns followed by one
    column per percentile, labelled by `_pct_labels` (Percentile_00, Percentile_05, ...).
    """
    # Drop repeated levels but keep their order: duplicates would produce
    # duplicate (group, quantile) index entries that `unstack` cannot reshape.
    levels = list(dict.fromkeys(percentiles))

    # Coerce only the value column instead of copying the whole frame.
    values = pd.to_numeric(df[value_col], errors="coerce")
    group_keys = [df[col] for col in group_cols]

    # group-by quantiles → MultiIndex (group, quantile level) → unstack to columns
    q_tbl = (
        values.groupby(group_keys)
              .quantile(levels)
              .unstack()  # columns are the quantile levels (floats)
              .reset_index()
    )

    # Rename by quantile value, not by column position, so each label is bound
    # to its own level whatever order `unstack` lays the columns out in.
    rename_map = dict(zip(levels, _pct_labels(levels)))
    return q_tbl.rename(columns=rename_map)


def apply_group_percentile_flags(
    df: pd.DataFrame,
    percentiles_df: pd.DataFrame,
    value_col: str,
    group_cols: List[str],
    flag_operator: str = ">=",
) -> pd.DataFrame:
    """
    Merge group percentile cutoffs into `df` and add 1/0 flags for each cutoff.

    For every cutoff column `Percentile_XX` of `percentiles_df`, a column
    `flag_Percentile_XX` is added: 1 if `value_col` {operator} cutoff (per
    row's group), else 0.

    Parameters
    ----------
    df : rows to flag; it is not modified.
    percentiles_df : one row per group, holding `group_cols` plus cutoff
        columns whose names start with "Percentile_".
    value_col : column compared against the cutoffs. It is coerced with
        `pd.to_numeric(errors="coerce")`.
    group_cols : list of column names used as the merge key.
    flag_operator : ">=" (default) or ">".

    Returns
    -------
    A copy of `df` (rows in the same order, index reset by the merge) with the
    cutoff columns and their flag columns appended.

    Raises
    ------
    ValueError : `flag_operator` is neither '>' nor '>='.
    pandas.errors.MergeError : `percentiles_df` has more than one row for a
        group, which would otherwise silently duplicate rows of `df`.

    Notes
    -----
    A row gets flag 0 when its value is NaN or when its group has no row in
    `percentiles_df` (the left merge yields a NaN cutoff, and comparisons with
    NaN are False).
    """
    # Validate first so a bad operator fails before any copying or merging.
    comparators = {">=": pd.Series.ge, ">": pd.Series.gt}
    if flag_operator not in comparators:
        raise ValueError("flag_operator must be '>' or '>='")
    compare = comparators[flag_operator]

    # Take the cutoff columns from the cutoff table itself. Scanning the merged
    # frame would crash on non-string column labels (e.g. integer headers) and
    # would pick up unrelated "Percentile_*" columns already present in `df`.
    pct_cols = [
        c for c in percentiles_df.columns
        if c not in group_cols and isinstance(c, str) and c.startswith("Percentile_")
    ]
    flag_cols = [f"flag_{c}" for c in pct_cols]

    out = df.copy()
    out[value_col] = pd.to_numeric(out[value_col], errors="coerce")

    # Drop cutoff/flag columns left over from an earlier application so the
    # merge does not produce "_x"/"_y" suffixed duplicates on re-run.
    stale = [c for c in pct_cols + flag_cols if c in out.columns]
    if stale:
        out = out.drop(columns=stale)

    # One-time left join on the group columns; many_to_one guarantees the row
    # count of `df` is preserved.
    out = out.merge(percentiles_df, how="left", on=group_cols, validate="many_to_one")

    # Build flags
    for cutoff_col, flag_col in zip(pct_cols, flag_cols):
        out[flag_col] = compare(out[value_col], out[cutoff_col]).astype(int)
    return out


In [None]:
# Percentile grid: 0% to 100% in steps of 5 points.
percentiles = [
    0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30,
    0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65,
    0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1,
]

# Column roles shared by both steps below.
amount_col = "IMPORTO"
grouping_cols = ["CAUSALE ANALITICA"]

# Step 1: per-group cutoffs, computed on the false-positive subset only.
cutoffs = compute_group_percentiles(
    df=df_CALIB_FP, value_col=amount_col, group_cols=grouping_cols, percentiles=percentiles
)

# Step 2: flag every row of the filtered perimeter against its group's cutoffs.
df_CALIB_threshold = apply_group_percentile_flags(
    df=df_CALIB_,
    percentiles_df=cutoffs,
    value_col=amount_col,
    group_cols=grouping_cols,
    flag_operator=">=",  # ">" is the other supported operator
)

print(df_CALIB_threshold.shape)

In [None]:
# Destination workbook for the flagged dataset, inside output_dir.
output_file = os.path.join(output_dir, "analisi_00090_risultati.xlsx")

# Write the frame to Excel without the index column.
df_CALIB_threshold.to_excel(excel_writer=output_file, index=False)