# Análisis de la demanda agregada para la compra 2027-2028

## Bronze table

In [0]:
# Módulos
%pip install openpyxl
import pandas as pd
import openpyxl
from functools import reduce
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType
import re
import operator
from pyspark.sql.functions import col, expr




In [0]:
# -----------------------------
# 1) Load the messy Excel sheet as a raw grid of strings
# -----------------------------
# NOTE(review): the file name and sheet name below look mis-encoded (UTF-8 text
# read as Latin-1) - confirm they match the real workbook and sheet names.
xlsx_path = "/Volumes/workspace/default/eseotres/AnÃ¡lisis Plataforma Salud.xlsx"

# header=None keeps every sheet row as data; dtype=str together with
# na_filter=False stops pandas from converting values or turning blanks into NaN.
pdf = pd.read_excel(
    xlsx_path,
    header=None,
    sheet_name="Info licitaciÃ³n",
    na_filter=False,
    dtype=str,
)

# Record each row's original position before handing the grid to Spark,
# so the Excel order can always be restored with an ORDER BY.
pdf["_row_id"] = range(len(pdf))

# -----------------------------
# 2) Build the Spark DataFrame: positional `_c<i>` columns plus `_row_id`
# -----------------------------
n_data_cols = pdf.shape[1] - 1  # every pandas column except `_row_id`
schema = StructType(
    [StructField(f"_c{i}", StringType(), True) for i in range(n_data_cols)]
    + [StructField("_row_id", StringType(), True)]  # loaded as text, cast to long below
)
data = pdf.astype(str).values.tolist()

df_bronze = spark.createDataFrame(data, schema=schema)
df_bronze = df_bronze.withColumn("_row_id", F.col("_row_id").cast("long"))

# -----------------------------
# 3) Remove rows in which every data cell is empty
# -----------------------------
data_cols = [c for c in df_bronze.columns if c != "_row_id"]


def is_empty_col(c):
    """Return a Spark condition that is true when column `c` holds an empty marker.

    Null, "", "nan", "none" and "null" (trimmed, case-insensitive) count as empty.
    """
    cleaned = F.lower(F.trim(F.coalesce(F.col(c), F.lit(""))))
    return cleaned.isin("", "nan", "none", "null")


# A row is dropped only when ALL of its data columns are empty.
all_empty_expr = reduce(operator.and_, [is_empty_col(c) for c in data_cols])
df_bronze = df_bronze.filter(~all_empty_expr)

# Quick visual check in the original Excel order.
df_bronze.orderBy("_row_id").limit(30).show(truncate=False)
print("Spark bronze cols:", len(df_bronze.columns), "| rows:", df_bronze.count())

# -----------------------------
# 4) Persist the Bronze Delta table
# -----------------------------
# The previous table is dropped first, then the fresh snapshot is written.
spark.sql("DROP TABLE IF EXISTS workspace.default.bronze_licitacion_info")

bronze_writer = df_bronze.write.mode("overwrite").format("delta")
bronze_writer.saveAsTable("workspace.default.bronze_licitacion_info")

print("âœ… Created table: workspace.default.bronze_licitacion_info")


In [0]:
%sql
-- Preview the first 10 Bronze rows in their original Excel order.
SELECT *
FROM workspace.default.bronze_licitacion_info
ORDER BY _row_id
LIMIT 10;

## Silver table


In [0]:
# -----------------------------
# Helpers
# -----------------------------
def norm(s: str) -> str:
    """Normalise a header cell into a lowercase ASCII snake_case identifier.

    The value is stringified, trimmed and lower-cased; accented letters are
    folded to their base letter (e.g. "Descripción" -> "descripcion"); every
    run of characters outside ``[a-z0-9]`` becomes a single underscore; and
    leading/trailing underscores are removed.

    Args:
        s: Header text. ``None`` is accepted, and non-string values are
            converted with ``str()``.

    Returns:
        The normalised identifier, or ``""`` when ``s`` is ``None`` or contains
        no ASCII letters or digits.
    """
    import unicodedata

    if s is None:
        return ""

    text = str(s).strip().lower()
    # Decompose accented characters (NFKD) and drop the combining marks. This
    # handles any accent in either case, instead of relying on a fixed list of
    # literal replacements that breaks if the source file is mis-encoded.
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # "+" already collapses consecutive separators (underscores included)
    # into one underscore, so no second pass is needed.
    return re.sub(r"[^a-z0-9]+", "_", text).strip("_")

def to_double(colname):
    """Build a Spark expression that parses the string column `colname` as a double.

    Null or whitespace-only values yield null; otherwise every comma is removed
    from the text before it is cast to double.
    """
    raw = F.col(colname)
    is_blank = F.trim(F.coalesce(raw, F.lit(""))) == ""
    without_commas = F.regexp_replace(raw, ",", "")
    return F.when(is_blank, None).otherwise(without_commas.cast("double"))

# -----------------------------
# 1) Load Bronze
# -----------------------------
# Raw string grid written by the Bronze cell; `_row_id` carries the Excel row order.
df_bronze = spark.table("workspace.default.bronze_licitacion_info")

# -----------------------------
# 2) Detect header rows
# -----------------------------
data_cols = [c for c in df_bronze.columns if c != "_row_id"]

# True for rows where at least one cell contains "clave" (case-insensitive).
contains_clave = reduce(
    operator.or_,
    [F.lower(F.col(c)).contains("clave") for c in data_cols]
)

# The first such row is taken as the column-header row; the row directly
# above it is taken as the institution/group row.
header_row_id = df_bronze.where(contains_clave).select(F.min("_row_id")).first()[0]
if header_row_id is None:
    # F.min over zero rows returns null; fail with a readable message instead
    # of a TypeError on `None - 1`.
    raise ValueError(
        "Header row not found: no cell in workspace.default.bronze_licitacion_info "
        "contains 'clave'."
    )
group_row_id = header_row_id - 1

# -----------------------------
# 3) Extract header rows to pandas
# -----------------------------
pdf_head = (
    df_bronze
    .where(F.col("_row_id").isin([group_row_id, header_row_id]))
    .orderBy("_row_id")
    .drop("_row_id")
    .toPandas()
)
if len(pdf_head) != 2:
    # Bronze drops all-empty rows, so the row above the header may not exist.
    raise ValueError(
        f"Expected group row {group_row_id} and header row {header_row_id} in Bronze, "
        f"but found {len(pdf_head)} row(s)."
    )

# Group labels sit only on the first column of each group, so forward-fill them
# across the columns that follow.
r_group = pdf_head.iloc[0].replace(["", "nan", "None", None], pd.NA).ffill().fillna("")
r_head  = pdf_head.iloc[1].fillna("")

# -----------------------------
# 4) Build FINAL column names (duplicates allowed)
# -----------------------------
final_cols = []
for g, h in zip(r_group.tolist(), r_head.tolist()):
    g0 = norm(g)
    h0 = norm(h)

    if h0 in ("clave", "descripcion"):
        final_cols.append(h0)
    elif h0 in ("min", "max") and g0:
        # e.g. group "IMSS" + header "Min" -> "imss_min"
        final_cols.append(f"{g0}_{h0}")
    else:
        # Anything unrecognised gets a placeholder name; it is dropped by the
        # final select below.
        final_cols.append("col")

# -----------------------------
# 5) Create structured Silver base
# -----------------------------
df_data = (
    df_bronze
    .where(F.col("_row_id") > header_row_id)
    .drop("_row_id")
    .toDF(*final_cols)
)

# Institution families whose `<family>_*_min` / `<family>_*_max` columns are
# rolled up into a single `<family>_min` / `<family>_max` pair.
AGG_FAMILIES = ["imss_bienestar", "ccinshae", "salud_spps"]


def _family_total(colnames):
    """Row-wise total of string columns, parsed with `to_double`.

    Blank cells count as 0 so that one empty member does not null the whole
    total; the result is null only when every member is blank.
    """
    parsed = [to_double(c) for c in colnames]
    total = reduce(operator.add, [F.coalesce(p, F.lit(0.0)) for p in parsed])
    every_blank = reduce(operator.and_, [p.isNull() for p in parsed])
    return F.when(every_blank, F.lit(None).cast("double")).otherwise(total)


df_aggregated = df_data
for family in AGG_FAMILIES:
    prefix = f"{family}_"
    for bound in ("min", "max"):
        members = [
            c for c in df_data.columns
            if c.startswith(prefix) and c.endswith(f"_{bound}")
        ]
        # Families with no matching column are skipped here; the column check
        # before the final select reports them.
        if members:
            df_aggregated = df_aggregated.withColumn(
                f"{family}_{bound}", _family_total(members)
            )

# -----------------------------
# 6) Select and clean the Silver columns
# -----------------------------
base_cols = ['clave', 'descripcion', 'imss_min', 'imss_max', 'issste_min', 'issste_max',
             'pemex_min', 'pemex_max']

# Family aggregates
for family in AGG_FAMILIES:
    base_cols.extend([f"{family}_min", f"{family}_max"])

# Grand totals
base_cols.extend(['totales_min', 'totales_max'])

# Clean `descripcion`: strip single/double quotes, collapse every whitespace run
# to one space, then trim. Collapsing happens BEFORE trimming so that leading or
# trailing tabs/newlines do not survive as a stray space.
descripcion_clean = F.trim(
    F.regexp_replace(
        F.regexp_replace(F.col('descripcion'), "[\"']", ''),
        '\\s+',
        ' ',
    )
)
df_clean = df_aggregated.withColumn('descripcion', descripcion_clean)

# Fail early with the list of missing columns rather than an opaque AnalysisException.
missing_cols = [c for c in base_cols if c not in df_clean.columns]
if missing_cols:
    raise ValueError(f"Expected Silver columns not found after header parsing: {missing_cols}")

# Keep ONLY the columns we need
df_silver_clean = df_clean.select(*base_cols)

df_silver_clean.display()

# -----------------------------
# 7) Persist Silver
# -----------------------------
spark.sql("DROP TABLE IF EXISTS workspace.default.silver_licitacion_info")

(df_silver_clean.write
 .mode("overwrite")
 .format("delta")
 .saveAsTable("workspace.default.silver_licitacion_info"))

print("âœ… Silver table created cleanly")

In [0]:
%sql
-- Aggregation check: per clave, compare the reported totals with the sum of the
-- institution columns. Only rows where the two disagree are returned.
WITH sumas AS (
  SELECT
    clave,
    totales_min,
    totales_max,

    -- NULL components count as 0
    COALESCE(imss_min,0) + COALESCE(issste_min,0) + COALESCE(pemex_min,0)
    + COALESCE(imss_bienestar_min,0) + COALESCE(ccinshae_min,0) + COALESCE(salud_spps_min,0)
      AS sum_min,

    COALESCE(imss_max,0) + COALESCE(issste_max,0) + COALESCE(pemex_max,0)
    + COALESCE(imss_bienestar_max,0) + COALESCE(ccinshae_max,0) + COALESCE(salud_spps_max,0)
      AS sum_max
  FROM workspace.default.silver_licitacion_info
),

diferencias AS (
  -- delta = computed sum minus reported total (a NULL total counts as 0)
  SELECT
    clave,
    totales_min,
    sum_min,
    (sum_min - COALESCE(totales_min,0)) AS delta_min,

    totales_max,
    sum_max,
    (sum_max - COALESCE(totales_max,0)) AS delta_max
  FROM sumas
)

-- Largest discrepancies first
SELECT *
FROM diferencias
WHERE delta_min <> 0 OR delta_max <> 0
ORDER BY ABS(delta_min) DESC, ABS(delta_max) DESC;


In [0]:
# Export the Silver table as CSV (with a header row) into its own folder.
out_dir = "dbfs:/Volumes/workspace/default/eseotres/silver_licitacion_info_export_csv"

# coalesce(1) yields a single CSV part file (fine while the table is small).
silver_export = spark.table("workspace.default.silver_licitacion_info").coalesce(1)
silver_writer = silver_export.write.mode("overwrite").option("header", "true")
silver_writer.csv(out_dir)

print("âœ… Exported to:", out_dir)

## Golden Tables

In [0]:
# Read the Silver table
df_silver = spark.table("workspace.default.silver_licitacion_info")

# Step 1: pick the per-institution "*_max" columns (the grand total is excluded)
MAX_SUFFIX = '_max'
max_columns = [
    c for c in df_silver.columns
    if c.endswith(MAX_SUFFIX) and not c.startswith('totales')
]
if not max_columns:
    # stack(0, ...) is not a valid expression; stop with a clear message instead.
    raise ValueError(
        "No institution '*_max' columns found in workspace.default.silver_licitacion_info"
    )

print(f"ðŸ“Š Institution columns found: {max_columns}")

# Step 2: cast every quantity to BIGINT and replace nulls with 0
df_selected = df_silver.select(
    'clave',
    'descripcion',
    *[F.coalesce(col(c).cast('bigint'), F.lit(0)).alias(c) for c in max_columns]
)

# Step 3: build the stack expression that unpivots wide -> long:
#   stack(N, 'name1', `col1`, 'name2', `col2`, ...) as (institucion, cantidad)
# The institution name is the column name without its trailing "_max" only;
# str.replace would also remove any "_max" occurring earlier in the name.
# Column names are wrapped in backticks so they are read as identifiers.
stack_pairs = ", ".join(
    f"'{c[:-len(MAX_SUFFIX)]}', `{c}`" for c in max_columns
)
stack_expr = f"stack({len(max_columns)}, {stack_pairs}) as (institucion, cantidad)"

print(f"\nðŸ”§ Stack expression: {stack_expr[:200]}...")  # Debug: see first 200 chars

# Step 4: apply the transformation (one output row per clave x institution)
df_golden_cross = df_selected.select(
    'clave',
    'descripcion',
    expr(stack_expr)
)

# Step 5: keep only rows with a positive quantity
df_golden_cross = df_golden_cross.filter(
    (col('cantidad').isNotNull()) &
    (col('cantidad') > 0)
)

# Step 6: verify the result
print("\nâœ… Golden Cross Table Schema:")
df_golden_cross.printSchema()

print("\nðŸ“‹ Sample data for one product:")
sample_row = df_golden_cross.select('clave').first()
if sample_row is None:
    # first() returns None on an empty DataFrame; indexing it would raise.
    print("(no rows with a positive quantity)")
else:
    df_golden_cross.filter(col('clave') == sample_row[0]).show(truncate=False)

print(f"\nðŸ“Š Total rows: {df_golden_cross.count()}")

# Step 7: save as golden table
spark.sql("DROP TABLE IF EXISTS workspace.default.gold_institution_distribution")

df_golden_cross.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("workspace.default.gold_institution_distribution")

print("âœ… Golden table created: gold_institution_distribution")

In [0]:

# Export the gold (per-institution) table as CSV.
# The export gets its own sub-folder: "overwrite" mode replaces whatever is at
# the target path, and the `eseotres` folder itself also holds the source Excel
# workbook and the Silver CSV export, so writing to the folder root would
# replace them.
out_dir = "dbfs:/Volumes/workspace/default/eseotres/gold_institution_distribution_export_csv"

(spark.table("workspace.default.gold_institution_distribution")
 .coalesce(1)  # single CSV file (ok if not huge)
 .write.mode("overwrite")
 .option("header", "true")
 .csv(out_dir))

print("âœ… Exported to:", out_dir)