# Análisis Histórico de compras 2022-2028

Integración con información de mercado

In [0]:
from pyspark.sql.functions import expr, regexp_replace, col

In [0]:
# Build the product catalogue: exactly one row per `clave` that appears in any
# purchase cycle or in the market data, paired with the description from the
# most recent cycle that has one (2027-2028, then 2025-2026, then 2023-2024).
#
# Each silver table is collapsed to a single description per clave BEFORE the
# joins. Joining the raw tables would multiply rows whenever a clave is
# repeated inside a cycle, and differing descriptions would then yield several
# rows per clave, contradicting the primary key declared below.
df_unique_products = spark.sql("""
    WITH all_claves AS (
        -- UNION (not UNION ALL) already removes duplicates across sources
        SELECT clave FROM workspace.default.silver_2027_2028  WHERE clave IS NOT NULL
        UNION
        SELECT clave FROM workspace.default.silver_2025_2026  WHERE clave IS NOT NULL
        UNION
        SELECT clave FROM workspace.default.silver_2023_2024  WHERE clave IS NOT NULL
        UNION
        SELECT clave FROM workspace.default.silver_market_data  WHERE clave IS NOT NULL
    ),
    desc_2728 AS (
        -- MAX() ignores NULLs and picks one description deterministically
        SELECT clave, MAX(descripcion) AS descripcion
        FROM workspace.default.silver_2027_2028
        WHERE clave IS NOT NULL
        GROUP BY clave
    ),
    desc_2526 AS (
        SELECT clave, MAX(descripcion) AS descripcion
        FROM workspace.default.silver_2025_2026
        WHERE clave IS NOT NULL
        GROUP BY clave
    ),
    desc_2324 AS (
        SELECT clave, MAX(descripcion) AS descripcion
        FROM workspace.default.silver_2023_2024
        WHERE clave IS NOT NULL
        GROUP BY clave
    )
    SELECT
        ac.clave,
        COALESCE(
            d2728.descripcion,
            d2526.descripcion,
            d2324.descripcion,
            'Descripción no localizada en compra consolidada'
        ) AS descripcion
    FROM all_claves ac
    LEFT JOIN desc_2728 d2728 ON ac.clave = d2728.clave
    LEFT JOIN desc_2526 d2526 ON ac.clave = d2526.clave
    LEFT JOIN desc_2324 d2324 ON ac.clave = d2324.clave
""")

# Persist the result as a Delta table, replacing data and schema
table = "workspace.default.gold_unique_products"

df_unique_products.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(table)

# Primary-key columns must be NOT NULL before the constraint can be added
spark.sql(f"""
    ALTER TABLE {table}
    ALTER COLUMN clave SET NOT NULL
""")

# Drop any primary key left over from a previous run so the cell can be
# re-executed without failing on an already-existing constraint
spark.sql(f"""
    ALTER TABLE {table}
    DROP PRIMARY KEY IF EXISTS
""")

# Declare the primary key
spark.sql(f"""
    ALTER TABLE {table}
    ADD CONSTRAINT golden_unique_products_pk PRIMARY KEY (clave)
""")

# Compact and Z-order by clave in one pass; a plain OPTIMIZE beforehand would
# only rewrite the same files twice
spark.sql(f"OPTIMIZE {table} ZORDER BY (clave)")

print(f"✅ Tabla Golden completada y optimizada: {table}")
print("   - Primary Key: clave")
# Count the saved table instead of the DataFrame: the DataFrame is lazy, so
# counting it would re-run the whole query against the silver tables
print(f"   - Filas totales: {spark.table(table).count()}")


In [0]:
# Build one row per catalogued product with its `totales_max` in each purchase
# cycle (0 when the product does not appear in that cycle).
#
# The catalogue is the driving table, so LEFT JOINs are used. A FULL OUTER
# JOIN would also emit silver rows that match no catalogue entry (i.e. rows
# whose clave is NULL), producing NULL `clave` values that make the
# SET NOT NULL step below fail.
# NOTE(review): assumes each silver table has at most one row per clave;
# otherwise the joins fan out and clave is no longer unique — TODO confirm.
df_max_per_year = spark.sql("""
    SELECT
        cl.clave,
        cl.descripcion,
        CAST(COALESCE(tab2023.totales_max, 0) AS DOUBLE) AS max_2023_2024,
        CAST(COALESCE(tab2025.totales_max, 0) AS DOUBLE) AS max_2025_2026,
        CAST(COALESCE(tab2027.totales_max, 0) AS DOUBLE) AS max_2027_2028
    FROM workspace.default.gold_unique_products cl
    LEFT JOIN workspace.default.silver_2023_2024 tab2023
        ON cl.clave = tab2023.clave
    LEFT JOIN workspace.default.silver_2025_2026 tab2025
        ON cl.clave = tab2025.clave
    LEFT JOIN workspace.default.silver_2027_2028 tab2027
        ON cl.clave = tab2027.clave
""")
# No ORDER BY: the Z-order pass below rewrites the files anyway, so a global
# sort before the write would be wasted work.

# Persist the result as a Delta table, replacing data and schema
table = "workspace.default.gold_maximum_historics"

df_max_per_year.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(table)

# Primary-key columns must be NOT NULL before the constraint can be added
spark.sql(f"""
    ALTER TABLE {table}
    ALTER COLUMN clave SET NOT NULL
""")

# Drop any primary key left over from a previous run so the cell can be
# re-executed without failing on an already-existing constraint
spark.sql(f"""
    ALTER TABLE {table}
    DROP PRIMARY KEY IF EXISTS
""")

# Declare the primary key
spark.sql(f"""
    ALTER TABLE {table}
    ADD CONSTRAINT golden_max_per_year_pk PRIMARY KEY (clave)
""")

# Compact and Z-order by clave in one pass; a plain OPTIMIZE beforehand would
# only rewrite the same files twice
spark.sql(f"OPTIMIZE {table} ZORDER BY (clave)")

print(f"✅ Tabla Golden completada y optimizada: {table}")
print("   - Primary Key: clave")
# Count the saved table instead of the DataFrame: the DataFrame is lazy, so
# counting it would re-run the whole join
print(f"   - Filas totales: {spark.table(table).count()}")

  

In [0]:
# Build the per-institution table: one row per (clave, purchase cycle,
# institution). Each cycle's `<institution>_max` columns are unpivoted into an
# `institucion` name column and a `max_value` column.
#
# Columns produced by every query below, in the same order:
#   clave, comprac, institucion, totales_max, max_value
# `max_value` is the institution's own figure; `totales_max` is the cycle total
# repeated on each institution row. Rows with a NULL clave are excluded because
# clave is made NOT NULL and part of the primary key further down.
df_unpivoted2023_2024 = spark.sql("""
    SELECT 
        clave,
        "CC 2023-2024" AS comprac,
        institucion,
        totales_max,
        max_value
    FROM workspace.default.silver_2023_2024
    UNPIVOT (
        max_value FOR institucion IN (
            imss_max,
            imss_bienestar_max,
            issste_max,
            ccinshae_max,
            sedena_max,
            semar_max,
            guardia_nacional_max,
            oadprs_max,
            salud_spps_max
        )
    )
    WHERE clave IS NOT NULL
""")

# The 2025-2026 table is not unpivoted: the whole total is attributed to a
# single placeholder institution.
df_unpivoted2025_2026 = spark.sql("""
    SELECT 
        clave,
        "CC 2025-2026" AS comprac,
        "No determinada" AS institucion,
        totales_max,
        totales_max AS max_value
    FROM workspace.default.silver_2025_2026
    WHERE clave IS NOT NULL
    """)

df_unpivoted2027_2028 = spark.sql("""
    SELECT 
        clave,
        "CC 2027-2028" AS comprac,
        institucion,
        totales_max,
        max_value
    FROM workspace.default.silver_2027_2028
    UNPIVOT (
        max_value FOR institucion IN (
            imss_max,
            issste_max,
            pemex_max,
            imss_bienestar_max,
            ccinshae_max,
            salud_spps_max
        )
    )
    WHERE clave IS NOT NULL
""")

# unionByName matches columns by name, so a reordered SELECT in one of the
# queries cannot silently misalign the data. The "_max" suffix left by UNPIVOT
# is stripped so `institucion` holds the bare institution name.
df_combined = df_unpivoted2023_2024 \
    .unionByName(df_unpivoted2027_2028) \
    .unionByName(df_unpivoted2025_2026) \
    .withColumn("institucion", regexp_replace(col("institucion"), "_max$", ""))

# Persist the result as a Delta table, replacing data and schema
table = "workspace.default.gold_contribucion_p_institucion"

df_combined.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(table)

# Primary-key columns must be NOT NULL before the constraint can be added
spark.sql(f"""
    ALTER TABLE {table}
    ALTER COLUMN clave SET NOT NULL
""")

spark.sql(f"""
    ALTER TABLE {table}
    ALTER COLUMN comprac SET NOT NULL
""")

spark.sql(f"""
    ALTER TABLE {table}
    ALTER COLUMN institucion SET NOT NULL
""")

# Drop any primary key left over from a previous run (including the older
# two-column definition) so the cell can be re-executed
spark.sql(f"""
    ALTER TABLE {table}
    DROP PRIMARY KEY IF EXISTS
""")

# The unpivot yields several rows per (clave, comprac), one per institution,
# so `institucion` has to be part of the key for it to be unique
spark.sql(f"""
    ALTER TABLE {table}
    ADD CONSTRAINT golden_contr_por_inst_pk PRIMARY KEY (clave, comprac, institucion)
""")

# Compact and Z-order by clave in one pass; a plain OPTIMIZE beforehand would
# only rewrite the same files twice
spark.sql(f"OPTIMIZE {table} ZORDER BY (clave)")

print(f"✅ Tabla Golden completada y optimizada: {table}")
print("   - Primary Key: clave, comprac, institucion")
# Count the saved table instead of the DataFrame: the DataFrame is lazy, so
# counting it would re-run all three source queries
print(f"   - Filas totales: {spark.table(table).count()}")

  

