In [0]:
# =========================
# transform_hybrid (SILVER)
# Source: catalog_project.bronze.bronze_hybrid_manufacturing_categorical
# Target: catalog_project.silver.silver_hybrid_manufacturing
# =========================
from pyspark.sql import functions as F


# WIDGETS

In [0]:
# Notebook parameters (text widgets) with their default values.
#
# NOTE: the previous `dbutils.widgets.removeAll()` call was dropped on purpose.
# The Databricks widgets documentation warns that a widget cannot be reliably
# created in the same cell that removes widgets. If a full reset is ever needed,
# run `dbutils.widgets.removeAll()` manually in its own cell.
dbutils.widgets.text("catalog", "catalog_project")
dbutils.widgets.text("bronze_schema", "bronze")
dbutils.widgets.text("silver_schema", "silver")

# Surrounding whitespace is stripped because these values are later interpolated
# into SQL statements and fully-qualified table names.
catalog       = dbutils.widgets.get("catalog").strip()
bronze_schema = dbutils.widgets.get("bronze_schema").strip()
silver_schema = dbutils.widgets.get("silver_schema").strip()

# Fail fast with a readable message instead of a SQL parse error further down.
for _widget_name, _widget_value in (
    ("catalog", catalog),
    ("bronze_schema", bronze_schema),
    ("silver_schema", silver_schema),
):
    if not _widget_value:
        raise ValueError(f"Widget '{_widget_name}' must not be empty")

In [0]:
# Point the session at the configured catalog, then make sure the silver schema
# exists inside it (the CREATE statement relies on the catalog selected first).
use_catalog_stmt = f"USE CATALOG {catalog}"
create_schema_stmt = f"CREATE SCHEMA IF NOT EXISTS {silver_schema}"

spark.sql(use_catalog_stmt)
spark.sql(create_schema_stmt)

In [0]:
# Fully-qualified (catalog.schema.table) names of the bronze input and silver output.
source_table = ".".join([catalog, bronze_schema, "bronze_hybrid_manufacturing_categorical"])
target_table = ".".join([catalog, silver_schema, "silver_hybrid_manufacturing"])

# LOAD BRONZE

In [0]:
# Read the bronze table as a DataFrame; every later cell derives from `df`.
df = spark.read.table(source_table)


# LIMPIEZA Y MÉTRICAS KPI

In [0]:
# String columns whose leading/trailing whitespace is trimmed.
_text_columns = (
    "Job_ID",
    "Machine_ID",
    "Operation_Type",
    "Material_Used",
    "Job_Status",
    "Optimization_Category",
)

# (output column, later timestamp, earlier timestamp).
# Each output is (later - earlier) in minutes: unix_timestamp yields seconds,
# which are divided by 60.0 below.
_minute_deltas = (
    ("scheduled_duration_min", "Scheduled_End", "Scheduled_Start"),  # planned duration
    ("actual_duration_min",    "Actual_End",    "Actual_Start"),     # real duration
    ("delay_start_min",        "Actual_Start",  "Scheduled_Start"),  # start delay
    ("delay_end_min",          "Actual_End",    "Scheduled_End"),    # end delay
)

_staged = df

# ---- basic string cleanup
for _col_name in _text_columns:
    _staged = _staged.withColumn(_col_name, F.trim(F.col(_col_name)))

# ---- durations and delays (minutes)
for _metric, _later, _earlier in _minute_deltas:
    _staged = _staged.withColumn(
        _metric,
        (F.unix_timestamp(_later) - F.unix_timestamp(_earlier)) / 60.0,
    )

df_silver = _staged


# Reglas básicas

Las duraciones negativas se reemplazan por null.

In [0]:
# Keep a duration only when it is >= 0; anything else becomes null.
# `F.when(...)` without `.otherwise(...)` already yields null for unmatched rows.
_non_negative = df_silver
for _duration_col in ("scheduled_duration_min", "actual_duration_min"):
    _non_negative = _non_negative.withColumn(
        _duration_col,
        F.when(F.col(_duration_col) >= 0, F.col(_duration_col)),
    )

df_silver = _non_negative

# Columnas adicionales para KPI

In [0]:
_zero = F.lit(0.0)

# Positive part of each delay: a null delay is treated as 0 and negative values
# (ahead of schedule) are clamped to 0, so only lateness accumulates.
# NOTE(review): rows with missing timestamps therefore end up with
# total_delay_min = 0 and is_delayed = False — confirm this is intended.
_late_start = F.greatest(_zero, F.coalesce(F.col("delay_start_min"), _zero))
_late_end = F.greatest(_zero, F.coalesce(F.col("delay_end_min"), _zero))

# Ratios are only defined for a strictly positive actual duration; otherwise the
# `when` has no match and the result is null.
_has_actual_duration = F.col("actual_duration_min") > 0
_actual_minutes = F.col("actual_duration_min")

df_silver = (
    df_silver
    # total delay: sum of the positive start and end delays
    .withColumn("total_delay_min", _late_start + _late_end)
    .withColumn("is_delayed", F.col("total_delay_min") > 0)
    # efficiency: scheduled / actual
    .withColumn(
        "efficiency_ratio",
        F.when(_has_actual_duration, F.col("scheduled_duration_min") / _actual_minutes),
    )
    # energy consumed per actual minute
    .withColumn(
        "energy_per_min",
        F.when(_has_actual_duration, F.col("Energy_Consumption") / _actual_minutes),
    )
    # calendar dates of the scheduled and actual starts (for per-day KPIs)
    .withColumn("scheduled_date", F.to_date("Scheduled_Start"))
    .withColumn("actual_date", F.to_date("Actual_Start"))
)


Selección final de columnas

In [0]:
# Final silver layout, grouped by role. The concatenation order below is the
# column order of the output table.
_job_columns = [
    "Job_ID",
    "Machine_ID",
    "Operation_Type",
    "Material_Used",
    "Processing_Time",
    "Energy_Consumption",
    "Machine_Availability",
    "Scheduled_Start",
    "Scheduled_End",
    "Actual_Start",
    "Actual_End",
    "scheduled_date",
    "actual_date",
    "Job_Status",
    "Optimization_Category",
]

# KPI-ready metrics
_kpi_columns = [
    "scheduled_duration_min",
    "actual_duration_min",
    "delay_start_min",
    "delay_end_min",
    "total_delay_min",
    "is_delayed",
    "efficiency_ratio",
    "energy_per_min",
]

# Audit columns carried over from the bronze table.
_audit_columns = ["_ingestion_ts", "_source_file"]

df_silver = df_silver.select(*_job_columns, *_kpi_columns, *_audit_columns)



# SAVE SILVER

In [0]:
# Full refresh of the silver Delta table: existing data is replaced and the
# table schema is allowed to change to match df_silver.
_silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
_silver_writer.saveAsTable(target_table)

# VALIDACIÓN

In [0]:
# Confirm which table was written, then show a 10-row sample read back from it.
print(f"OK: {target_table}")
_silver_preview = spark.table(target_table).limit(10)
display(_silver_preview)