In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# =========================================================
# CONFIG
# =========================================================
# Source (bronze) and destination (silver) database names.
BRONZE_DB = "slainte_bronze"
SILVER_DB = "slainte_silver"

# Database-qualified name of the Jira table this script reads.
JIRA_TABLE = ".".join((BRONZE_DB, "jira_table"))

# Output table: bare name, then the database-qualified name used for writes.
SILVER_TABLE = "jira_tickets_silver"
SILVER_QUAL = ".".join((SILVER_DB, SILVER_TABLE))
# =========================================================
# LOAD BRONZE
# =========================================================
jira = spark.table(JIRA_TABLE)
# =========================================================
# BASE - ONLY CLOSED TICKETS
# =========================================================
# A ticket counts as closed when its status, compared case-insensitively,
# is one of these values.
closed_statuses = ("closed", "done", "resolved")
is_closed = F.lower(F.col("status")).isin(*closed_statuses)

# (bronze column, silver column) pairs; every column is cast to string.
string_columns = (
    ("issue_key", "ticket_id"),
    ("status", "status"),
    ("project_key", "project_key"),
    ("description", "ticket_description"),
)

base = jira.where(is_closed).select(
    *[F.col(src).cast("string").alias(dst) for src, dst in string_columns]
)
# =========================================================
# KEEP ONLY LATEST VERSION PER TICKET
# =========================================================
# Number the rows inside each ticket_id partition and keep row 1, so the
# result has exactly one row per ticket.
# NOTE(review): the window is ordered by the same column it is partitioned
# on, so all rows of a ticket tie and row_number() does not guarantee which
# duplicate receives 1. The surviving row is therefore not necessarily the
# latest version. Ordering by an update/version timestamp from the bronze
# table would be required for that - TODO confirm which column is available.
window_spec = Window.partitionBy("ticket_id").orderBy("ticket_id")
numbered = base.withColumn("row_num", F.row_number().over(window_spec))
base = numbered.where(F.col("row_num") == 1).drop("row_num")
# =========================================================
# BALANCED PRIORITY DISTRIBUTION (KEY CHANGE)
# =========================================================
# Assign a synthetic priority from ONE uniform draw per row so that the
# thresholds are cumulative:
#   [0.00, 0.20) -> High   (20%)
#   [0.20, 0.60) -> Medium (40%)
#   [0.60, 1.00) -> Low    (40%)
# Calling F.rand() separately inside each when() yields independent draws,
# which skews the split to 20% / 48% (0.80 * 0.60) / 32%.
# The draw is materialized as a temporary column and dropped afterwards so
# both comparisons are guaranteed to see the same value.
base = (
    base
    .withColumn("_priority_draw", F.rand())
    .withColumn(
        "priority",
        F.when(F.col("_priority_draw") < 0.20, "High")
        .when(F.col("_priority_draw") < 0.60, "Medium")
        .otherwise("Low"),
    )
    .drop("_priority_draw")
)
# =========================================================
# SLA TARGETS
# =========================================================
# Resolution target in hours for each priority level. There is no fallback
# branch, so any priority other than these three yields NULL.
priority_col = F.col("priority")
target_hours_expr = (
    F.when(priority_col == "High", 4.0)
    .when(priority_col == "Medium", 8.0)
    .when(priority_col == "Low", 72.0)
)
base = base.withColumn("resolution_target_hours", target_hours_expr)
# =========================================================
# GENERATE LOGICAL CREATED / RESOLVED DATES
# =========================================================
# Created date: the current timestamp minus a random whole number of days.
# floor(rand() * 30) gives an integer day offset below 30.
days_back = F.floor(F.rand() * 30)
one_day = F.expr("INTERVAL 1 DAY")
base = base.withColumn(
    "created_at",
    F.current_timestamp() - one_day * days_back,
)
# Resolution logic:
# - with probability 0.7 the ticket takes 50%-90% of its target (within SLA)
# - otherwise it takes 110%-160% of its target (slight breach)
# The branch choice and the two scale factors each use their own rand() draw.
sla_target = F.col("resolution_target_hours")
within_sla_hours = sla_target * (0.5 + F.rand() * 0.4)
breached_hours = sla_target * (1.1 + F.rand() * 0.5)
resolution_hours_expr = (
    F.when(F.rand() <= 0.7, within_sla_hours).otherwise(breached_hours)
)
base = base.withColumn("resolution_hours", resolution_hours_expr)

# resolved_at is created_at shifted forward by resolution_hours hours.
one_hour = F.expr("INTERVAL 1 HOUR")
base = base.withColumn(
    "resolved_at",
    F.col("created_at") + one_hour * F.col("resolution_hours"),
)
# =========================================================
# SLA METRICS
# =========================================================
# Round the actual duration first; the breach flag and the breach size are
# both derived from the rounded value.
base = base.withColumn(
    "resolution_hours", F.round(F.col("resolution_hours"), 2)
)

actual_hours = F.col("resolution_hours")
allowed_hours = F.col("resolution_target_hours")
exceeds_target = actual_hours > allowed_hours

# breach_hours is the overrun past the target, or 0.0 when within target.
overrun_hours = F.when(
    exceeds_target, F.round(actual_hours - allowed_hours, 2)
).otherwise(0.0)

base = base.withColumn("resolution_breach", exceeds_target).withColumn(
    "breach_hours", overrun_hours
)
# =========================================================
# ASSIGNEE -> RANDOM REAL PEOPLE
# =========================================================
people = ["Ali Ben Salah", "Sarah Martin", "Youssef Trabelsi", "Emma Dubois"]
people_array = F.array(*map(F.lit, people))

# Random index into people_array: floor(rand() * len(people)) cast to int.
random_person_index = F.floor(F.rand() * len(people)).cast("int")

# Column order of the silver table.
silver_columns = [
    "ticket_id",
    "priority",
    "status",
    "project_key",
    "assignee",
    "created_at",
    "resolved_at",
    "resolution_target_hours",
    "resolution_hours",
    "resolution_breach",
    "breach_hours",
    "ticket_description",
]

with_assignee = base.withColumn(
    "assignee", people_array.getItem(random_person_index)
)
final_df = with_assignee.select(*silver_columns)
# =========================================================
# WRITE SILVER
# =========================================================
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")
# Replace the table with a single Delta overwrite instead of DROP TABLE
# followed by a write: the table never disappears for concurrent readers and
# its Delta history is kept. overwriteSchema lets the overwrite succeed when
# the column set differs from the existing table, which is what dropping
# the table first was otherwise needed for.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(SILVER_QUAL)
)
print(f"âœ… FINAL JIRA SILVER TABLE CREATED: {SILVER_QUAL}")
# Count is read back from the saved table, not from final_df, whose rand()
# columns would be re-evaluated on another action.
print("Rows:", spark.table(SILVER_QUAL).count())
# =========================================================
# QUICK VALIDATION
# =========================================================
# Show row counts per value for the two generated categorical columns,
# read back from the saved table.
silver = spark.table(SILVER_QUAL)
for check_col in ("priority", "resolution_breach"):
    silver.groupBy(check_col).count().show()