In [0]:
# Extract selected Jira columns from CSV
# Save into EXISTING slainte_bronze.jira_table
# ============================================================
import re
from pyspark.sql import functions as F
# -----------------------
# CONFIG
# -----------------------
# Target table (database + table name) and the CSV export to load.
BRONZE_DB = "slainte_bronze"
BRONZE_TABLE = "jira_table"
file_path = "/Volumes/workspace/slainte_bronze/vol/GFG_FINAL.csv"

# Jira CSV headers to keep, written exactly as they appear in the export.
# They are normalised later before being matched against the file's columns.
desired_cols = [
   # issue identity
   "Summary",
   "Issue key",
   "Issue id",
   "Issue Type",
   "Status",
   # project
   "Project key",
   "Project name",
   "Project type",
   "Project lead",
   # workflow / people
   "Priority",
   "Resolution",
   "Assignee",
   "Creator",
   # dates
   "Created",
   "Updated",
   "Last Viewed",
   "Resolved",
   "Due Date",
   # free text and tracking
   "Description",
   "Environment",
   "Time Spent",
   "Work Ratio",
   "Σ Time Spent",
   "Security Level",
]

print("Using DB:", BRONZE_DB)
print("Using file:", file_path)
# -----------------------
# 1. Read CSV
# -----------------------
# Fail fast if the CSV is not at the expected path; the listing itself is
# not used.
dbutils.fs.ls(file_path)
raw = (
   spark.read
   .option("header", True)
   .option("inferSchema", True)
   # Free-text fields such as Summary/Description/Environment can contain
   # line breaks inside a quoted value. Without multiLine, Spark treats every
   # physical line as a separate record and splits those issues into broken
   # rows. NOTE: multiLine files are read without splitting, so very large
   # exports load more slowly.
   .option("multiLine", True)
   # Spark's default escape character is a backslash; CSV exports normally
   # escape a quote inside a quoted field by doubling it (""), so use the
   # quote character itself as the escape.
   .option("escape", '"')
   .csv(file_path)
)
print("Original columns:", len(raw.columns))
# -----------------------
# 2. Clean column names
# -----------------------
def clean_col(c):
   """Turn a raw CSV header into a lowercase, underscore-separated name.

   The header is trimmed and lowercased, every non-word character is
   replaced by "_", runs of "_" are collapsed to one, and leading/trailing
   "_" are removed. A header with no word characters yields "".
   """
   squashed = re.sub(r"_+", "_", re.sub(r"[^\w]", "_", c.strip().lower()))
   return squashed.strip("_")
# Rename every column positionally to its cleaned form.
cleaned_names = [clean_col(name) for name in raw.columns]
raw_clean = raw.toDF(*cleaned_names)
print("Cleaned columns:", len(raw_clean.columns))
# -----------------------
# 3. Select only wanted columns
# -----------------------
def canon(s):
   """Normalise a desired header so it can be compared to cleaned column names.

   Lowercases the text, replaces each non-word character with "_", collapses
   repeated "_" into one, and strips "_" from both ends.
   """
   text = s.lower()
   # First pattern maps non-word characters to "_", second collapses runs.
   for pattern in (r"[^\w]", r"_+"):
      text = re.sub(pattern, "_", text)
   return text.strip("_")
# Normalise the requested headers the same way the file's columns were
# cleaned, then split them into those present in the CSV and those absent.
wanted = [canon(c) for c in desired_cols]
present = set(raw_clean.columns)  # built once; membership is checked per name
available = [c for c in wanted if c in present]
missing = [c for c in wanted if c not in present]
print("Selected columns:", available)
if missing:
   # Individual missing columns are tolerated on purpose.
   print("Missing (ignored):", missing)
if not available:
   # Nothing matched at all (wrong file or unexpected headers). Stop here
   # instead of carrying a zero-column DataFrame into the save step, which
   # replaces the target table.
   raise ValueError(
      f"None of the desired Jira columns were found in {file_path}; "
      "refusing to replace the target table with an empty selection."
   )
jira_df = raw_clean.select(*available)
# -----------------------
# 4. SAFE timestamp parsing (Jira formats)
# -----------------------
# Patterns tried in order by parse_ts; the first one that parses wins.
date_formats = [
   "dd/MMM/yyyy h:mm a",
   "dd/MMM/yyyy hh:mm a",
   "yyyy-MM-dd HH:mm:ss",
   # Two-digit-year variant, e.g. "05/Feb/24 3:07 PM". This appears to be
   # Jira's out-of-the-box date/time format (confirm against the export);
   # the four-letter year patterns above do not match it. Kept last so
   # values that already parsed are unaffected.
   "dd/MMM/yy h:mm a",
]
def parse_ts(colname):
   """Build an expression that parses `colname` with each known date format.

   One try_to_timestamp(...) call is generated per entry in date_formats and
   the calls are wrapped in coalesce(...), so the first format that yields a
   non-NULL result is used. The result of F.expr on that SQL text is returned.
   """
   attempts = ", ".join(
      f"try_to_timestamp(`{colname}`, '{fmt}')" for fmt in date_formats
   )
   return F.expr("coalesce(" + attempts + ")")
# Date-like columns to convert; any that were not selected are skipped.
timestamp_cols = ["created","updated","resolved","due_date","last_viewed"]
for c in timestamp_cols:
   if c not in jira_df.columns:
      continue
   # Replace the column in place with its parsed timestamp version.
   jira_df = jira_df.withColumn(c, parse_ts(c))
   print(f"Parsed timestamp: {c}")
# -----------------------
# 5. Save into slainte_bronze
# -----------------------
# Replace the table with a single Delta overwrite instead of DROP TABLE
# followed by a re-create. The overwrite is one transaction: the previous
# data stays readable until the new version commits, a failed write leaves
# the old table intact, and the table's history is kept.
# overwriteSchema lets the stored schema follow the currently selected
# columns, which is what the earlier DROP was providing.
(
   jira_df
   .write
   .format("delta")
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable(f"{BRONZE_DB}.{BRONZE_TABLE}")
)
# -----------------------
# 6. Validation
# -----------------------
# Read the table back to confirm it exists, then report its size and schema.
saved_name = f"{BRONZE_DB}.{BRONZE_TABLE}"
print("✅ Table saved:", saved_name)
saved_df = spark.table(saved_name)
print("Row count:", saved_df.count())
saved_df.printSchema()