### Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import json

In [0]:
# tb_name is the key used for the config.json and dq_rules.json lookups below;
# silver_tb_name is the value recorded as "table_name" in the audit row.
tb_name, silver_tb_name = "purchase_orders", "silver_purchase_orders"

### Read Raw Tables

In [0]:
# Column names and Spark types for purchase orders, in declaration order.
# Every field is declared nullable.
_purchase_order_fields = [
    ("po_id", IntegerType()),
    ("supplier_id", IntegerType()),
    ("po_number", StringType()),
    ("po_date", DateType()),
    ("currency", StringType()),
    ("total_amount", DoubleType()),
    ("payment_terms", StringType()),
    ("approval_status", StringType()),
    ("delivery_country", StringType()),
    ("buyer_name", StringType()),
    ("freight_charges", DoubleType()),
    ("discount_percent", DoubleType()),
    ("tax_percent", DoubleType()),
    ("priority_level", StringType()),
    ("po_category", StringType()),
]

purchase_orders_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _purchase_order_fields]
)

# Schema of the one-row audit record appended at the end of the notebook.
# Every field is declared nullable.
_audit_fields = [
    ("env", StringType()),
    ("table_name", StringType()),
    ("source_path", StringType()),
    ("target_path", StringType()),
    ("quarantine_path", StringType()),
    ("load_timestamp", TimestampType()),
    ("total_records", LongType()),
    ("passed_records", LongType()),
    ("quarantine_records", LongType()),
    ("status", StringType()),
    ("message", StringType()),
]

audit_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _audit_fields]
)



In [0]:
# Expose an "env" text widget (default "dev") and normalise whatever was entered
# so it can be used as a key into the config dictionary.
dbutils.widgets.text("env", "dev")
env_widget_value = dbutils.widgets.get("env")
env = env_widget_value.strip().lower()

In [0]:
# Load the pipeline configuration; it is indexed by environment further down.
config_path = "/Workspace/Users/avadhootd.business@gmail.com/sum/SCM/config/config.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)

In [0]:
# The source table is the "pass_target" configured for this dataset in the
# selected environment.
source_settings = config[env][tb_name]
table_name = source_settings["pass_target"]
df_raw = spark.read.table(table_name)

In [0]:
# Regex gates applied before casting: a value that does not match its pattern
# becomes NULL instead of being handed to cast().
_DIGITS_ONLY = "^[0-9]+$"
_ISO_DATE = "^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
_UNSIGNED_DECIMAL = r"^\d*\.?\d+$"

# (column, pattern the raw value must match, Spark type to cast to)
_CAST_SPECS = [
    ("po_id", _DIGITS_ONLY, IntegerType()),
    ("supplier_id", _DIGITS_ONLY, IntegerType()),
    ("po_date", _ISO_DATE, DateType()),
    ("total_amount", _UNSIGNED_DECIMAL, DoubleType()),
    ("freight_charges", _UNSIGNED_DECIMAL, DoubleType()),
    ("discount_percent", _UNSIGNED_DECIMAL, DoubleType()),
]


def _cast_when_matching(column_name, pattern, target_type):
    """Return the column cast to `target_type` where it matches `pattern`, else NULL."""
    source = col(column_name)
    return when(source.rlike(pattern), source.cast(target_type)).otherwise(None)


def _parse_typed_columns(frame):
    """Apply every entry of `_CAST_SPECS` to `frame`, replacing each column in place."""
    for column_name, pattern, target_type in _CAST_SPECS:
        frame = frame.withColumn(
            column_name, _cast_when_matching(column_name, pattern, target_type)
        )
    return frame


# After typing the columns, discard the listed helper columns together with
# tax_percent and po_category.
df_parsed = _parse_typed_columns(df_raw).drop(
    "failed_raw", "failed_reasons", "all_reasons", "reasons_str", "tax_percent", "po_category"
)


### Load DQ rules

In [0]:
# Load the data-quality rule definitions and pick out the two rule sets for
# this dataset; a missing entry yields an empty dict / list.
dq_rules_path = "/Workspace/Users/avadhootd.business@gmail.com/sum/SCM/config/dq_rules.json"
with open(dq_rules_path, "r") as dq_rules_file:
    dq_config = json.load(dq_rules_file)

table_dq_config = dq_config.get(tb_name, {})
rules = table_dq_config.get("silver_rules", {})
quarantine_rules = table_dq_config.get("quarantine_rules", [])

In [0]:
%run /Workspace/Users/avadhootd.business@gmail.com/sum/SCM/utils/bronze_rule_validation

In [0]:
# validate_silver_rules comes from the notebook pulled in by the preceding %run
# cell; it returns a (clean, quarantine) pair of DataFrames.
validated_df, df_quarantine = validate_silver_rules(df_parsed, rules, quarantine_rules)

# The clean side does not keep the quarantine bookkeeping columns.
df_clean = validated_df.drop("quarantine_raw", "quarantine_reasons", "reasons_str")

In [0]:

# Clean rows with a NULL po_date are moved to quarantine, tagged with a single
# "po_date:null_check" reason in the same three bookkeeping columns.
null_check_reason = array(lit("po_date:null_check"))

po_date_null_df = (
    df_clean
    .filter(col("po_date").isNull())
    .withColumn("quarantine_raw", null_check_reason)
    .withColumn("quarantine_reasons", null_check_reason)
    .withColumn("reasons_str", concat_ws(",", col("quarantine_reasons")))
)

df_quarantine = df_quarantine.unionByName(po_date_null_df)

# Only rows that do have a po_date stay on the clean side.
df_clean = df_clean.filter(col("po_date").isNotNull())

# Drop bookkeeping columns and payment_terms, normalise approval_status, and
# derive the reporting columns (amount_category, year/month/day,
# freight_charges_bucket, discount_tier, priority_tier).
#
# NOTE(review): comparisons against NULL are never true, so a NULL
# total_amount lands in "Ultra-Premium" and a NULL freight_charges lands in
# "High" via otherwise(); discount_tier maps the same case to "Unknown".
# Confirm whether upstream rules guarantee these columns are non-null here.
df_clean = (
    df_clean
    .drop("quarantine_raw", "quarantine_reasons", "reasons_str", "payment_terms")
    .withColumn(
        "amount_category",
        when(col("total_amount") < 1000, "Low Spend")
        .when(col("total_amount") < 5000, "Moderate")
        .when(col("total_amount") < 10000, "High")
        .when(col("total_amount") < 15000, "Premium")
        .otherwise("Ultra-Premium"),
    )
    .withColumn("approval_status", initcap(lower(col("approval_status"))))
    # Fix: "year" used to be populated with month(po_date), so it always
    # duplicated the "month" column.
    .withColumn("year", year(col("po_date")))
    .withColumn("month", month(col("po_date")))
    .withColumn("day", dayofmonth(col("po_date")))
    .withColumn(
        "freight_charges_bucket",
        when(col("freight_charges") < 40, "Low")
        .when(col("freight_charges") < 80, "Medium")
        .otherwise("High"),
    )
    .withColumn(
        "discount_tier",
        when(col("discount_percent") == 0, "No Discount")
        .when((col("discount_percent") > 0) & (col("discount_percent") <= 10), "Low")
        .when((col("discount_percent") > 10) & (col("discount_percent") <= 25), "Moderate")
        .when((col("discount_percent") > 25) & (col("discount_percent") <= 50), "High")
        .when(col("discount_percent") > 50, "Very High")
        .otherwise("Unknown"),
    )
    # NOTE(review): the matched values read like category names rather than
    # priority levels — confirm priority_level is the intended source column.
    .withColumn(
        "priority_tier",
        when(col("priority_level") == "Raw Material", "High")
        .when(col("priority_level") == "Logistics", "High")
        .when(col("priority_level") == "Utilities", "Medium")
        .when(col("priority_level") == "Office Supplies", "Low")
        .otherwise("Unknown"),
    )
)
                   


                    

In [0]:
# Row fingerprint: SHA-256 over po_id, po_number and po_date joined with "||".
# It is used below to skip rows that are already present in the silver table.
watermark_source = concat_ws(
    "||",
    col("po_id").cast("string"),
    col("po_number"),
    col("po_date").cast("string"),
)
df_clean = df_clean.withColumn("watermark_column", sha2(watermark_source, 256))

In [0]:
# Fully qualified names of the three silver-layer targets for this environment.
catalog_name = config[env]["catalog"]
silver_table = f"{catalog_name}.silver.silver_purchase_orders"
silver_quarantine_table = f"{catalog_name}.silver.silver_purchase_orders_quarantine"
silver_supplier_audit_table = f"{catalog_name}.silver.silver_purchase_orders_audit"

# Start from all clean rows; when the silver table already exists, keep only
# the rows whose watermark_column is not in it yet.
df_new = df_clean
if spark.catalog.tableExists(silver_table):
    existing_df = spark.table(silver_table).select("watermark_column")
    df_new = df_clean.join(existing_df, on="watermark_column", how="left_anti")



In [0]:
# New rows are appended to the silver table (schema merge enabled); the
# quarantine table is replaced with this run's quarantined rows.
(
    df_new.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(silver_table)
)
(
    df_quarantine.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_quarantine_table)
)


In [0]:
# %run /Workspace/Users/avadhootd.business@gmail.com/SCM/utils/send_alert_email


In [0]:
# failed_table = config[env][p_file_name]["failed_target"]
# quarantine_table = config[env][p_file_name]["quarantine_target"]

In [0]:
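# app_password = app_password.strip()

# NOTE: a literal app password used to be written on the app_password line below.
# It has been replaced with the variable prepared above; rotate the exposed
# credential and load it from a Databricks secret scope instead of the notebook.
# if df_quarantine.count() > 0 or df_failed.count() > 0:
#     send_gmail_email(
#     app_password=app_password,
#     from_email="avadhootdarbhe@gmail.com",
#     to_email="avadhootd.in@mouritech.com",
#     tables=[failed_table, quarantine_table]
#     )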


### Audit Logs

In [0]:
%run /Workspace/Users/avadhootd.business@gmail.com/sum/SCM/utils/audit_status



In [0]:
# Row counts for the audit summary: parsed input, clean rows, quarantined rows.
total, passed, quarantine = df_parsed.count(), df_clean.count(), df_quarantine.count()

# The third positional argument is passed as 0 — presumably a failed-record
# count; verify against the audit_status notebook.
status, message = get_audit_status_and_message(total, passed, 0, quarantined=quarantine)

print(status, message)
print(
    f"Total Records: {total}",
    f"Passed Records: {passed}",
    f"Quarantine Records: {quarantine}",
    sep="\n",
)

In [0]:
# Build the single audit row for this run and append it to the audit table.
# The record counts reuse total / passed / quarantine from the previous cell
# instead of calling .count() again: that avoids three extra Spark jobs and
# keeps the stored numbers consistent with the status and message that were
# derived from those same values.
audit_data = [{
    "env": env,
    "table_name": silver_tb_name,
    "source_path": config[env][tb_name]["pass_target"],
    "target_path": silver_table,
    "quarantine_path": silver_quarantine_table,
    "load_timestamp": datetime.now(),
    "total_records": total,
    "passed_records": passed,
    "quarantine_records": quarantine,
    "status": status,
    "message": message
}]

audit_df = spark.createDataFrame(audit_data, audit_schema)

audit_df.write.format("delta").mode("append").saveAsTable(silver_supplier_audit_table)