### Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import json

In [0]:
# Key used to look up this dataset's entries in config.json and dq_rules.json.
tb_name = "suppliers_raw"
# Logical name of the silver table; recorded as `table_name` in the audit row.
silver_tb_name = "silver_suppliers"

### Read Raw Tables

In [0]:
# Column layout of the supplier dataset as (name, Spark type) pairs.
# Every field keeps StructField's default nullability.
_supplier_columns = [
    ("supplier_id", IntegerType()),
    ("supplier_name", StringType()),
    ("contact_name", StringType()),
    ("contact_email", StringType()),
    ("phone_number", StringType()),
    ("country", StringType()),
    ("registration_date", DateType()),
    ("credit_score", IntegerType()),
    ("supplier_type", StringType()),
    ("industry", StringType()),
    ("annual_revenue", DoubleType()),
    ("tax_id", StringType()),
    ("rating", IntegerType()),
    ("is_active", BooleanType()),
    ("supplier_gender", StringType()),
]
supplier_schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in _supplier_columns]
)
# Schema of the single audit row appended at the end of each run.
# All fields are explicitly nullable.
audit_schema = (
    StructType()
    .add("env", StringType(), True)
    .add("table_name", StringType(), True)
    .add("source_path", StringType(), True)
    .add("target_path", StringType(), True)
    .add("quarantine_path", StringType(), True)
    .add("load_timestamp", TimestampType(), True)
    .add("total_records", LongType(), True)
    .add("passed_records", LongType(), True)
    .add("quarantine_records", LongType(), True)
    .add("status", StringType(), True)
    .add("message", StringType(), True)
)



In [0]:
# Register a text widget named "env" (default "dev") and read it back
# trimmed and lower-cased; `env` is the top-level key used to index `config`.
dbutils.widgets.text("env", "dev")
env = dbutils.widgets.get("env").strip().lower()

In [0]:
# Load the pipeline configuration, which is indexed below as config[env][...].
# The encoding is passed explicitly so parsing does not depend on the
# platform's default locale.
with open(
    "/Workspace/Users/avadhootd.business@gmail.com/SCM/config/config.json",
    "r",
    encoding="utf-8",
) as f:
    config = json.load(f)

In [0]:
# Resolve the source table from the config entry config[env][tb_name]["pass_target"]
# and read it as the raw input DataFrame.
table_name = config[env][tb_name]["pass_target"]
df_raw = spark.read.table(table_name)

In [0]:
def _cast_when_matches(column_name, pattern, target_type):
    """Return `column_name` cast to `target_type` where it matches `pattern`, else NULL."""
    source = col(column_name)
    return when(source.rlike(pattern), source.cast(target_type)).otherwise(None)


# Letter grades are compared case-insensitively and mapped to a 2-5 score.
# NOTE(review): anything else (including NULL) becomes 1, so downstream code
# can never observe a missing rating - confirm this default is intended.
_rating_letter = lower(col("rating"))
_rating_score = (
    when(_rating_letter == "a", 5)
    .when(_rating_letter == "b", 4)
    .when(_rating_letter == "c", 3)
    .when(_rating_letter == "d", 2)
    .otherwise(1)
)

# Type the raw columns: values that do not match the expected textual shape
# are nulled rather than cast. The four listed helper columns are then dropped
# (drop() ignores names that are not present).
df_parsed = (
    df_raw
    .withColumn("supplier_id", _cast_when_matches("supplier_id", "^[0-9]+$", IntegerType()))
    .withColumn("credit_score", _cast_when_matches("credit_score", "^[0-9]+$", IntegerType()))
    .withColumn("rating", _rating_score)
    .withColumn(
        "registration_date",
        _cast_when_matches("registration_date", "^[0-9]{4}-[0-9]{2}-[0-9]{2}$", DateType()),
    )
    .withColumn(
        "annual_revenue",
        _cast_when_matches("annual_revenue", "^[0-9]*\\.?[0-9]+$", DoubleType()),
    )
    .drop("failed_raw", "failed_reasons", "all_reasons", "reasons_str")
)



### Load DQ rules

In [0]:
# Load the data-quality rule definitions. The encoding is passed explicitly so
# parsing does not depend on the platform's default locale.
with open(
    "/Workspace/Users/avadhootd.business@gmail.com/SCM/config/dq_rules.json",
    "r",
    encoding="utf-8",
) as f:
    dq_config = json.load(f)

# Rules for this dataset; a missing entry falls back to empty rule sets.
rules = dq_config.get(tb_name, {}).get("silver_rules", {})
quarantine_rules = dq_config.get(tb_name, {}).get("quarantine_rules", [])

In [0]:
%run /Workspace/Users/avadhootd.business@gmail.com/SCM/utils/bronze_rule_validation

In [0]:
# Apply the loaded rules to the parsed data. The call returns two DataFrames,
# bound here as the clean set and the quarantined set.
# NOTE(review): validate_silver_rules is not defined in this notebook; it is
# presumably provided by the %run of utils/bronze_rule_validation - confirm.
df_clean, df_quarantine = validate_silver_rules(df_parsed, rules, quarantine_rules)

In [0]:

# Enrich the validated rows with derived and normalised columns:
#   * domain / contact_username  - the two halves of contact_email around "@"
#   * year / month / day         - parts of registration_date
#   * *_rank columns             - bucketed labels for score, revenue and rating
#   * supplier_type, is_active, supplier_gender - normalised to canonical values
df_clean = (
    df_clean
    # Helper columns from the validation step are not part of the silver table.
    .drop("quarantine_raw", "quarantine_reasons", "reasons_str")
    .withColumn("domain", split(col("contact_email"), "@")[1])
    .withColumn("contact_username", split(col("contact_email"), "@")[0])
    .withColumn("year", year(col("registration_date")))
    .withColumn("month", month(col("registration_date")))
    .withColumn("day", dayofmonth(col("registration_date")))
    # NOTE(review): a NULL credit_score falls through to "Very Poor" here, while
    # the revenue and rating buckets use "Unknown" - confirm this is intended.
    .withColumn(
        "credit_score_rank",
        when(col("credit_score") >= 800, "Excellent")
        .when(col("credit_score") >= 700, "Good")
        .when(col("credit_score") >= 600, "Fair")
        .when(col("credit_score") >= 500, "Poor")
        .otherwise("Very Poor"),
    )
    # The column is lower-cased before comparison, so every literal must be
    # lower-case too. The previous "Internationl" literal could never match
    # and sent all international suppliers to "Unknown". Both the known
    # misspellings and the correct spellings are accepted.
    .withColumn(
        "supplier_type",
        when(lower(col("supplier_type")).isin("domestik", "domestic"), "Domestic")
        .when(
            lower(col("supplier_type")).isin("internationl", "international"),
            "International",
        )
        .otherwise("Unknown"),
    )
    .withColumn(
        "annual_revenue_rank",
        when(col("annual_revenue") >= 1_000_000_000_000, "Trillion+")
        .when(col("annual_revenue") >= 1_000_000_000, "Large Enterprise (Billion+)")
        .when(col("annual_revenue") >= 100_000_000, "Enterprise (100M–1B)")
        .when(col("annual_revenue") >= 10_000_000, "Mid Market (10M–100M)")
        .when(col("annual_revenue") >= 1_000_000, "SMB (1M–10M)")
        .when(col("annual_revenue") >= 100_000, "Small Business (100K–1M)")
        .when(col("annual_revenue") >= 10_000, "Micro Business (10K–100K)")
        .when(col("annual_revenue") > 0, "Nano Business (<10K)")
        .otherwise("Unknown"),
    )
    .withColumn(
        "rating_rank",
        when(col("rating") == 5, "Excellent")
        .when(col("rating") == 4, "Very Good")
        .when(col("rating") == 3, "Good")
        .when(col("rating") == 2, "Fair")
        .when(col("rating") == 1, "Poor")
        .otherwise("Unknown"),
    )
    # Values outside both lists are left NULL (there is no otherwise branch).
    .withColumn(
        "is_active",
        when(lower(col("is_active")).isin("yes", "y", "true"), True)
        .when(lower(col("is_active")).isin("no", "n", "false"), False),
    )
    .withColumn(
        "supplier_gender",
        when(col("supplier_gender").isNull(), "Unknown")
        .when(lower(col("supplier_gender")).isin("m", "male"), "Male")
        .when(lower(col("supplier_gender")).isin("f", "female"), "Female")
        .otherwise("Unknown"),
    )
)
                    

In [0]:
# Row fingerprint used later to skip rows already present in the silver table:
# SHA-256 over supplier_id, tax_id and registration_date joined with "||".
# NOTE(review): concat_ws skips NULL inputs, so rows that differ only in which
# of these fields is NULL can hash to the same value - confirm this is acceptable.
_fingerprint_parts = [
    col("supplier_id").cast("string"),
    col("tax_id"),
    col("registration_date").cast("string"),
]
_fingerprint = sha2(concat_ws("||", *_fingerprint_parts), 256)
df_clean = df_clean.withColumn("watermark_column", _fingerprint)



In [0]:
# Three-part names of the silver, quarantine and audit tables. They are built
# from silver_tb_name so the table name is defined in exactly one place.
catalog_name = config[env]["catalog"]
silver_table = f"{catalog_name}.silver.{silver_tb_name}"
silver_quarantine_table = f"{silver_table}_quarantine"
silver_supplier_audit_table = f"{silver_table}_audit"

# df_quarantine is deliberately NOT reassigned here. It must stay the
# quarantined set returned by validate_silver_rules. A previous
# `df_quarantine = df_clean` caused clean rows to be written to the
# quarantine table and reported as quarantined.
# NOTE(review): if the quarantine table already exists with the clean-row
# layout from earlier runs, its schema may differ from df_quarantine's;
# the table may need to be dropped or its schema overwritten once.

# Keep only rows whose fingerprint is not already in the silver table so that
# re-running the notebook does not append duplicates.
if spark.catalog.tableExists(silver_table):
    existing_df = spark.table(silver_table).select("watermark_column")
    df_new = df_clean.join(existing_df, on="watermark_column", how="left_anti")
else:
    # First load: nothing to deduplicate against.
    df_new = df_clean


In [0]:
# Append only the not-yet-loaded rows; mergeSchema lets newly derived columns
# be added to the existing silver table.
df_new.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(silver_table)

# The quarantine table is replaced wholesale on every run. overwriteSchema
# lets the replacement carry a changed column layout instead of failing with a
# schema-mismatch error, matching the schema tolerance of the silver write.
(
    df_quarantine.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(silver_quarantine_table)
)


In [0]:
# %run /Workspace/Users/avadhootd.business@gmail.com/SCM/utils/send_alert_email


In [0]:
# failed_table = config[env][p_file_name]["failed_target"]
# quarantine_table = config[env][p_file_name]["quarantine_target"]

In [0]:
# app_password = app_password.strip()

# if df_quarantine.count() > 0 or df_failed.count() > 0:
#     send_gmail_email(
#     app_password=app_password,  # credential redacted: never hard-code secrets; load from a secret scope and rotate the exposed one
#     from_email="avadhootdarbhe@gmail.com",
#     to_email="avadhootd.in@mouritech.com",
#     tables=[failed_table, quarantine_table]
#     )


### Audit Logs

In [0]:
%run /Workspace/Users/avadhootd.business@gmail.com/SCM/utils/audit_status



In [0]:
# Row counts for the audit record: parsed input rows, rows selected for the
# silver append, and quarantined rows. Each count() triggers a Spark job.
# NOTE(review): df_new is lazy and (when the silver table already existed) is an
# anti-join against that table. Counting it after the append may re-evaluate
# against the updated table and undercount - confirm, or count before writing.
total = df_parsed.count()
passed = df_new.count()
quarantine = df_quarantine.count()

# NOTE(review): get_audit_status_and_message is not defined in this notebook;
# presumably it comes from the %run of utils/audit_status - confirm.
status, message = get_audit_status_and_message(total, passed, quarantine)


In [0]:
# Build the single audit row for this run. The counts reuse total / passed /
# quarantine, the same values status and message were derived from. This keeps
# the row internally consistent (passed_records previously used
# df_clean.count(), a different number) and avoids three extra Spark jobs.
audit_data = [{
    "env": env,
    "table_name": silver_tb_name,
    "source_path": config[env][tb_name]["pass_target"],
    "target_path": silver_table,
    "quarantine_path": silver_quarantine_table,
    "load_timestamp": datetime.now(),
    "total_records": total,
    "passed_records": passed,
    "quarantine_records": quarantine,
    "status": status,
    "message": message,
}]

audit_df = spark.createDataFrame(audit_data, audit_schema)

# Audit history is append-only: one row per notebook run.
audit_df.write.format("delta").mode("append").saveAsTable(silver_supplier_audit_table)