In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler
)
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F


In [0]:
# Databricks notebook source
import random
import math
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField,
    LongType, IntegerType, StringType, DoubleType
)

# =========================
# 0) Parameters
# =========================
SEED = 42  # seed for Python's `random` module, so the generated rows are reproducible
N = 50_000   # rows are built in a plain Python loop: keep <= 100k (try 20k for a quick first run)

# Seed the global RNG once, before any value is drawn.
random.seed(SEED)

# =========================
# 1) Explicit schema declaration (as requested)
# =========================
# Assembled column by column with StructType.add; every column is declared
# non-nullable (third argument False). Column order here is the order the
# row tuples must follow.
schema = (
    StructType()
    # identifier
    .add("app_id", LongType(), False)
    # demographics
    .add("age", IntegerType(), False)
    .add("province", StringType(), False)
    .add("employment_type", StringType(), False)
    # continuous financial features
    .add("income_annual", DoubleType(), False)
    .add("credit_score", DoubleType(), False)
    .add("dti", DoubleType(), False)
    .add("utilization", DoubleType(), False)
    # integer count features
    .add("delinq_12m", IntegerType(), False)
    .add("num_cards", IntegerType(), False)
    .add("requested_limit", IntegerType(), False)
    # policy outputs and label
    .add("risk_score", DoubleType(), False)
    .add("p_approve", DoubleType(), False)
    .add("approve", IntegerType(), False)
)

# =========================
# 2) helper functions
# =========================
def clamp(x, lo, hi):
    """Limit ``x`` to the closed interval ``[lo, hi]``.

    Args:
        x: Value to limit.
        lo: Lower bound.
        hi: Upper bound.

    Returns:
        ``hi`` when ``x`` exceeds it, ``lo`` when the (capped) value does not
        exceed ``lo``, otherwise ``x`` itself.
    """
    # Apply the upper bound first, then the lower bound, so that the lower
    # bound wins if the bounds are ever passed in crossed order.
    capped = hi if hi < x else x
    return capped if capped > lo else lo

# =========================
# 3) Generate the data in a Python loop (deterministic)
# =========================
# Every value below comes from Python's global `random` module, so the rows
# depend on the exact order of the random draws in this loop: reordering any
# draw changes the generated dataset.
provinces = ["ON", "BC", "AB", "QC", "MB", "NS"]
employment_types = ["Salaried", "SelfEmployed", "Student", "Unemployed"]

# One tuple per application, laid out in schema column order.
rows = []

for app_id in range(N):

    # demographics (uniform draws)
    age = random.randint(18, 72)
    province = random.choice(provinces)
    employment_type = random.choice(employment_types)

    # income: exp of a uniform draw in [10.2, 12.2), i.e. log-uniform rather
    # than truly lognormal; scaled down for students / unemployed, then clamped
    income_raw = math.exp(random.random() * 2.0 + 10.2)
    if employment_type == "Student":
        income_annual = income_raw * 0.35
    elif employment_type == "Unemployed":
        income_annual = income_raw * 0.15
    else:
        income_annual = income_raw
    income_annual = clamp(income_annual, 15_000.0, 350_000.0)

    # credit score: uniform base in [300, 850), shifted by employment type and
    # by age relative to 35, then clamped back into [300, 850]
    credit_score = 300.0 + random.random() * 550.0
    if employment_type == "Salaried":
        credit_score += 35.0
    elif employment_type == "SelfEmployed":
        credit_score += 10.0
    elif employment_type == "Student":
        credit_score -= 30.0
    else:
        credit_score -= 60.0
    credit_score += (age - 35) * 0.8
    credit_score = clamp(credit_score, 300.0, 850.0)

    # core risk features
    # debt-to-income: inflated by 15% for low incomes, clamped to [0, 1.2]
    dti = random.random() * 1.2
    if income_annual < 45_000.0:
        dti *= 1.15
    dti = clamp(dti, 0.0, 1.2)

    # utilization: inflated by 15% for low credit scores, clamped to [0, 1.5]
    utilization = random.random() * 1.5
    if credit_score < 600.0:
        utilization *= 1.15
    utilization = clamp(utilization, 0.0, 1.5)

    # delinquencies: 0-2 from the base draw, plus 0-2 more when the score is
    # below 580 (so the cap of 6 is never actually reached)
    delinq_12m = int(random.random() * 2.5)
    if credit_score < 580.0:
        delinq_12m += int(random.random() * 3.0)
    delinq_12m = min(delinq_12m, 6)

    # number of cards: 0-5 from the base draw, plus 0-2 more when age > 45
    # (so the cap of 8 is never actually reached)
    num_cards = int(random.random() * 6.0)
    if age > 45:
        num_cards += int(random.random() * 3.0)
    num_cards = min(num_cards, 8)

    requested_limit = random.randint(1_000, 30_000)

    # approval policy -> risk_score (additive points; higher means riskier)
    risk_score = 0.0

    # credit score bucket
    if credit_score < 560.0:
        risk_score += 2.2
    elif credit_score < 620.0:
        risk_score += 1.4
    elif credit_score < 680.0:
        risk_score += 0.8
    else:
        risk_score += 0.2

    # DTI bucket
    if dti > 0.55:
        risk_score += 1.6
    elif dti > 0.40:
        risk_score += 0.9
    else:
        risk_score += 0.2

    # utilization bucket
    if utilization > 0.95:
        risk_score += 1.2
    elif utilization > 0.70:
        risk_score += 0.7
    else:
        risk_score += 0.2

    # delinquency: 0.55 points per delinquency
    risk_score += delinq_12m * 0.55

    # employment adjustments (Salaried adds nothing)
    if employment_type == "Unemployed":
        risk_score += 1.8
    elif employment_type == "Student":
        risk_score += 0.9
    elif employment_type == "SelfEmployed":
        risk_score += 0.35

    # income adjustments
    if income_annual < 35_000.0:
        risk_score += 0.9
    elif income_annual < 50_000.0:
        risk_score += 0.4

    # requested limit adjustments
    if requested_limit > 20_000:
        risk_score += 0.35

    # risk_score -> p_approve: logistic curve that decreases as risk_score
    # grows and equals 0.5 when risk_score == b; a sets the steepness
    a, b = 1.15, 2.4
    p_approve = 1.0 / (1.0 + math.exp(a * (risk_score - b)))

    # noise override: roughly 2% of rows ignore the policy and become a coin flip
    if random.random() < 0.02:
        p_approve = 0.5

    # label: Bernoulli draw with success probability p_approve
    approve = 1 if random.random() < p_approve else 0

    # IMPORTANT: write the tuple in schema order (avoids dict-ordering problems)
    rows.append((
        int(app_id),

        int(age),
        str(province),
        str(employment_type),

        float(round(income_annual, 2)),
        float(round(credit_score, 1)),
        float(round(dti, 3)),
        float(round(utilization, 3)),

        int(delinq_12m),
        int(num_cards),
        int(requested_limit),

        float(round(risk_score, 3)),
        float(round(p_approve, 6)),
        int(approve),
    ))

# =========================
# 4) Create the Spark DataFrame (schema enforced)
# =========================
# Passing the explicit schema means Spark does not infer types from the tuples.
df = spark.createDataFrame(rows, schema=schema)

df.printSchema()
df.show(5, truncate=False)

# =========================
# 5) sanity checks
# =========================
# Label balance: row count per value of `approve`.
df.groupBy("approve").count().show()

# Global means of the main features and of the policy probability.
df.select(
    F.avg("income_annual").alias("avg_income"),
    F.avg("credit_score").alias("avg_score"),
    F.avg("dti").alias("avg_dti"),
    F.avg("utilization").alias("avg_util"),
    F.avg("delinq_12m").alias("avg_delinq"),
    F.avg("p_approve").alias("avg_p_approve"),
).show()

# Extra: approval rate per credit-score band.
# The band labels are strings, so sorting on the label itself would give
# "560-619", "620-679", "<560", ">=680". Sort on the lowest score seen in each
# band instead, so the bands are listed from lowest to highest score.
(df
 .withColumn("score_bucket",
             F.when(F.col("credit_score") < 560, "<560")
              .when(F.col("credit_score") < 620, "560-619")
              .when(F.col("credit_score") < 680, "620-679")
              .otherwise(">=680"))
 .groupBy("score_bucket")
 .agg(F.avg("approve").alias("approval_rate"),
      F.count("*").alias("n"),
      F.min("credit_score").alias("_bucket_floor"))
 .orderBy("_bucket_floor")
 .drop("_bucket_floor")   # helper column is only needed for ordering
 .show()
)


root
 |-- app_id: long (nullable = false)
 |-- age: integer (nullable = false)
 |-- province: string (nullable = false)
 |-- employment_type: string (nullable = false)
 |-- income_annual: double (nullable = false)
 |-- credit_score: double (nullable = false)
 |-- dti: double (nullable = false)
 |-- utilization: double (nullable = false)
 |-- delinq_12m: integer (nullable = false)
 |-- num_cards: integer (nullable = false)
 |-- requested_limit: integer (nullable = false)
 |-- risk_score: double (nullable = false)
 |-- p_approve: double (nullable = false)
 |-- approve: integer (nullable = false)

+------+---+--------+---------------+-------------+------------+-----+-----------+----------+---------+---------------+----------+---------+-------+
|app_id|age|province|employment_type|income_annual|credit_score|dti  |utilization|delinq_12m|num_cards|requested_limit|risk_score|p_approve|approve|
+------+---+--------+---------------+-------------+------------+-----+-----------+----------+-------

In [0]:
# Render the generated applications as a table.
# NOTE(review): `.display()` on a DataFrame looks like the Databricks notebook
# helper rather than a core PySpark method — confirm before running elsewhere.
df.display()

app_id,age,province,employment_type,income_annual,credit_score,dti,utilization,delinq_12m,num_cards,requested_limit,risk_score,p_approve,approve
0,58,ON,Salaried,118551.29,488.1,0.167,0.177,2,3,4070,3.7,0.183172,0
1,19,MB,SelfEmployed,112649.81,682.9,0.503,0.674,0,5,25864,2.0,0.613014,0
2,39,AB,SelfEmployed,41383.09,733.1,0.141,0.57,0,2,9667,1.35,0.769856,1
3,52,ON,Unemployed,15000.0,414.8,0.868,1.5,0,1,26331,8.05,0.001505,0
4,32,ON,Unemployed,15000.0,587.2,0.503,0.639,0,1,23398,5.55,0.026021,0
5,28,MB,SelfEmployed,37302.1,513.1,1.2,1.104,3,5,26427,7.75,0.002124,0
6,69,AB,Unemployed,15000.0,383.2,1.2,1.5,1,4,16035,8.25,0.001196,0
7,65,MB,Student,41948.82,529.6,0.805,0.624,2,4,29216,6.75,0.006676,0
8,68,NS,Unemployed,15000.0,478.0,0.822,0.807,1,7,4753,7.75,0.002124,0
9,35,NS,Student,15000.0,509.1,0.626,1.5,2,3,6854,7.9,0.001788,0


In [0]:
# Target column: 1 = approved, 0 = not approved.
label_col = "approve"

# Categorical string features (indexed and one-hot encoded before modelling).
cat_cols = ["province", "employment_type"]

# Numeric features, passed to the model as-is.
num_cols = [
    "age", "income_annual", "credit_score", "dti",
    "utilization", "delinq_12m", "num_cards", "requested_limit",
]


In [0]:
# 80/20 random split with a fixed seed so the partitions are repeatable.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each partition (each count triggers a Spark job).
for split_tag, split_sdf in (("train:", train_df), ("test :", test_df)):
    print(split_tag, split_sdf.count())


train: 40145
test : 9855


In [0]:
# Intermediate column names derived from each categorical column.
idx_cols = [f"{c}_idx" for c in cat_cols]
ohe_cols = [f"{c}_ohe" for c in cat_cols]

# One StringIndexer per categorical column; handleInvalid="keep" is set so
# values not seen during fitting do not fail the transform.
indexers = [
    StringIndexer(inputCol=raw_col, outputCol=idx_col, handleInvalid="keep")
    for raw_col, idx_col in zip(cat_cols, idx_cols)
]

# A single encoder turns all index columns into one-hot vectors.
encoder = OneHotEncoder(inputCols=idx_cols, outputCols=ohe_cols)


In [0]:
# Model input = numeric columns followed by the one-hot encoded categoricals.
feature_inputs = [*num_cols, *(f"{c}_ohe" for c in cat_cols)]

assembler = VectorAssembler(
    inputCols=feature_inputs,
    outputCol="features",
    handleInvalid="keep",
)


In [0]:
# Decision-tree settings, gathered in one place:
#   maxDepth=5              -> keeps the tree interpretable (very important)
#   minInstancesPerNode=200 -> guards against overfitting, so splits read like policy rules
tree_settings = {
    "labelCol": label_col,
    "featuresCol": "features",
    "maxDepth": 5,
    "minInstancesPerNode": 200,
    "seed": 42,
}

dt = DecisionTreeClassifier(**tree_settings)


In [0]:
# Stage order: index categoricals -> one-hot encode -> assemble vector -> tree.
pipeline_stages = [*indexers, encoder, assembler, dt]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training partition.
dt_model = pipeline.fit(train_df)


In [0]:
import mlflow
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
# Evaluation settings.
label_col = "approve"    # ground-truth column
pred_col = "prediction"  # name of the model's predicted-label column
# Class treated as "positive" for precision / recall / F1. In a banking setting
# you may switch this to 0.0 (e.g. treat "rejected / bad customer" as positive).
positive_label = 1.0
seed = 42


In [0]:
# NOTE(review): `sample_output_sdf` is only assigned further down, inside the
# MLflow run cell, so on a fresh kernel run top-to-bottom this line raises
# NameError. Presumably this cell was executed out of order — consider moving
# it after the cell that defines the variable.
sample_output_sdf.display()

prediction,probability
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7223230490018149, 0.2776769509981851))"
1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.31030444964871196, 0.689695550351288))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9473087104500189, 0.05269128954998109))"


In [0]:
from pyspark.ml.functions import vector_to_array

In [0]:
# Train, evaluate and register the decision-tree pipeline inside one MLflow run.
# `infer_signature` is imported here because no other cell in this notebook
# imports it; without this the signature step fails on a fresh kernel.
from mlflow.models import infer_signature

with mlflow.start_run(run_name="dt_cc_approval") as run:
    # ---- train
    model = pipeline.fit(train_df)

    # ---- predict
    pred_df = model.transform(test_df)

    # ---- AUC (threshold-free)
    auc = BinaryClassificationEvaluator(
        labelCol=label_col,
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    ).evaluate(pred_df)

    # ---- Accuracy/Precision/Recall/F1 for chosen positive label
    # MulticlassMetrics expects an RDD of (prediction, label) float pairs.
    rdd = pred_df.select("prediction", label_col).rdd.map(lambda r: (float(r["prediction"]), float(r[label_col])))
    m = MulticlassMetrics(rdd)

    accuracy = m.accuracy
    precision = m.precision(positive_label)
    recall = m.recall(positive_label)
    f1 = m.fMeasure(positive_label)

    # ---- log metrics (metric names carry the positive class, e.g. "Recall_pos1")
    mlflow.log_metric("AUC", float(auc))
    mlflow.log_metric("Accuracy", float(accuracy))
    mlflow.log_metric(f"Precision_pos{int(positive_label)}", float(precision))
    mlflow.log_metric(f"Recall_pos{int(positive_label)}", float(recall))
    mlflow.log_metric(f"F1_pos{int(positive_label)}", float(f1))

    # ---- log params (optional but useful)
    # Hyper-parameters are read from the estimator itself so the logged values
    # cannot drift from what was actually trained.
    mlflow.log_params({
        "algo": "DecisionTree",
        "maxDepth": dt.getMaxDepth(),
        "minInstancesPerNode": dt.getMinInstancesPerNode(),
        "positive_label": int(positive_label),
        "seed": seed
    })

    # ---- signature (production contract)
    # input = model raw input columns (drop label); include only cols model expects
    input_cols = ["age","province","employment_type","income_annual","credit_score","dti","utilization","delinq_12m","num_cards","requested_limit"]
    sample_input_sdf = df.select(*input_cols).limit(200)

    # Score the sample with the model fitted in THIS run (the one being logged),
    # not with a model left over from an earlier cell. The probability vector is
    # converted to Array[Double] so it can be turned into a pandas column.
    sample_output_sdf = (
        model.transform(sample_input_sdf)
        .select(
            F.col("prediction").cast("double").alias("prediction"),
            vector_to_array("probability").alias("probability")
        )
    )

    signature = infer_signature(
        sample_input_sdf.toPandas(),
        sample_output_sdf.toPandas()
    )
    input_example = sample_input_sdf.limit(5).toPandas()

    # ---- log model to this run and register it under the given model name
    mlflow.spark.log_model(
        spark_model=model,
        artifact_path="dt_cc_model",
        registered_model_name="dbx.default.dt_cc_model",
        signature=signature,
        input_example=input_example
    )

    


2026/01/08 04:14:42 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/50 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Registered model 'dbx.default.dt_cc_model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/56 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/56 [00:00<?, ?it/s]

🔗 Created version '3' of model 'dbx.default.dt_cc_model': https://adb-7405605317278017.17.azuredatabricks.net/explore/data/models/dbx/default/dt_cc_model/version/3?o=7405605317278017
