

先做 fake data（更像银行贷款定价/利差 spread 场景）

再用 PySpark ML 的 LinearRegression 训练、评估、解释系数

最后给一个单笔预测示例

场景：预测企业贷款 spread_bps（利差，单位 bps）
特征：credit_score、log_revenue、industry_risk、tenor_years、secured、utilization、relationship_years

In [0]:
# Databricks Notebook - PySpark
from pyspark.sql import functions as F
from pyspark.sql import types as T
import random
import math

# ==========================
# 1) Generate fake loan data
# ==========================

# Fixed seed so every run of the notebook produces the same synthetic data.
random.seed(42)

N = 5000  # number of synthetic loans; can be raised, e.g. to 20000

# Order matters: random.choice() indexes into this list, so reordering it
# changes the generated data even with the same seed.
industry_list = ["Retail", "Manufacturing", "Tech", "Energy", "Healthcare", "RealEstate"]

# Risk factor per industry (1.0 is the reference level used in the spread
# formula below). Built once here rather than on every loop iteration.
INDUSTRY_RISK = {
    "Tech": 0.9,
    "Healthcare": 0.8,
    "Manufacturing": 1.0,
    "Retail": 1.1,
    "RealEstate": 1.2,
    "Energy": 1.4,
}

# Standard deviation (in bps) of the Gaussian noise added to each spread.
NOISE_STD_BPS = 50


def clamp(x, lo, hi):
    """Return x limited to the closed interval [lo, hi]."""
    return max(lo, min(hi, x))


rows = []
for i in range(N):
    # Kept as a float because the DataFrame schema declares loan_id as DoubleType.
    loan_id = float(i)

    # NOTE: the order of the random draws below determines the generated
    # values for a given seed; do not reorder them.
    credit_score = int(clamp(random.gauss(680, 55), 520, 820))     # 520-820
    revenue = clamp(math.exp(random.gauss(14.2, 1.0)), 2e5, 5e9)   # log-normal, arbitrary currency unit
    industry = random.choice(industry_list)
    industry_risk = INDUSTRY_RISK[industry]

    tenor_years = int(clamp(random.gauss(3.0, 1.5), 1, 10))        # 1-10 years
    secured = 1 if random.random() < 0.55 else 0                   # about 55% of loans are secured
    utilization = clamp(random.random() ** 0.7, 0.05, 0.98)        # skewed towards higher utilization
    relationship_years = int(clamp(random.gauss(4.0, 3.0), 0, 20))  # 0-20 years

    # Linear ground truth plus noise (spread in bps):
    # - higher credit score        => lower spread
    # - larger revenue (in logs)   => lower spread
    # - riskier industry           => higher spread
    # - longer tenor               => higher spread
    # - secured loan               => lower spread
    # - higher utilization         => higher spread
    # - longer relationship        => lower spread
    log_revenue = math.log(revenue)

    noise = random.gauss(0, NOISE_STD_BPS)
    spread_bps = (
        320
        - 0.55 * (credit_score - 650)
        - 12.0 * (log_revenue - 14.0)
        + 45.0 * (industry_risk - 1.0)
        + 9.0  * (tenor_years - 3.0)
        - 28.0 * secured
        + 70.0 * (utilization - 0.5)
        - 3.5  * relationship_years
        + noise
    )
    spread_bps = clamp(spread_bps, 40, 900)  # keep the label within 40-900 bps

    # Every numeric field is cast to float to match the all-double schema.
    rows.append((
        loan_id,
        float(credit_score),
        float(revenue),
        float(log_revenue),
        industry,
        float(industry_risk),
        float(tenor_years),
        float(secured),
        float(utilization),
        float(relationship_years),
        float(spread_bps),
    ))

# Column layout of the synthetic loan table, listed in the same order as the
# tuples appended to `rows`. Every column is a non-nullable double except the
# industry name, which is a string.
_loan_columns = [
    ("loan_id", T.DoubleType()),
    ("credit_score", T.DoubleType()),
    ("revenue", T.DoubleType()),
    ("log_revenue", T.DoubleType()),
    ("industry", T.StringType()),
    ("industry_risk", T.DoubleType()),
    ("tenor_years", T.DoubleType()),
    ("secured", T.DoubleType()),
    ("utilization", T.DoubleType()),
    ("relationship_years", T.DoubleType()),
    ("spread_bps", T.DoubleType()),
]

schema = T.StructType(
    [T.StructField(col_name, col_type, False) for col_name, col_type in _loan_columns]
)

# Turn the locally generated rows into a Spark DataFrame and preview it.
df = spark.createDataFrame(rows, schema)
df.display()

loan_id,credit_score,revenue,log_revenue,industry,industry_risk,tenor_years,secured,utilization,relationship_years,spread_bps
0.0,672.0,1235636.0301788626,14.02709639966848,Tech,0.9,3.0,1.0,0.8104689895689138,5.0,215.19050033164697
1.0,659.0,2268797.0935317622,14.634760334259427,Manufacturing,1.0,3.0,1.0,0.0788287198156489,5.0,301.1794868410116
2.0,642.0,1206648.4509656157,14.003357199135053,Healthcare,0.8,2.0,0.0,0.2768445547328181,9.0,223.5693009410037
3.0,700.0,2120500.270340948,14.56716259538602,Retail,1.1,5.0,1.0,0.1949171322385725,6.0,182.1771373996592
4.0,711.0,323023.2587980084,12.68547960827486,Healthcare,0.8,5.0,1.0,0.4236354123331995,8.0,177.98143336184714
5.0,597.0,963447.770690924,13.778273557469928,Retail,1.1,3.0,1.0,0.3551556494982693,3.0,327.440983795766
6.0,682.0,2323923.6066222247,14.658767524908592,Tech,0.9,1.0,1.0,0.4846091103880789,5.0,188.0199177622092
7.0,604.0,1256592.8372707951,14.043914518847831,RealEstate,1.2,2.0,1.0,0.8016081836118806,5.0,362.1764659250413
8.0,758.0,1337045.0147395567,14.105972523983697,Healthcare,0.8,3.0,0.0,0.1328600982203029,6.0,243.04965931552417
9.0,590.0,1087087.234084565,13.899012415031027,Healthcare,0.8,3.0,0.0,0.44514997336051,2.0,292.793294092763


In [0]:
# Random 80/20 train/test split; the seed is fixed so the split is repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = df.randomSplit(split_weights, seed=42)


In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Numeric model inputs, in the order they are packed into the feature vector.
feature_cols = [
    "credit_score", "log_revenue", "industry_risk", "tenor_years",
    "secured", "utilization", "relationship_years",
]

# Stage 1: pack the individual feature columns into one vector column.
assembler = VectorAssembler(outputCol="raw_features", inputCols=feature_cols)

# Stage 2: centre each feature (withMean) and scale it to unit standard
# deviation (withStd); the regression reads the scaled "features" column.
scaler_settings = dict(
    inputCol="raw_features",
    outputCol="features",
    withMean=True,
    withStd=True,
)
scaler = StandardScaler(**scaler_settings)

# Stage 3: regularised linear regression on the scaled features, predicting
# the spread in bps.
lr_settings = dict(
    featuresCol="features",
    labelCol="spread_bps",
    predictionCol="prediction",
    regParam=0.05,
    elasticNetParam=0.0,
)
lr = LinearRegression(**lr_settings)

pipeline = Pipeline(stages=[assembler, scaler, lr])

# Fit on the training split only, then score the held-out split.
model = pipeline.fit(train_df)
pred_test = model.transform(test_df)

# Preview a few actual-vs-predicted spreads.
preview_cols = ["loan_id", "spread_bps", "prediction"]
display(pred_test.select(*preview_cols).limit(10))


loan_id,spread_bps,prediction
2.0,223.5693009410037,258.72824335613296
6.0,188.0199177622092,225.5479509583465
8.0,243.04965931552417,205.24398488318485
13.0,312.662933926732,244.6726913596464
19.0,260.6385562872268,260.5034356705634
23.0,179.47905107823595,254.65050015666253
29.0,245.72640669868255,270.8090898689085
35.0,303.6721398145316,313.32030025225515
45.0,289.1299347737401,299.7040103571456
46.0,138.3085032911897,271.55994290178904


In [0]:
# Score the training split with the fitted pipeline so the assembled vector
# can be compared side by side with its standardized counterpart.
scaled_df = model.transform(train_df)

display(scaled_df.select("raw_features", "features").limit(5))

raw_features,features
"Map(vectorType -> dense, length -> 7, values -> List(672.0, 14.02709639966848, 0.9, 3.0, 1.0, 0.8104689895689138, 5.0))","Map(vectorType -> dense, length -> 7, values -> List(-0.10409699349425486, -0.1954029699990374, -0.8784463130787723, 0.2684366369263994, 0.9103729544958232, 0.8291917487000033, 0.502645126658179))"
"Map(vectorType -> dense, length -> 7, values -> List(659.0, 14.634760334259429, 1.0, 3.0, 1.0, 0.07882871981564894, 5.0))","Map(vectorType -> dense, length -> 7, values -> List(-0.3414681515195793, 0.42548645855546013, -0.3769705698069462, 0.2684366369263994, 0.9103729544958232, -1.9028953835276805, 0.502645126658179))"
"Map(vectorType -> dense, length -> 7, values -> List(700.0, 14.567162595386021, 1.1, 5.0, 1.0, 0.19491713223857252, 6.0))","Map(vectorType -> dense, length -> 7, values -> List(0.4071639622525979, 0.3564174895554501, 0.1245051734648805, 1.748662542582962, 0.9103729544958232, -1.4693986626755482, 0.8742568775657362))"
"Map(vectorType -> dense, length -> 7, values -> List(711.0, 12.685479608274859, 0.8, 5.0, 1.0, 0.42363541233319957, 8.0))","Map(vectorType -> dense, length -> 7, values -> List(0.6080164805817186, -1.566219362958474, -1.3799220563505985, 1.748662542582962, 0.9103729544958232, -0.615320100047347, 1.6174803793808508))"
"Map(vectorType -> dense, length -> 7, values -> List(597.0, 13.778273557469928, 1.1, 3.0, 1.0, 0.35515564949826933, 3.0))","Map(vectorType -> dense, length -> 7, values -> List(-1.4735459821018961, -0.4496413137769365, 0.1245051734648805, 0.2684366369263994, 0.9103729544958232, -0.8710368594903597, -0.2405783751569356))"


In [0]:
from mlflow.models.signature import infer_signature

# Build the signature examples from the SAME rows: score a small sample of the
# training features once, then split the result into the input columns and the
# prediction column. (Previously the inputs came from train_df and the outputs
# from test-set predictions, so the two examples did not correspond.)
sample_pdf = (
    model.transform(train_df.select(*feature_cols).limit(50))
    .select(*feature_cols, "prediction")
    .toPandas()
)
sample_input = sample_pdf[list(feature_cols)]
sample_output = sample_pdf[["prediction"]]

# Infer the model signature (input/output schema) from the paired examples.
signature = infer_signature(sample_input, sample_output)

In [0]:
import mlflow

from pyspark.ml.evaluation import RegressionEvaluator

mlflow.set_experiment("/Shared/Spread")
with mlflow.start_run(run_name="Linear_regression"):
    # ==================================
    # 3) Log and register the fitted pipeline
    # ==================================
    mlflow.spark.log_model(
        model,
        artifact_path="spread_LR_model",
        registered_model_name="dbx.default.spread_LR_model",
        signature=signature,
    )
    # `lr.regParam` is the Param descriptor object, not its value; use the
    # getter so the numeric regularisation strength is what gets logged.
    mlflow.log_param("regParam", lr.getRegParam())

    # ==================================
    # 4) Evaluation on the test split: RMSE / R2 / MAE
    # ==================================
    rmse_eval, r2_eval, mae_eval = (
        RegressionEvaluator(
            labelCol="spread_bps", predictionCol="prediction", metricName=metric
        )
        for metric in ("rmse", "r2", "mae")
    )

    rmse = rmse_eval.evaluate(pred_test)
    r2 = r2_eval.evaluate(pred_test)
    mae = mae_eval.evaluate(pred_test)

    for metric_key, metric_value in (
        ("test_rmse", rmse),
        ("test_r2", r2),
        ("test_mae", mae),
    ):
        mlflow.log_metric(metric_key, metric_value)

    # ==================================
    # 5) Interpret the coefficients
    # ==================================
    lr_model = model.stages[-1]  # last pipeline stage is the fitted LinearRegressionModel

    print("Intercept (bps):", lr_model.intercept)

    # NOTE: the regression is fitted on the StandardScaler output, so each
    # coefficient is the change in bps per one standard deviation of the raw
    # feature, not per raw unit, despite the "coef_bps_per_unit" column name.
    coef = lr_model.coefficients.toArray().tolist()
    coef_df = (
        spark.createDataFrame(list(zip(feature_cols, coef)), ["feature", "coef_bps_per_unit"])
        .orderBy(F.abs(F.col("coef_bps_per_unit")).desc())  # largest effect first
    )
    display(coef_df)

    # ==================================
    # 6) Simple residual analysis: error distribution and worst-predicted loans
    # ==================================
    resid_df = pred_test.withColumn("residual", F.col("spread_bps") - F.col("prediction"))

    display(
        resid_df.select("residual")
                .summary("count", "mean", "stddev", "min", "max")
    )

    # The 20 loans with the largest absolute error, with a few descriptive columns.
    display(
        resid_df.select("loan_id", "spread_bps", "prediction", "residual",
                        "credit_score", "industry", "tenor_years", "secured", "utilization")
                .orderBy(F.abs(F.col("residual")).desc())
                .limit(20)
    )

    # ==================================
    # 7) Single-loan prediction (simulated new customer / new loan)
    # ==================================
    # Field order: loan_id, credit_score, log_revenue, industry_risk,
    # tenor_years, secured, utilization, relationship_years.
    new_data = [
        ("NEW001", 720.0, math.log(2e8), 1.0, 5.0, 1.0, 0.70, 6.0),
    ]
    # loan_id is a string here; it is only an identifier and is not one of
    # the feature columns fed to the model.
    new_schema = T.StructType([
        T.StructField("loan_id", T.StringType(), False),
        T.StructField("credit_score", T.DoubleType(), False),
        T.StructField("log_revenue", T.DoubleType(), False),
        T.StructField("industry_risk", T.DoubleType(), False),
        T.StructField("tenor_years", T.DoubleType(), False),
        T.StructField("secured", T.DoubleType(), False),
        T.StructField("utilization", T.DoubleType(), False),
        T.StructField("relationship_years", T.DoubleType(), False),
    ])

    new_df = spark.createDataFrame(new_data, new_schema)
    new_pred = model.transform(new_df)

    display(new_pred.select("loan_id", "prediction"))


2026/01/07 00:44:21 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Registered model 'dbx.default.spread_LR_model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

🔗 Created version '2' of model 'dbx.default.spread_lr_model': https://adb-7405605317278017.17.azuredatabricks.net/explore/data/models/dbx/default/spread_lr_model/version/2?o=7405605317278017


Intercept (bps): 281.6071778298083


feature,coef_bps_per_unit
credit_score,-29.81936232037033
utilization,18.823409696539272
secured,-14.860595303052037
tenor_years,12.007219485095826
log_revenue,-11.648892962401025
relationship_years,-10.881382557270385
industry_risk,8.68736988380695


summary,residual
count,936.0
mean,-1.956131521811787
stddev,49.62336478571721
min,-169.445916599165
max,179.53822007591626


loan_id,spread_bps,prediction,residual,credit_score,industry,tenor_years,secured,utilization
3534.0,426.9510476668098,247.4128275908936,179.53822007591626,727.0,Retail,1.0,0.0,0.4049596856675202
3746.0,78.57963599216197,248.02555259132697,-169.445916599165,698.0,Retail,4.0,1.0,0.6254437315087518
4445.0,436.3935273004778,283.0542890144155,153.3392382860623,725.0,Energy,3.0,1.0,0.9212360340322242
1189.0,408.0064069392881,257.0731295519594,150.9332773873287,703.0,Energy,2.0,1.0,0.4589642717382999
2842.0,174.583172262133,324.9208986879281,-150.33772642579513,634.0,Retail,1.0,0.0,0.8189708022412492
1588.0,393.66841050483504,244.30418896905624,149.3642215357788,741.0,Healthcare,3.0,1.0,0.692059771655627
292.0,217.0654565157831,364.3521162714128,-147.2866597556297,640.0,Energy,4.0,0.0,0.7983091708631631
2548.0,198.77445729797256,339.2347184111559,-140.46026111318332,666.0,RealEstate,5.0,1.0,0.9318470656828144
4492.0,190.8815574804173,330.5425841151465,-139.66102663472924,663.0,Manufacturing,2.0,0.0,0.950325935131394
3876.0,213.30090234634528,349.85123471567226,-136.55033236932698,567.0,RealEstate,2.0,1.0,0.8203882509860676


loan_id,prediction
NEW001,202.8311746939441


In [0]:
from databricks.feature_store import FeatureStoreClient
fs = FeatureStoreClient()

# Columns saved to the feature table: the loan key plus the raw model input
# columns. The assembled/scaled vector columns and the spread_bps label are
# not selected.
feature_table_df = df.select(["loan_id", *feature_cols])

# Create the feature table keyed by loan_id and populate it with the data above.
table_spec = dict(
    name="pricing_features",
    primary_keys=["loan_id"],
    df=feature_table_df,
    description="Spread pricing features without vector columns",
)
fs.create_table(**table_spec)




<FeatureTable: name='dbx.default.pricing_features', table_id='e703fef7-2dfd-4c37-9d84-1613ed66333c', description='Spread pricing features without vector columns', primary_keys=['loan_id'], partition_columns=[], features=['loan_id',
 'credit_score',
 'log_revenue',
 'industry_risk',
 'tenor_years',
 'secured',
 'utilization',
 'relationship_years'], creation_timestamp=1767745960529, online_stores=[], notebook_producers=[], job_producers=[], table_data_sources=[], path_data_sources=[], custom_data_sources=[], timestamp_keys=[], tags={}>