

先做 fake data（更像银行贷款定价/利差 spread 场景）

再用 PySpark ML 的 LinearRegression 训练、评估、解释系数

最后给一个 单笔预测示例

场景：预测企业贷款 spread_bps（利差，单位 bps）
特征：credit_score、log_revenue、industry_risk、tenor_years、secured、utilization、relationship_years

0) 配置区（集中管理参数）

In [0]:
# Databricks Notebook - PySpark
from pyspark.sql import functions as F
from pyspark.sql import types as T

# ============== Config ==============
SEED = 42
N = 50000          # 想大就大：5w / 20w
TRAIN_FRAC = 0.8

REG_PARAM = 0.05
ELASTIC_NET = 0.0

EXPERIMENT_PATH = "/Shared/Spread"
RUN_NAME = "Linear_regression"


In [0]:
industry_map = {
    "Tech": 0.9,
    "Healthcare": 0.8,
    "Manufacturing": 1.0,
    "Retail": 1.1,
    "RealEstate": 1.2,
    "Energy": 1.4
}

# 方便：把 map 变成 create_map
industry_kv = []
for k, v in industry_map.items():
    industry_kv += [F.lit(k), F.lit(float(v))]
industry_risk_map = F.create_map(*industry_kv)

industry_list = list(industry_map.keys())

def clamp(col, lo, hi):
    return F.least(F.greatest(col, F.lit(lo)), F.lit(hi))

base = (
    spark.range(N)
    .withColumnRenamed("id", "loan_id_long")
    .withColumn("loan_id", F.concat(F.lit("LN"), F.lpad(F.col("loan_id_long").cast("string"), 8, "0")))
)

df = (
    base
    # credit_score ~ N(680,55) clamp 520-820
    .withColumn("credit_score", clamp((F.randn(SEED) * 55 + 680).cast("double"), 520, 820))

    # revenue: lognormal approx: exp(N(14.2,1.0)) clamp [2e5, 5e9]
    .withColumn("revenue", clamp(F.exp(F.randn(SEED + 1) * 1.0 + 14.2), 2e5, 5e9))

    .withColumn("log_revenue", F.log(F.col("revenue")))

    # industry: 从列表里抽
    .withColumn("industry_idx", (F.floor(F.rand(SEED + 2) * F.lit(len(industry_list))) + 1).cast("int"))
    .withColumn("industry", F.element_at(F.array(*[F.lit(x) for x in industry_list]), F.col("industry_idx")))
    .drop("industry_idx")

    .withColumn("industry_risk", industry_risk_map[F.col("industry")].cast("double"))

    # tenor_years ~ N(3,1.5) clamp 1-10
    .withColumn("tenor_years", clamp((F.randn(SEED + 3) * 1.5 + 3.0), 1, 10))

    # secured Bernoulli(p=0.55)
    .withColumn("secured", (F.when(F.rand(SEED + 4) < 0.55, 1.0).otherwise(0.0)).cast("double"))

    # utilization: rand()**0.7 偏高 + clamp 0.05-0.98
    .withColumn("utilization", clamp(F.pow(F.rand(SEED + 5), 0.7), 0.05, 0.98))

    # relationship_years ~ N(4,3) clamp 0-20
    .withColumn("relationship_years", clamp((F.randn(SEED + 6) * 3.0 + 4.0), 0, 20))

    # noise ~ N(0,50)
    .withColumn("noise", (F.randn(SEED + 7) * 50.0))

    # spread_bps = 线性关系 + 噪声，最后 clamp 40-900
    .withColumn(
        "spread_bps_raw",
        F.lit(320.0)
        - F.lit(0.55) * (F.col("credit_score") - 650.0)
        - F.lit(12.0) * (F.col("log_revenue") - 14.0)
        + F.lit(45.0) * (F.col("industry_risk") - 1.0)
        + F.lit(9.0)  * (F.col("tenor_years") - 3.0)
        - F.lit(28.0) * (F.col("secured"))
        + F.lit(70.0) * (F.col("utilization") - 0.5)
        - F.lit(3.5)  * (F.col("relationship_years"))
        + F.col("noise")
    )
    .withColumn("spread_bps", clamp(F.col("spread_bps_raw"), 40, 900).cast("double"))
    .drop("noise", "spread_bps_raw", "loan_id_long")
)

display(df.limit(10))
df.select("spread_bps").summary("count","mean","stddev","min","max").display()


loan_id,credit_score,revenue,log_revenue,industry,industry_risk,tenor_years,secured,utilization,relationship_years,spread_bps
LN00000000,811.1463479832641,4424666.419441249,15.302705448145536,RealEstate,1.2,4.11005931749252,1.0,0.7565255582094959,2.4302696999265008,214.09481142640269
LN00000001,690.5651372271144,2602815.918160719,14.772104462367544,Energy,1.4,2.0370999513670447,1.0,0.8015486040278841,7.622087958540161,221.759405535142
LN00000002,720.3553509330761,755125.1282918431,13.53463874733587,Retail,1.1,5.680645071559674,1.0,0.98,8.153283044161084,352.579183118226
LN00000003,651.2653589235572,670795.4914555486,13.41621958776303,RealEstate,1.2,2.3729360101200374,1.0,0.05,1.392852607306346,281.99337009562174
LN00000004,793.3046298624807,2173900.370366336,14.592033517814793,RealEstate,1.2,1.0,1.0,0.2616198506968052,4.654034622587965,140.6582953672569
LN00000005,691.5298611046486,3939780.436376631,15.18663555291336,Healthcare,0.8,4.041895808850663,1.0,0.8926228767662583,7.218972515716068,228.63535376488497
LN00000006,649.8310673725493,2963915.8797220173,14.902021884334788,Healthcare,0.8,4.30508685609737,0.0,0.4746183736099128,5.64237540362373,262.1016940676133
LN00000007,784.9382582694418,2580058.138869136,14.763322491089305,Energy,1.4,2.155478950858633,1.0,0.8698281838263124,6.416397231622928,188.08949850503143
LN00000008,657.0617737940571,1181013.6238039825,13.98188363093344,Healthcare,0.8,1.411475991653437,1.0,0.8556033100871968,0.8997805196180035,288.30097868263096
LN00000009,739.7974309536336,811753.7172434053,13.606952269250456,Energy,1.4,6.511019172940071,1.0,0.7595476877354659,1.4049453758729973,208.97636928172784


summary,spread_bps
count,50000.0
mean,281.0553364440356
stddev,66.64654830885308
min,40.0
max,580.874710631018


2) Train/Test split（可复现）

In [0]:
train_df, test_df = df.randomSplit([TRAIN_FRAC, 1-TRAIN_FRAC], seed=SEED)
print(train_df.count(), test_df.count())


40144 9856


3) Pipeline：Assembler + Scaler + LinearRegression

In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

feature_cols = [
    "credit_score",
    "log_revenue",
    "industry_risk",
    "tenor_years",
    "secured",
    "utilization",
    "relationship_years"
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="raw_features")

scaler = StandardScaler(
    inputCol="raw_features",
    outputCol="features",
    withStd=True,
    withMean=False   # ✅ 默认建议 False（大数据更稳）
)

lr = LinearRegression(
    featuresCol="features",
    labelCol="spread_bps",
    predictionCol="prediction",
    regParam=REG_PARAM,
    elasticNetParam=ELASTIC_NET
)

pipeline = Pipeline(stages=[assembler, scaler, lr])

model = pipeline.fit(train_df)
pred_test = model.transform(test_df)

display(pred_test.select("loan_id", "spread_bps", "prediction").limit(10))


loan_id,spread_bps,prediction
LN00000002,352.579183118226,292.8614375850167
LN00000006,262.1016940676133,291.2794686599738
LN00000008,288.30097868263096,286.95366996205445
LN00000013,232.7080763423873,209.40667524935716
LN00000019,266.6349454597876,239.8248180424733
LN00000023,214.17465146077572,282.29120640766814
LN00000029,268.71961471382747,306.07401048883827
LN00000035,305.28351310540995,287.0616115630519
LN00000045,312.2401674552684,295.695075401704
LN00000046,221.7313305738205,244.9091618075049


4) 评估：RMSE / MAE / R2 + 残差分析

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

rmse = RegressionEvaluator(labelCol="spread_bps", predictionCol="prediction", metricName="rmse").evaluate(pred_test)
mae  = RegressionEvaluator(labelCol="spread_bps", predictionCol="prediction", metricName="mae").evaluate(pred_test)
r2   = RegressionEvaluator(labelCol="spread_bps", predictionCol="prediction", metricName="r2").evaluate(pred_test)

print(f"Test RMSE: {rmse:.2f} bps")
print(f"Test MAE : {mae:.2f} bps")
print(f"Test R2  : {r2:.4f}")

resid_df = pred_test.withColumn("residual", F.col("spread_bps") - F.col("prediction"))
resid_df.select("residual").summary("count","mean","stddev","min","max").display()

display(
    resid_df.select("loan_id", "spread_bps", "prediction", "residual",
                    "credit_score","industry","tenor_years","secured","utilization")
            .orderBy(F.desc(F.abs("residual")))
            .limit(20)
)


Test RMSE: 50.16 bps
Test MAE : 40.15 bps
Test R2  : 0.4211


summary,residual
count,9856.0
mean,0.5763119205302574
stddev,50.15712177374886
min,-173.17570889609374
max,176.1091341124456


loan_id,spread_bps,prediction,residual,credit_score,industry,tenor_years,secured,utilization
LN00015516,502.765258685688,326.6561245732424,176.1091341124456,630.8759851495157,Manufacturing,3.944163358739313,0.0,0.1595375614899194
LN00035637,393.1217075658336,218.28093774335275,174.8407698224809,778.5230204425612,Retail,5.635064610899338,1.0,0.2937141968898667
LN00048992,123.0624273806058,296.23813627669955,-173.17570889609374,720.223772859712,Retail,5.00665332409752,0.0,0.6146268905464013
LN00046194,103.41517648895598,274.08149693100427,-170.6663204420483,643.9254649442256,Tech,2.3412272645361987,1.0,0.7263193881122663
LN00003147,495.3770413393606,325.666497331743,169.71054400761756,620.6762433128278,Tech,4.536313641182646,1.0,0.949969941987909
LN00034237,443.5705831539972,274.4134665435149,169.15711661048226,591.1107879285441,Manufacturing,5.700319031258373,1.0,0.05
LN00047900,420.1646210184706,253.38709323969292,166.77752777877765,716.765608136853,Energy,2.828559542511981,1.0,0.4180312045635258
LN00030594,509.8069532349723,343.6773102774412,166.12964295753113,647.8889987052436,Retail,3.3250584747214003,0.0,0.916402891554646
LN00005252,389.9616336539566,224.772526672195,165.1891069817616,712.5536336935046,Healthcare,1.0,1.0,0.8754579495038212
LN00005971,105.36452334347,268.1498175539188,-162.7852942104488,756.8833621177037,Energy,4.139914685010919,0.0,0.5517220275949626


5) 系数解释（从 PipelineModel 里取出 LR 模型）


In [0]:
import numpy as np

lr_model = model.stages[-1]  # LinearRegressionModel
print("Intercept (bps):", lr_model.intercept)

coef = lr_model.coefficients.toArray().tolist()

coef_df = (
    spark.createDataFrame(list(zip(feature_cols, coef)), ["feature", "coef_per_1std"])
    .withColumn("abs_coef", F.abs("coef_per_1std"))
    .orderBy(F.desc("abs_coef"))
    .drop("abs_coef")
)

display(coef_df)


Intercept (bps): 739.8726463181339


feature,coef_per_1std
credit_score,-30.360383629725195
utilization,18.88741270758431
secured,-14.275365440272068
tenor_years,12.512759839009403
log_revenue,-11.72506652823599
relationship_years,-9.540059271970987
industry_risk,8.506268231072035


5) 系数解释（从 PipelineModel 里取出 LR 模型）

注意：这里系数是在 scaled 后的特征空间。想“回到原始单位”的解释我也给你了（见下方 可选）。

In [0]:
scaler_model = model.stages[1]  # StandardScalerModel
stds = np.array(scaler_model.std.toArray())  # 每个特征 std

coef_scaled = np.array(lr_model.coefficients.toArray())
coef_raw = coef_scaled / stds

coef_raw_df = spark.createDataFrame(
    [(feature_cols[i], float(coef_raw[i])) for i in range(len(feature_cols))],
    ["feature", "coef_bps_per_unit_raw"]
).orderBy(F.desc(F.abs("coef_bps_per_unit_raw")))

display(coef_raw_df)


feature,coef_bps_per_unit_raw
utilization,71.57258546067567
industry_risk,43.16991279386702
secured,-28.70386895443565
log_revenue,-11.925921027078584
tenor_years,9.0516167966886
relationship_years,-3.441609795508712
credit_score,-0.552339824799903


6) 单笔预测示例（输入 revenue 自动算 log_revenue，防止手算出错）

In [0]:
new_df = spark.createDataFrame(
    [("NEW001", 720.0, 2e8, 1.0, 5.0, 1.0, 0.70, 6.0)],
    ["loan_id","credit_score","revenue","industry_risk","tenor_years","secured","utilization","relationship_years"]
).withColumn("log_revenue", F.log("revenue")) \
 .drop("revenue")

new_pred = model.transform(new_df)
display(new_pred.select("loan_id", "prediction"))


loan_id,prediction
NEW001,203.41324898159564


In [0]:
scaled_df = model.transform(train_df)

scaled_df.select("raw_features", "features").limit(5).display()


raw_features,features
"Map(vectorType -> dense, length -> 7, values -> List(811.1463479832641, 15.302705448145536, 1.2, 4.11005931749252, 1.0, 0.7565255582094959, 2.4302696999265008))","Map(vectorType -> dense, length -> 7, values -> List(14.757008251157352, 15.564846155519888, 6.090084858059036, 2.9731795728724695, 2.0107274363330485, 2.8668029341251056, 0.8767283060356754))"
"Map(vectorType -> dense, length -> 7, values -> List(690.5651372271144, 14.772104462367544, 1.4, 2.0370999513670447, 1.0, 0.8015486040278841, 7.622087958540161))","Map(vectorType -> dense, length -> 7, values -> List(12.563300634169154, 15.025155788898966, 7.105099001068875, 1.4736195989986514, 2.0107274363330485, 3.0374147508109632, 2.749694926677453))"
"Map(vectorType -> dense, length -> 7, values -> List(651.2653589235572, 13.41621958776303, 1.2, 2.3729360101200374, 1.0, 0.05, 1.392852607306346))","Map(vectorType -> dense, length -> 7, values -> List(11.8483283555709, 13.646044131203617, 6.090084858059036, 1.71656035303321, 2.0107274363330485, 0.18947165122286824, 0.5024764564188074))"
"Map(vectorType -> dense, length -> 7, values -> List(793.3046298624807, 14.592033517814793, 1.2, 1.0, 1.0, 0.26161985069680527, 4.654034622587965))","Map(vectorType -> dense, length -> 7, values -> List(14.432417772290206, 14.84200016595764, 6.090084858059036, 0.7233909156051689, 2.0107274363330485, 0.9913909020840789, 1.678959290409757))"
"Map(vectorType -> dense, length -> 7, values -> List(691.5298611046486, 15.186635552913359, 0.8, 4.041895808850663, 1.0, 0.8926228767662583, 7.218972515716068))","Map(vectorType -> dense, length -> 7, values -> List(12.580851644854535, 15.446787942304061, 4.0600565720393575, 2.923870709945176, 2.0107274363330485, 3.3825346076041956, 2.604269618805377))"


7) MLflow：记录参数/指标/模型（注册名做成可选）

In [0]:
import mlflow
from mlflow.models.signature import infer_signature

mlflow.set_experiment(EXPERIMENT_PATH)

sample_input = train_df.select(feature_cols).limit(50).toPandas()
sample_output = pred_test.select("prediction").limit(50).toPandas()
signature = infer_signature(sample_input, sample_output)

with mlflow.start_run(run_name=RUN_NAME):
    mlflow.log_param("regParam", REG_PARAM)
    mlflow.log_param("elasticNetParam", ELASTIC_NET)
    mlflow.log_param("withMean", False)
    mlflow.log_metric("test_rmse", rmse)
    mlflow.log_metric("test_mae", mae)
    mlflow.log_metric("test_r2", r2)

    # （可选）如果你确定 UC 注册名格式正确，再打开
    mlflow.spark.log_model(model, artifact_path="spread_LR_model",\
                           registered_model_name="dbx.default.spread_LR_model",\
                           signature=signature,\
                               )


2026/01/07 05:54:32 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Registered model 'dbx.default.spread_LR_model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

🔗 Created version '5' of model 'dbx.default.spread_lr_model': https://adb-7405605317278017.17.azuredatabricks.net/explore/data/models/dbx/default/spread_lr_model/version/5?o=7405605317278017


8) Feature Store（强烈建议加“能用再用”的防护）

In [0]:
feature_table_df = df.select("loan_id", *feature_cols)

try:
    from databricks.feature_store import FeatureStoreClient
    fs = FeatureStoreClient()

    table_name = "pricing_features"   # UC 下通常需要 catalog.schema.table

    # 如果表已存在：用 write_table（append/merge），否则 create_table
    if fs.table_exists(table_name):
        fs.write_table(name=table_name, df=feature_table_df, mode="merge")
        print(f"Feature table exists -> merged into {table_name}")
    else:
        fs.create_table(
            name=table_name,
            primary_keys=["loan_id"],
            df=feature_table_df,
            description="Spread pricing features without vector columns"
        )
        print(f"Created feature table: {table_name}")

except Exception as e:
    print("Feature Store step skipped due to environment limitation:")
    print(str(e)[:500])


  from google.protobuf import service as _service


Feature Store step skipped due to environment limitation:
'FeatureStoreClient' object has no attribute 'table_exists'
