In [0]:
import random

import mlflow
import mlflow.spark
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.types import *

# Databricks Feature Store
from databricks.feature_store import FeatureStoreClient

In [0]:
# Synthetic, deliberately noisy fraud data set built with the stdlib RNG.
# The order of RNG draws inside the loop is significant: with the fixed seed
# it determines every generated value.
random.seed(42)

n = 5000
fraud_rate = 0.10

# Noise-strength knobs (tunable)
amount_noise_std = 120        # std-dev of the measurement noise added to the amount
behavior_flip_prob = 0.15     # chance that a behaviour flag is flipped
label_noise_prob = 0.05       # chance that the recorded label is flipped

# (mean, std) of the base amount distribution, keyed by the true label;
# the two distributions overlap on purpose.
_AMOUNT_DIST = {1: (700, 350), 0: (250, 180)}

# P(flag == 1) as (fraud, non-fraud) for night_txn, cross_border, device_change.
_BEHAVIOR_PROBS = ((0.55, 0.25), (0.45, 0.20), (0.35, 0.15))


def _flip_with_prob(bit, prob):
    """Consume one RNG draw and return ``1 - bit`` with probability ``prob``, else ``bit``."""
    return 1 - bit if random.random() < prob else bit


data = []
for i in range(n):
    # Ground-truth label is drawn first.
    true_fraud = int(random.random() < fraud_rate)

    # Base amount from the class-specific Gaussian, then measurement noise,
    # floored at 1.
    mean, std = _AMOUNT_DIST[true_fraud]
    amount = random.gauss(mean, std)
    amount += random.gauss(0, amount_noise_std)
    amount = max(amount, 1)

    # Behaviour flags: draw all three first, then apply the three flip draws,
    # in the same night / cross-border / device order.
    flags = [
        int(random.random() < (p_fraud if true_fraud else p_legit))
        for p_fraud, p_legit in _BEHAVIOR_PROBS
    ]
    flags = [_flip_with_prob(flag, behavior_flip_prob) for flag in flags]
    night_txn, cross_border, device_change = flags

    # Label noise: the recorded label occasionally disagrees with the truth.
    is_fraud = _flip_with_prob(true_fraud, label_noise_prob)

    data.append((
        f"TXN{i:04d}",
        float(amount),
        night_txn,
        cross_border,
        device_change,
        is_fraud,
    ))

# Column name -> Spark type, in the same order as the tuples stored in `data`.
_COLUMN_TYPES = [
    ("transaction_id", StringType()),
    ("amount", DoubleType()),
    ("night_txn", IntegerType()),
    ("cross_border", IntegerType()),
    ("device_change", IntegerType()),
    ("is_fraud", IntegerType()),
]

schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in _COLUMN_TYPES
])

# Turn the generated Python rows into a Spark DataFrame with an explicit schema.
df = spark.createDataFrame(data, schema)


In [0]:
# Render the generated transactions DataFrame for a quick visual check.
df.display()

transaction_id,amount,night_txn,cross_border,device_change,is_fraud
TXN0000,407.6482464553536,1,1,0,1
TXN0001,207.3410002836952,1,0,0,1
TXN0002,93.76459134308914,0,1,0,0
TXN0003,74.10753245975471,0,0,0,0
TXN0004,398.6382517545056,1,1,0,0
TXN0005,377.23746944436624,0,0,0,0
TXN0006,1.0,0,1,0,0
TXN0007,508.4840577379378,0,0,0,0
TXN0008,389.470287895919,0,0,1,1
TXN0009,1.0,0,1,0,0


In [0]:
# Raw model inputs, in the order they are packed into the feature vector.
feature_cols = ["amount", "night_txn", "cross_border", "device_change"]

# Pack the raw columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_feat = assembler.transform(df)

# Standardise every feature (centre on the mean, scale to unit std-dev).
# NOTE(review): the scaler statistics are fitted on every row of df_feat,
# i.e. before the train/test split done later in this notebook - confirm
# that this leakage into the test rows is acceptable.
scaler = StandardScaler(
    withStd=True,
    withMean=True,
    outputCol="scaled_features",
    inputCol="features",
)
scaler_fitted = scaler.fit(df_feat)
df_scaled = scaler_fitted.transform(df_feat)


In [0]:
# Base estimator: logistic regression on the standardised feature vector.
# NOTE(review): the same lr / paramGrid / evaluator / cv objects are built
# again in the cell that calls cv.fit, so this cell looks redundant - confirm.
lr = LogisticRegression(
    labelCol="is_fraud",
    featuresCol="scaled_features",
    predictionCol="prediction",
    probabilityCol="probability",
)

# 3 x 3 grid over regularisation strength and elastic-net mixing.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Candidates are ranked by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="is_fraud",
)

# 3-fold cross-validation with a fixed fold seed.
cv = CrossValidator(
    seed=42,
    numFolds=3,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    estimator=lr,
)

In [0]:
# Share of rows labelled as fraud (is_fraud == 1) in the full data set.
fraud_row_count = df_scaled.filter(col("is_fraud") == 1).count()
total_row_count = df_scaled.count()
fraud_ratio = fraud_row_count / total_row_count

In [0]:
label_col = "is_fraud"
key_col = "transaction_id"     # should be a unique key (txn_id / row_id / pk)

# Same sampling fraction for both label values (0 and 1).
train_frac = 0.8
fractions = dict.fromkeys((0, 1), train_frac)

# 1) Stratified sample by label -> training rows.
train_df = df_scaled.sampleBy(label_col, fractions=fractions, seed=42)

# 2) Every row whose key was not sampled -> test rows (no overlap with train).
train_keys = train_df.select(key_col).distinct()
test_df = df_scaled.join(train_keys, on=key_col, how="left_anti")

# 3) Quick checks: split sizes first, then the label distribution per split.
for caption, split_df in (("train count:", train_df), ("test  count:", test_df)):
    print(caption, split_df.count())

for split_df in (train_df, test_df):
    split_df.groupBy(label_col).count().show()


train count: 4064
test  count: 936
+--------+-----+
|is_fraud|count|
+--------+-----+
|       1|  548|
|       0| 3516|
+--------+-----+

+--------+-----+
|is_fraud|count|
+--------+-----+
|       1|  134|
|       0|  802|
+--------+-----+



In [0]:
# Logistic regression on the standardised features, predicting is_fraud.
lr = LogisticRegression(
    predictionCol="prediction",
    probabilityCol="probability",
    labelCol="is_fraud",
    featuresCol="scaled_features",
)

# Hyper-parameter grid: every (regParam, elasticNetParam) combination.
grid_builder = ParamGridBuilder()
for tuned_param, candidate_values in (
    (lr.regParam, [0.01, 0.1, 0.5]),
    (lr.elasticNetParam, [0.0, 0.5, 1.0]),
):
    grid_builder = grid_builder.addGrid(tuned_param, candidate_values)
paramGrid = grid_builder.build()

# Model selection metric: area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="is_fraud",
)

# 3-fold cross-validation over the grid, with a fixed seed for the folds.
cv = CrossValidator(
    estimator=lr,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    seed=42,
    numFolds=3,
)

# Fit on the training split only; cvModel.bestModel holds the selected model.
cvModel = cv.fit(train_df)


In [0]:
from mlflow.models.signature import infer_signature

# Raw columns that make up the model input schema.
# NOTE(review): this lists the same columns as feature_cols defined earlier.
input_cols = ["amount", "night_txn", "cross_border", "device_change"]

# Small pandas sample of the raw inputs (only used to infer the schema).
sample_input = train_df.select(input_cols).limit(50).toPandas()

# Score a small sample with the selected model and keep the prediction plus
# the probability vector.
scored_sample = cvModel.bestModel.transform(train_df.limit(50))
pred_sample = scored_sample.select("prediction", "probability").toPandas()

# Element 1 of the probability vector is exposed as a plain float column.
pred_sample["fraud_prob"] = [float(prob_vec[1]) for prob_vec in pred_sample["probability"]]
pred_sample = pred_sample[["prediction", "fraud_prob"]]

# Input/output schema that is later attached to the logged model.
signature = infer_signature(sample_input, pred_sample)




In [0]:
# Record the hyper-parameters of the model selected by cross-validation.
# The public param getters are used instead of the private `_java_obj` handle.
# NOTE(review): no MLflow run is started in this cell; if none is active,
# MLflow opens a run implicitly for these log_param calls - confirm that this
# is the run the parameters are meant to end up in.
best_model = cvModel.bestModel
mlflow.log_param("regParam", best_model.getRegParam())
mlflow.log_param("elasticNetParam", best_model.getElasticNetParam())

# Score the held-out test split with the cross-validated model.
predictions = cvModel.transform(test_df)

# ROC AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol="is_fraud", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)

# Accuracy, precision, recall, F1.
# NOTE(review): "weightedPrecision" / "weightedRecall" / "f1" are averages
# over both classes weighted by class frequency, not the precision/recall of
# the fraud class alone - confirm this is the intended definition.
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="is_fraud", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator_acc.evaluate(predictions)

evaluator_precision = MulticlassClassificationEvaluator(
    labelCol="is_fraud", predictionCol="prediction", metricName="weightedPrecision"
)
precision = evaluator_precision.evaluate(predictions)

evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="is_fraud", predictionCol="prediction", metricName="weightedRecall"
)
recall = evaluator_recall.evaluate(predictions)

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="is_fraud", predictionCol="prediction", metricName="f1"
)
f1 = evaluator_f1.evaluate(predictions)

In [0]:
# ==============================
# MLflow experiment: train, log and register the model in one run
# ==============================
mlflow.set_experiment("/Shared/fraud_detection")

# Close any run that is still open so that everything below is recorded in
# one dedicated run.
if mlflow.active_run():
    mlflow.end_run()

# Re-fit the scaler on the same data that produced `scaled_features`, so the
# logged artifact can carry the preprocessing the classifier was trained with.
# Done outside the run so that only the classifier training happens inside it.
scaler_model = scaler.fit(df_feat)

with mlflow.start_run(run_name="LogReg_FraudDetection"):

    # Train the cross-validated model inside the run.
    cvModel = cv.fit(train_df)
    run_best_model = cvModel.bestModel

    # Log the selected hyper-parameters in the same run as the model, read
    # from the model that is actually being logged.
    best_reg_param = run_best_model.getRegParam()
    best_elastic_net_param = run_best_model.getElasticNetParam()
    mlflow.log_param("regParam", best_reg_param)
    mlflow.log_param("elasticNetParam", best_elastic_net_param)

    # `signature` describes the raw input columns, but the classifier alone
    # consumes the `scaled_features` vector column. Log the full chain
    # (assembler -> fitted scaler -> classifier) so the registered model
    # accepts the inputs declared in its signature.
    serving_model = PipelineModel(stages=[assembler, scaler_model, run_best_model])

    # Log and register the model.
    mlflow.spark.log_model(
        spark_model=serving_model,
        artifact_path="logreg_model",
        registered_model_name="dbx.default.FraudDetectionLR",
        signature=signature,
        pip_requirements=[
            "pyspark==4.0.0",
            "mlflow"
        ]
    )

    # Test-set metrics.
    # NOTE(review): these values were computed in the evaluation cell, before
    # the re-fit above; the CV seed is fixed, so they presumably describe the
    # same model - confirm.
    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", precision)
    mlflow.log_metric("Recall", recall)
    mlflow.log_metric("F1", f1)

    print(f"Best regParam: {best_reg_param}")
    print(f"Best elasticNetParam: {best_elastic_net_param}")
    print(f"AUC on test set: {auc}")


Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Registered model 'dbx.default.FraudDetectionLR' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/19 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/19 [00:00<?, ?it/s]

🔗 Created version '13' of model 'dbx.default.frauddetectionlr': https://adb-7405605317278017.17.azuredatabricks.net/explore/data/models/dbx/default/frauddetectionlr/version/13?o=7405605317278017


Best regParam: 0.5
Best elasticNetParam: 0.0
AUC on test set: 0.7916216920385607


In [0]:
from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()

# Key column plus the raw input columns only (no assembled vector columns).
feature_table_df = df.select(["transaction_id", *input_cols])

# Create the feature table keyed by transaction_id and populate it from the
# DataFrame; the returned table object is the cell output.
fs.create_table(
    df=feature_table_df,
    name="fraud_features",
    primary_keys=["transaction_id"],
    description="Fraud features without vector columns",
)

<FeatureTable: name='dbx.default.fraud_features', table_id='e6e6cb17-dcf7-476c-b7ca-8667449c672a', description='Fraud features without vector columns', primary_keys=['transaction_id'], partition_columns=[], features=['transaction_id', 'amount', 'night_txn', 'cross_border', 'device_change'], creation_timestamp=1767414454076, online_stores=[], notebook_producers=[], job_producers=[], table_data_sources=[], path_data_sources=[], custom_data_sources=[], timestamp_keys=[], tags={}>