In [0]:
# Multiple sklearn Models with MLflow (Path-based)
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the GOLD products Delta table directly from its storage path.
# `spark` is not defined in this cell — presumably the runtime-provided SparkSession.
gold_df = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/gold/products")
)

# Keep only the modelling columns, drop any row with a null in them, and
# collect the result to the driver as a Pandas DataFrame for scikit-learn.
pdf = (
    gold_df
    .select("views", "revenue", "purchases")
    .dropna()
    .toPandas()
)

# Target is purchases; predictors are views and revenue.
y = pdf["purchases"]
X = pdf[["views", "revenue"]]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors keyed by a short name. Insertion order is the order in
# which they are iterated; the tree-based models are seeded for repeatability.
models = dict(
    linear=LinearRegression(),
    decision_tree=DecisionTreeRegressor(max_depth=5, random_state=42),
    random_forest=RandomForestRegressor(n_estimators=100, random_state=42),
)

# Train each candidate in its own MLflow run, logging its identifying params,
# its R² on the held-out split, and the fitted model itself.

# Derive the logged feature list from the training frame so the param cannot
# drift from the columns the models are actually fitted on.
feature_names = ",".join(map(str, X_train.columns))

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)
        mlflow.log_param("features", feature_names)

        model.fit(X_train, y_train)

        # For scikit-learn regressors, .score() returns R² on the given data.
        score = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", score)

        # Store the input/output schema alongside the model. Without a
        # signature MLflow warns at log time, and registries that enforce
        # schemas (e.g. Unity Catalog) refuse to register the model.
        signature = mlflow.models.infer_signature(X_test, model.predict(X_test))
        mlflow.sklearn.log_model(
            model,
            artifact_path="model",
            signature=signature,
        )

        print(f"{name}: R² = {score:.4f}")




linear: R² = 0.8106




decision_tree: R² = 0.8394




random_forest: R² = 0.8570


In [0]:
# Spark ML Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml.evaluation import RegressionEvaluator

# Load the GOLD products Delta table by path as a Spark DataFrame, keeping
# only the modelling columns and discarding rows with a null in any of them.
spark_df = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/gold/products")
    .select("views", "revenue", "purchases")
    .dropna()
)

# Stage 1: pack the two predictor columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=["views", "revenue"], outputCol="features")

# Stage 2: linear regression reading that vector and predicting "purchases".
lr = SparkLR(labelCol="purchases", featuresCol="features")

# Chain the stages so fitting and scoring both run the assembler first.
pipeline = Pipeline(stages=[assembler, lr])

# R² evaluator comparing the "prediction" column against the "purchases" label.
evaluator = RegressionEvaluator(
    metricName="r2",
    predictionCol="prediction",
    labelCol="purchases",
)

# Random split with weights 0.8 / 0.2 (sizes are approximate); seeded.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

# Fit the whole pipeline on the training portion, then score the held-out rows.
spark_model = pipeline.fit(train)
predictions = spark_model.transform(test)

# Report R² on the held-out predictions.
r2 = evaluator.evaluate(predictions)
print(f"Spark Linear Regression R2: {r2:.4f}")


Spark Linear Regression R2: 0.8723
