# 05 – Model Training & Ensemble Learning (PySpark)

In this notebook we train several classification models using Spark ML: Logistic Regression, Random Forest and Gradient‑Boosted Trees. To limit overfitting, each model's hyperparameters are tuned with 3‑fold cross‑validation over a parameter grid, and the resulting best model of each type is then evaluated (by area under the ROC curve) on a held‑out test set.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import os

# Obtain a Spark session for this application (getOrCreate returns the
# active session when one already exists).
session_builder = SparkSession.builder.appName('CTR_Model_Training')
spark = session_builder.getOrCreate()

# Load training and test data from ../data/processed (relative to the
# current working directory).
processed_dir = os.path.join('..', 'data', 'processed')
train_path = os.path.join(processed_dir, 'train_df.parquet')
test_path = os.path.join(processed_dir, 'test_df.parquet')
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

# One evaluator is shared by every model: area under the ROC curve, computed
# from the 'rawPrediction' column against the 'clk' label column.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='clk',
)

# Accumulators filled in by the per-model sections that follow:
#   best_models -> model name mapped to the best model found by cross-validation
#   models_info -> list of (model name, test-set AUC) tuples
best_models = {}
models_info = []

# Logistic Regression
# Candidate settings: every combination of regParam and elasticNetParam
# below (3 x 3 = 9), each scored with 3-fold cross-validation.
lr = LogisticRegression(maxIter=50, labelCol='clk', featuresCol='features')
lr_grid_builder = ParamGridBuilder()
lr_grid_builder.addGrid(lr.regParam, [0.0, 0.01, 0.1])
lr_grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
paramGrid_lr = lr_grid_builder.build()
cv_lr = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid_lr,
    evaluator=evaluator,
    numFolds=3,
)
print('Training Logistic Regression...')
cv_lr_model = cv_lr.fit(train_df)

# Score the cross-validated model on the held-out test set.
lr_test_predictions = cv_lr_model.transform(test_df)
auc_lr = evaluator.evaluate(lr_test_predictions)
print('Logistic Regression AUC:', auc_lr)

# Record the result for the final ranking and keep the winning model.
best_models['LogisticRegression'] = cv_lr_model.bestModel
models_info.append(('LogisticRegression', auc_lr))

# Random Forest
# Candidate settings: maxDepth x numTrees (2 x 2 = 4), each scored with
# 3-fold cross-validation.  numTrees is part of the grid, so the value passed
# to the constructor is replaced by the grid value for each candidate.
rf = RandomForestClassifier(numTrees=50, labelCol='clk', featuresCol='features')
rf_grid_builder = ParamGridBuilder()
rf_grid_builder.addGrid(rf.maxDepth, [5, 10])
rf_grid_builder.addGrid(rf.numTrees, [50, 100])
paramGrid_rf = rf_grid_builder.build()
cv_rf = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid_rf,
    evaluator=evaluator,
    numFolds=3,
)
print('Training Random Forest...')
cv_rf_model = cv_rf.fit(train_df)

# Score the cross-validated model on the held-out test set.
rf_test_predictions = cv_rf_model.transform(test_df)
auc_rf = evaluator.evaluate(rf_test_predictions)
print('Random Forest AUC:', auc_rf)

# Record the result for the final ranking and keep the winning model.
best_models['RandomForest'] = cv_rf_model.bestModel
models_info.append(('RandomForest', auc_rf))

# Gradient‑Boosted Trees
# Candidate settings: maxDepth x maxIter (2 x 2 = 4), each scored with
# 3-fold cross-validation.  maxIter is part of the grid, so the value passed
# to the constructor is replaced by the grid value for each candidate.
gbt = GBTClassifier(maxIter=50, labelCol='clk', featuresCol='features')
gbt_grid_builder = ParamGridBuilder()
gbt_grid_builder.addGrid(gbt.maxDepth, [5, 10])
gbt_grid_builder.addGrid(gbt.maxIter, [50, 100])
paramGrid_gbt = gbt_grid_builder.build()
cv_gbt = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid_gbt,
    evaluator=evaluator,
    numFolds=3,
)
print('Training Gradient Boosted Trees...')
cv_gbt_model = cv_gbt.fit(train_df)

# Score the cross-validated model on the held-out test set.
gbt_test_predictions = cv_gbt_model.transform(test_df)
auc_gbt = evaluator.evaluate(gbt_test_predictions)
print('GBT AUC:', auc_gbt)

# Record the result for the final ranking and keep the winning model.
best_models['GBT'] = cv_gbt_model.bestModel
models_info.append(('GBT', auc_gbt))

# Display results
# Rank the (name, AUC) pairs from highest to lowest AUC, updating the
# existing list object in place, then print one summary line per model.
models_info[:] = sorted(models_info, key=lambda entry: entry[1], reverse=True)
for model_name, auc in models_info:
    summary_line = f"{model_name}: AUC = {auc:.4f}"
    print(summary_line)

# Save best models
# Each best model is written to ../models/<name>_spark_model.
# A plain `model.save(path)` raises when the target path already exists, so
# re-running the notebook would fail here after a full training run.  Going
# through the writer with overwrite() replaces any previous copy and keeps
# this step repeatable.
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)
for name, model in best_models.items():
    model_path = os.path.join(models_dir, f'{name}_spark_model')
    model.write().overwrite().save(model_path)
print('Model training complete – best models saved.')
