# Import Packages

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, log, lit
from pyspark.sql import functions as F
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import skewness
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import approx_count_distinct
import matplotlib.pyplot as plt
from pyspark.sql.types import NumericType

from pyspark.ml.evaluation import BinaryClassificationEvaluator



# Data Preparation

### Load Data

In [0]:
# Read the three yearly splits from DBFS. Only the training split is cached.
df_train_undersampled_raw = spark.read.parquet(
    "dbfs:/student-groups/Group_4_4/cp6_train_2015_2017_undersampled_0_5_ratio.parquet"
).cache()
df_val_2018_raw = spark.read.parquet("dbfs:/student-groups/Group_4_4/cp6_val_2018_refined.parquet")
df_test_2019_raw = spark.read.parquet("dbfs:/student-groups/Group_4_4/cp6_test_2019_refined.parquet")

# count() is an action: it forces the cached frame to be materialized and,
# as the last expression of the cell, displays the training row count.
df_train_undersampled_raw.count()


9009126

In [0]:
# Report how many columns the raw training frame exposes.
column_total = len(df_train_undersampled_raw.columns)
print(f"Columns: {column_total}")

Columns: 113


In [0]:
# Display the full list of column names of the raw training frame.
df_train_undersampled_raw.columns

['DEST',
 'ORIGIN',
 'OP_UNIQUE_CARRIER',
 'FL_DATE',
 'prediction_utc',
 'origin_obs_utc',
 'asof_minutes',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'OP_CARRIER_FL_NUM',
 'CRS_ARR_TIME',
 'ORIGIN_AIRPORT_ID',
 'ORIGIN_STATE_ABR',
 'DEST_AIRPORT_ID',
 'DEST_STATE_ABR',
 'HourlyDryBulbTemperature',
 'HourlyDewPointTemperature',
 'HourlyWindDirection',
 'HourlyWindGustSpeed',
 'HourlyVisibility',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyAltimeterSetting',
 'origin_airport_lat',
 'origin_airport_lon',
 'dest_airport_lat',
 'dest_airport_lon',
 'origin_station_dis',
 'dest_station_dis',
 'origin_type',
 'DEP_DEL15',
 'DEP_DELAY',
 'season',
 'rolling_origin_num_delays_24h',
 'dep_delay15_24h_rolling_avg_by_origin_dayofweek',
 'dep_delay15_24h_rolling_avg_by_origin_log',
 'dep_delay15_24h_rolling_avg_by_origin_carrier_log',
 'dep_delay15_24h_rolling_avg_by_origin_dayofweek_log',
 'is_superbowl_week',
 'is_major_event',
 'distance_very_long',
 'weather_condition_category',
 'is_

## Feature Selection

#### Identify Column Category 

In [0]:
# ============================================================
# Label Column
# ============================================================
# DEP_DEL15 is the target variable indicating whether a flight
# departed with a delay of 15 minutes or more (1 = delayed, 0 = on time).
label_col = "DEP_DEL15"

# Identify string-typed columns (excluding the label). `dtypes` is iterated as
# (column name, type name) pairs and only the names typed "string" are kept.
# The list is derived from the training frame only and later reused to drop
# the same columns from the validation and test frames.
string_cols = [c for c, t in df_train_undersampled_raw.dtypes if t == "string" and c != label_col]

# ============================================================
# Leakage Columns
# ============================================================
# These columns must be removed because they reveal information
# that would NOT be available at prediction time, or they encode
# outcomes of the flight. Including them would lead to severe
# data leakage and unrealistic model performance.
# NOTE(review): the list is a superset of candidate names; whether every name
# actually exists in the loaded frames is not checked here — confirm.
leakage_cols = [
    # --------------------------------------------------------
    # Direct outcome information — known only after the flight
    # --------------------------------------------------------
    "CANCELLED",                # Cancellation outcome
    "CANCELLATION_CODE",        # Reason for cancellation
    "DIVERTED",                 # Diversion outcome
    "DEP_DELAY",                # True departure delay in minutes (the label is derived from it)
    "rf_prob_delay",            # Prediction of a previously trained RF model
    "rf_prob_delay_binned",     # Binned version of that RF prediction

    # Arrival-related data (results of the flight)
    "ARR_DEL15_removed",
    "ARR_TIME_removed",
    "ARR_DELAY_removed",
    "TAXI_IN_removed",
    "AIR_TIME_removed",
    "WHEELS_OFF_removed",
    "WHEELS_ON_removed",
    "ACTUAL_ELAPSED_TIME_removed",

    # Delay reason codes — these explicitly encode true causes
    # and are not available prior to departure
    "CARRIER_DELAY_removed",
    "WEATHER_DELAY_removed",
    "NAS_DELAY_removed",
    "SECURITY_DELAY_removed",
    "LATE_AIRCRAFT_DELAY_removed",

    # --------------------------------------------------------
    # Ground-truth weather measurements (removed versions)
    # These represent real observed values rather than forecasted
    # or lagged values, so they cannot be used for prediction.
    # --------------------------------------------------------
    "HourlyDryBulbTemperature_removed",
    "HourlyWetBulbTemperature_removed",
    "HourlyStationPressure_removed",
    "HourlySeaLevelPressure_removed",

    # --------------------------------------------------------
    # Geographic / distance fields duplicated with future info
    # The "_removed" versions often reflect information derived
    # from ground truth sources and should not be used.
    # --------------------------------------------------------
    "origin_station_lat_removed",
    "origin_station_lon_removed",
    "origin_airport_lat_removed",
    "origin_airport_lon_removed",
    "dest_station_lat_removed",
    "dest_station_lon_removed",
    "dest_airport_lat_removed",
    "dest_airport_lon_removed",
    "origin_station_dis_removed",
    "dest_station_dis_removed",

    # --------------------------------------------------------
    # Historical features where the removed versions may contain
    # lookahead information or future aggregates.
    # --------------------------------------------------------
    "rolling_origin_num_flights_24h_removed",
    "rolling_origin_delay_ratio_24h_removed",
    "rolling_origin_stddev_dep_delay_24h_removed",
    "total_flights_per_origin_day_removed",
    "prior_flights_today_removed",
    "prior_delays_today_removed",
    "same_day_prior_delay_percentage_removed",

    # --------------------------------------------------------
    # Distance / route fields duplicated in a removed form.
    # Clean versions (e.g., log_distance) should be used instead.
    # --------------------------------------------------------
    "DISTANCE_removed",
    "DISTANCE_GROUP_removed",
    "distance_short_removed",

    # --------------------------------------------------------
    # Miscellaneous fields removed for leakage or redundancy
    # --------------------------------------------------------
    "flight_id_removed",
    "HourlySkyConditions_removed",
    "HourlyPresentWeatherType_removed",
    "temp_humidity_interaction_removed",
    "precip_anomaly_removed",
    "extreme_precipitation_removed",
    "extreme_weather_score_removed",
    "num_airport_wide_cancellations_removed",
]


#### High Cardinality Column

In [0]:
# Approximate cardinality of every pre-indexed categorical column (columns
# whose name ends in "_indexed"), computed in a single aggregation pass.
indexed_cols = [c for c in df_train_undersampled_raw.columns if c.endswith("_indexed")]

cardinality_exprs = [approx_count_distinct(c).alias(c) for c in indexed_cols]
cardinality_row = df_train_undersampled_raw.select(cardinality_exprs).first()
cardinality_dict = {c: cardinality_row[c] for c in indexed_cols}

# Sort by cardinality (high → low).
# NOTE: the loop variable must not be called `col` — that would rebind the
# `col` function imported from pyspark.sql.functions to a plain string and
# break every later `col(...)` call in the notebook.
sorted_cardinality = sorted(cardinality_dict.items(), key=lambda x: x[1], reverse=True)
for col_name, cnt in sorted_cardinality:
    print(f"{col_name:40}  {cnt}")

# Indexed columns with more distinct values than this are dropped from the
# feature set in the "Drop Columns" cell.
THRESHOLD = 100

drop_high_card_cols = [c for c, cnt in cardinality_dict.items() if cnt > THRESHOLD]
print("High-cardinality categorical columns (to drop):")
for c in drop_high_card_cols:
    print(f"  {c:40s} → {cardinality_dict[c]} distinct values")


ORIGIN_indexed                            328
DEST_indexed                              320
day_hour_interaction_indexed              158
ORIGIN_STATE_ABR_indexed                  50
DEST_STATE_ABR_indexed                    50
OP_UNIQUE_CARRIER_indexed                 14
sky_condition_parsed_indexed              6
airline_reputation_category_indexed       5
season_indexed                            4
turnaround_category_indexed               4
origin_type_indexed                       3
weather_condition_category_indexed        3
High-cardinality categorical columns (to drop):
  DEST_indexed                             → 320 distinct values
  ORIGIN_indexed                           → 328 distinct values
  day_hour_interaction_indexed             → 158 distinct values


#### Drop Columns

In [0]:
# Date/time columns removed from every split in addition to the leakage,
# string and high-cardinality lists built in earlier cells.
date_time_cols = ["FL_DATE", "prediction_utc", "origin_obs_utc"]


def _drop_non_feature_columns(df):
    """Return `df` without the leakage, string, high-cardinality and date/time columns.

    The same four drops are applied to train, validation and test so that all
    three splits end up with an identical schema.
    """
    return (
        df
        .drop(*leakage_cols)
        .drop(*string_cols)
        .drop(*drop_high_card_cols)
        .drop(*date_time_cols)
    )


# These are COLUMN counts (not row counts), reported per split.
print("Train columns (raw):", len(df_train_undersampled_raw.columns))
print("Val columns (raw)  :", len(df_val_2018_raw.columns))
print("Test columns (raw) :", len(df_test_2019_raw.columns))

df_train_undersampled = _drop_non_feature_columns(df_train_undersampled_raw)
df_val_2018 = _drop_non_feature_columns(df_val_2018_raw)
df_test_2019 = _drop_non_feature_columns(df_test_2019_raw)

print("Train columns (cleaned):", len(df_train_undersampled.columns))
print("Val columns (cleaned)  :", len(df_val_2018.columns))
print("Test columns (cleaned) :", len(df_test_2019.columns))

Train rows: 113
Test rows : 113
Train column cleaned: 92
Test column cleaned: 92


In [0]:
# Every remaining column except the label (and any high-cardinality column)
# is treated as a model feature.
feature_cols = [
    name
    for name in df_train_undersampled.columns
    if name != label_col and name not in drop_high_card_cols
]

# Remove rows that have a null in any feature column, for all three splits.
df_train_undersampled = df_train_undersampled.dropna(subset=feature_cols)
df_val_2018 = df_val_2018.dropna(subset=feature_cols)
df_test_2019 = df_test_2019.dropna(subset=feature_cols)

# Define Evaluator

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F

label_col = "DEP_DEL15"

# ------------------------------------------------------------
# 1. Feature columns & assembler
# ------------------------------------------------------------
# All columns other than the label and the high-cardinality ones are features.
feature_cols = [
    name
    for name in df_train_undersampled.columns
    if name != label_col and name not in drop_high_card_cols
]

# Packs the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")

# ------------------------------------------------------------
# 2. Evaluators for AUC-PR & AUC-ROC
# ------------------------------------------------------------
# Both evaluators read the same label / rawPrediction columns and differ only
# in the metric they report.
evaluator_pr, evaluator_roc = (
    BinaryClassificationEvaluator(
        labelCol=label_col,
        rawPredictionCol="rawPrediction",
        metricName=metric_name,
    )
    for metric_name in ("areaUnderPR", "areaUnderROC")
)

# ============================================================
# 3. F_beta (here F0.5) helper
# ============================================================
def compute_fbeta(df, label_col, prediction_col="prediction", beta=0.5):
    """
    Compute F_beta, precision, and recall from a DataFrame
    that already has hard predictions in `prediction_col`.

    Parameters
    ----------
    df : DataFrame containing `label_col` and `prediction_col`.
    label_col : name of the ground-truth column (cast to int; 1 = positive).
    prediction_col : name of the hard-prediction column (cast to int).
    beta : weight of recall relative to precision; values below 1 favour
        precision (default 0.5).

    Returns
    -------
    (fbeta, precision, recall) as floats. Each value falls back to 0.0 when
    its denominator is zero (no predicted positives, no actual positives, or
    an empty DataFrame).
    """
    counts = (
        df
        .select(
            F.col(label_col).cast("int").alias("y"),
            F.col(prediction_col).cast("int").alias("yhat")
        )
        .selectExpr(
            "sum(CASE WHEN y = 1 AND yhat = 1 THEN 1 ELSE 0 END) AS tp",
            "sum(CASE WHEN y = 0 AND yhat = 1 THEN 1 ELSE 0 END) AS fp",
            "sum(CASE WHEN y = 1 AND yhat = 0 THEN 1 ELSE 0 END) AS fn"
        )
        .collect()[0]
    )

    # SUM over zero input rows yields NULL (None in Python). Treat a missing
    # count as 0 so an empty frame returns zeros instead of raising TypeError.
    tp = counts["tp"] or 0
    fp = counts["fp"] or 0
    fn = counts["fn"] or 0

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    beta2 = beta ** 2
    denom = beta2 * precision + recall

    if denom == 0:
        fbeta = 0.0
    else:
        fbeta = (1 + beta2) * precision * recall / denom

    return fbeta, precision, recall

#  GB with Early Stop

**MODELING - GBDT & RF IMPROVEMENTS**
- Implement early stopping for GradientBoostedTrees using validation set
- Grid search over max_depth, n_trees, learning_rate for GBDT
- Tune Random Forest with different tree counts and max_depth values
- Decide early stopping criteria: quit at first poor performance vs persevere for more epochs

### Define GB Model

In [0]:

# ============================================================
# 4. Train + evaluate one GBT config
# ============================================================
def train_gbt_and_eval(
    train_df,
    val_df,
    assembler,
    label_col,
    num_iters,
    max_depth=5,
    step_size=0.1,
    subsampling_rate=0.8,
    seed=42
):
    """
    Fit an assembler + GBTClassifier pipeline on `train_df` and score it on `val_df`.

    Parameters
    ----------
    train_df, val_df : DataFrames used for fitting and for validation scoring.
    assembler : pipeline stage that produces the "features" vector column.
    label_col : name of the binary label column.
    num_iters : number of boosting iterations (GBT `maxIter`).
    max_depth, step_size, subsampling_rate, seed : forwarded to GBTClassifier.

    Returns
    -------
    (model, metrics) where `model` is the fitted pipeline and `metrics` is a
    dict with keys "auc_pr", "auc_roc", "f0_5", "precision" and "recall".

    Relies on the module-level `evaluator_pr`, `evaluator_roc` and
    `compute_fbeta`.
    """
    gbt = GBTClassifier(
        labelCol=label_col,
        featuresCol="features",
        maxIter=num_iters,
        maxDepth=max_depth,
        maxBins=64,
        stepSize=step_size,
        subsamplingRate=subsampling_rate,
        seed=seed
    )

    pipeline = Pipeline(stages=[assembler, gbt])

    model = pipeline.fit(train_df)

    # The three metrics below are three separate Spark actions. Without a
    # cache each one would re-score the whole validation set, so keep only the
    # columns those actions read and cache that narrow frame once.
    score_cols = {label_col, "prediction"}
    for evaluator in (evaluator_pr, evaluator_roc):
        score_cols.update((evaluator.getLabelCol(), evaluator.getRawPredictionCol()))
    val_pred = model.transform(val_df).select(*sorted(score_cols)).cache()

    try:
        auc_pr  = evaluator_pr.evaluate(val_pred)
        auc_roc = evaluator_roc.evaluate(val_pred)
        f05, prec, rec = compute_fbeta(val_pred, label_col, prediction_col="prediction")
    finally:
        # Release the cached predictions even if an evaluation fails.
        val_pred.unpersist()

    metrics = {
        "auc_pr": auc_pr,
        "auc_roc": auc_roc,
        "f0_5": f05,
        "precision": prec,
        "recall": rec,
    }

    return model, metrics


### Define Early Stop for GB

In [0]:
# Candidate boosting-iteration counts, tried in ascending order.
num_iter_grid = [70, 80, 90]
#num_iter_grid = [20]

def run_early_stop_for_one_config(
    train_df,
    val_df,
    assembler,
    label_col,
    max_depth,
    step_size,
    subsampling_rate,
    num_iter_grid=num_iter_grid,
    patience=2,
    min_delta=0.001
):
    """
    Search `num_iter_grid` for one GBT configuration with patience-based stopping.

    A separate model is trained via `train_gbt_and_eval` for every candidate
    iteration count, in the order given. The candidate with the best validation
    AUC-PR is kept; the search stops once `patience` consecutive candidates
    fail to beat the current best by more than `min_delta`.

    Note: the default for `num_iter_grid` is bound when this function is
    defined, so re-assigning the module-level `num_iter_grid` afterwards has no
    effect unless this cell is re-run or the grid is passed explicitly.

    Parameters
    ----------
    train_df, val_df, assembler, label_col : forwarded to `train_gbt_and_eval`.
    max_depth, step_size, subsampling_rate : GBT hyperparameters of this config.
    num_iter_grid : non-empty sequence of iteration counts to try.
    patience : number of consecutive non-improving candidates tolerated.
    min_delta : minimum AUC-PR gain that counts as an improvement.

    Returns
    -------
    dict with keys "max_depth", "step_size", "subsampling_rate",
    "best_num_iters", "auc_pr", "auc_roc", "f0_5" and "model".

    Raises
    ------
    ValueError if `num_iter_grid` is empty.
    """
    candidates = list(num_iter_grid)
    if not candidates:
        raise ValueError("num_iter_grid must contain at least one iteration count")

    best_model = None
    best_num_iter = None
    best_metrics = None
    best_score = float("-inf")
    no_improve_rounds = 0

    for num_iters in candidates:
        model, metrics = train_gbt_and_eval(
            train_df=train_df,
            val_df=val_df,
            assembler=assembler,
            label_col=label_col,
            num_iters=num_iters,
            max_depth=max_depth,
            step_size=step_size,
            subsampling_rate=subsampling_rate
        )

        auc_pr  = metrics["auc_pr"]
        auc_roc = metrics["auc_roc"]
        f05     = metrics["f0_5"]

        print(
            f"[config: depth={max_depth}, step={step_size}, subs={subsampling_rate}] "
            f"numIters={num_iters:3d} | "
            f"val AUC-PR={auc_pr:.4f}, AUC-ROC={auc_roc:.4f}, F0.5={f05:.4f}"
        )

        # The first candidate is always accepted as the baseline, so a best
        # model exists even if its score is NaN and fails every comparison.
        if best_model is None or auc_pr > best_score + min_delta:
            best_score = auc_pr
            best_model = model
            best_num_iter = num_iters
            best_metrics = metrics
            no_improve_rounds = 0
        else:
            no_improve_rounds += 1

        if no_improve_rounds >= patience:
            print(
                f"Early stopping triggered for this config at numIters={num_iters}, "
                f"best numIters so far={best_num_iter}"
            )
            break

    print(
        f"--> Best for this config: depth={max_depth}, step={step_size}, subs={subsampling_rate}, "
        f"numIters={best_num_iter}, AUC-PR={best_metrics['auc_pr']:.4f}, "
        f"AUC-ROC={best_metrics['auc_roc']:.4f}, F0.5={best_metrics['f0_5']:.4f}"
    )

    return {
        "max_depth": max_depth,
        "step_size": step_size,
        "subsampling_rate": subsampling_rate,
        "best_num_iters": best_num_iter,
        "auc_pr": best_metrics["auc_pr"],
        "auc_roc": best_metrics["auc_roc"],
        "f0_5": best_metrics["f0_5"],
        "model": best_model,
    }

### Grid Search GB

In [0]:
# Hyperparameter combinations to evaluate; each one gets its own search over
# the iteration-count grid.
configs = [
    dict(max_depth=3, step_size=0.1, subsampling_rate=0.8),
    dict(max_depth=5, step_size=0.1, subsampling_rate=0.8),
    dict(max_depth=5, step_size=0.05, subsampling_rate=0.8),
]

all_results = []

for cfg in configs:
    print("\n==============================")
    print(f"Testing config: {cfg}")
    print("==============================")

    # The config keys match the keyword names of the search function, so the
    # dict can be unpacked straight into the call.
    res = run_early_stop_for_one_config(
        train_df=df_train_undersampled,
        val_df=df_val_2018,
        assembler=assembler,
        label_col=label_col,
        **cfg,
    )

    all_results.append(res)



Testing config: {'max_depth': 3, 'step_size': 0.1, 'subsampling_rate': 0.8}
[config: depth=3, step=0.1, subs=0.8] numIters= 70 | val AUC-PR=0.6668, AUC-ROC=0.8761, F0.5=0.6259
[config: depth=3, step=0.1, subs=0.8] numIters= 80 | val AUC-PR=0.6706, AUC-ROC=0.8779, F0.5=0.6275
[config: depth=3, step=0.1, subs=0.8] numIters= 90 | val AUC-PR=0.6771, AUC-ROC=0.8802, F0.5=0.6295
--> Best for this config: depth=3, step=0.1, subs=0.8, numIters=90, AUC-PR=0.6771, AUC-ROC=0.8802, F0.5=0.6295

Testing config: {'max_depth': 5, 'step_size': 0.1, 'subsampling_rate': 0.8}
[config: depth=5, step=0.1, subs=0.8] numIters= 70 | val AUC-PR=0.7128, AUC-ROC=0.8916, F0.5=0.6384
[config: depth=5, step=0.1, subs=0.8] numIters= 80 | val AUC-PR=0.7162, AUC-ROC=0.8933, F0.5=0.6400
[config: depth=5, step=0.1, subs=0.8] numIters= 90 | val AUC-PR=0.7191, AUC-ROC=0.8945, F0.5=0.6415
--> Best for this config: depth=5, step=0.1, subs=0.8, numIters=90, AUC-PR=0.7191, AUC-ROC=0.8945, F0.5=0.6415

Testing config: {'max_d

In [0]:
# Pick the configuration with the highest validation AUC-PR
# (the first one wins on ties).
best_overall_gb = max(all_results, key=lambda result: result["auc_pr"])

print("\n====== Best overall GBT config on validation set ======")
# The fragments are emitted back-to-back (sep="") to form a single line.
print(
    f"maxDepth={best_overall_gb['max_depth']}, ",
    f"stepSize={best_overall_gb['step_size']}, ",
    f"subsamplingRate={best_overall_gb['subsampling_rate']}, ",
    f"numIters={best_overall_gb['best_num_iters']}, ",
    f"AUC-PR={best_overall_gb['auc_pr']:.4f}, ",
    f"AUC-ROC={best_overall_gb['auc_roc']:.4f}, ",
    f"F0.5={best_overall_gb['f0_5']:.4f}",
    sep="",
)



maxDepth=5, stepSize=0.1, subsamplingRate=0.8, numIters=90, AUC-PR=0.7191, AUC-ROC=0.8945, F0.5=0.6415


### Train Final GB

In [0]:
# Rebuild a GBT estimator from the winning validation configuration.
final_gbt = GBTClassifier(
    featuresCol="features",
    labelCol=label_col,
    # Hyperparameters selected on the validation set.
    maxDepth=best_overall_gb["max_depth"],
    maxIter=best_overall_gb["best_num_iters"],
    stepSize=best_overall_gb["step_size"],
    subsamplingRate=best_overall_gb["subsampling_rate"],
    # Fixed settings, identical to the ones used during the search.
    maxBins=64,
    seed=42,
)

# Same assembler as in the search, followed by the final estimator.
final_pipeline_gbt = Pipeline(stages=[assembler, final_gbt])

# Fit on the training split only.
final_model_gbt = final_pipeline_gbt.fit(df_train_undersampled)


### Test Final GB

In [0]:
# Score the held-out 2019 test split with the final GBT pipeline.
test_pred_gbt = final_model_gbt.transform(df_test_2019)

# Ranking metrics: AUC-PR first, then AUC-ROC.
test_auc_pr_gbt, test_auc_roc_gbt = (
    evaluator.evaluate(test_pred_gbt) for evaluator in (evaluator_pr, evaluator_roc)
)

# Threshold metrics computed from the hard predictions.
test_f05_gbt, test_prec_gbt, test_rec_gbt = compute_fbeta(
    test_pred_gbt, label_col, prediction_col="prediction"
)

print(
    "===== Final Test Metrics =====",
    f"AUC-PR  = {test_auc_pr_gbt:.4f}",
    f"AUC-ROC = {test_auc_roc_gbt:.4f}",
    f"F0.5    = {test_f05_gbt:.4f}",
    f"Precision = {test_prec_gbt:.4f}",
    f"Recall    = {test_rec_gbt:.4f}",
    sep="\n",
)


===== Final Test Metrics =====
AUC-PR  = 0.6832
AUC-ROC = 0.8818
F0.5    = 0.6366
Precision = 0.6407
Recall    = 0.6206


# RF with Early Stop

#### Define RF Model

In [0]:
from pyspark.ml.classification import RandomForestClassifier

def train_rf_and_eval(
    train_df,
    val_df,
    assembler,
    label_col,
    num_trees,
    max_depth,
    seed=42
):
    """
    Fit an assembler + RandomForestClassifier pipeline on `train_df` and score it on `val_df`.

    Parameters
    ----------
    train_df, val_df : DataFrames used for fitting and for validation scoring.
    assembler : pipeline stage that produces the "features" vector column.
    label_col : name of the binary label column.
    num_trees : number of trees in the forest.
    max_depth : maximum depth of each tree.
    seed : random seed forwarded to the classifier.

    Returns
    -------
    (model, metrics) where `model` is the fitted pipeline and `metrics` is a
    dict with keys "auc_pr", "auc_roc", "f0_5", "precision" and "recall".

    Relies on the module-level `evaluator_pr`, `evaluator_roc` and
    `compute_fbeta`.
    """
    rf = RandomForestClassifier(
        labelCol=label_col,
        featuresCol="features",
        numTrees=num_trees,
        maxDepth=max_depth,
        maxBins=64,
        seed=seed
    )

    pipeline = Pipeline(stages=[assembler, rf])
    model = pipeline.fit(train_df)

    # The three metrics below are three separate Spark actions. Without a
    # cache each one would re-score the whole validation set, so keep only the
    # columns those actions read and cache that narrow frame once.
    score_cols = {label_col, "prediction"}
    for evaluator in (evaluator_pr, evaluator_roc):
        score_cols.update((evaluator.getLabelCol(), evaluator.getRawPredictionCol()))
    val_pred = model.transform(val_df).select(*sorted(score_cols)).cache()

    try:
        auc_pr  = evaluator_pr.evaluate(val_pred)
        auc_roc = evaluator_roc.evaluate(val_pred)
        f05, prec, rec = compute_fbeta(val_pred, label_col, prediction_col="prediction")
    finally:
        # Release the cached predictions even if an evaluation fails.
        val_pred.unpersist()

    metrics = {
        "auc_pr": auc_pr,
        "auc_roc": auc_roc,
        "f0_5": f05,
        "precision": prec,
        "recall": rec,
    }

    return model, metrics


#### Grid Search RF

In [0]:
# Search space: for every depth, tree counts are tried in ascending order.
numTrees_grid = [10, 15, 20]
maxDepth_grid = [5, 8, 10, 15]

# Best model / configuration / score seen across the whole grid.
best_rf, best_cfg, best_score = None, None, float("-inf")

for maxDepth in maxDepth_grid:
    # Per-depth early-stopping state.
    no_improve_rounds, last_best_for_this_depth = 0, float("-inf")

    for numTrees in numTrees_grid:
        model, metrics = train_rf_and_eval(
            train_df=df_train_undersampled,
            val_df=df_val_2018,
            assembler=assembler,
            label_col=label_col,
            num_trees=numTrees,
            max_depth=maxDepth,
        )
        auc_pr = metrics["auc_pr"]

        print(
            f"[RF] depth={maxDepth}, numTrees={numTrees} | "
            f"val AUC-PR={auc_pr:.4f}, AUC-ROC={metrics['auc_roc']:.4f}, "
            f"F0.5={metrics['f0_5']:.4f}"
        )

        # Global best: any strictly higher AUC-PR replaces the incumbent.
        if auc_pr > best_score:
            best_rf, best_score = model, auc_pr
            best_cfg = dict(maxDepth=maxDepth, numTrees=numTrees, metrics=metrics)

        # Per-depth progress: a gain of more than 0.001 AUC-PR resets the
        # counter and moves on to the next tree count.
        if auc_pr > last_best_for_this_depth + 0.001:
            last_best_for_this_depth = auc_pr
            no_improve_rounds = 0
            continue

        # No sufficient gain: a single such round ends the search for this depth.
        no_improve_rounds += 1
        if no_improve_rounds >= 1:
            print(f"  -> early stop on numTrees for depth={maxDepth}")
            break

print("\nBest RF config on val:")
print(best_cfg)


[RF] depth=5, numTrees=10 | val AUC-PR=0.5722, AUC-ROC=0.8318, F0.5=0.5747
[RF] depth=5, numTrees=15 | val AUC-PR=0.5642, AUC-ROC=0.8299, F0.5=0.5557
  -> early stop on numTrees for depth=5
[RF] depth=8, numTrees=10 | val AUC-PR=0.6113, AUC-ROC=0.8485, F0.5=0.6036
[RF] depth=8, numTrees=15 | val AUC-PR=0.6075, AUC-ROC=0.8480, F0.5=0.6026
  -> early stop on numTrees for depth=8
[RF] depth=10, numTrees=10 | val AUC-PR=0.6340, AUC-ROC=0.8576, F0.5=0.6142
[RF] depth=10, numTrees=15 | val AUC-PR=0.6318, AUC-ROC=0.8582, F0.5=0.6211
  -> early stop on numTrees for depth=10
[RF] depth=15, numTrees=10 | val AUC-PR=0.6728, AUC-ROC=0.8726, F0.5=0.6353
[RF] depth=15, numTrees=15 | val AUC-PR=0.6757, AUC-ROC=0.8747, F0.5=0.6412
[RF] depth=15, numTrees=20 | val AUC-PR=0.6796, AUC-ROC=0.8758, F0.5=0.6417

Best RF config on val:
{'maxDepth': 15, 'numTrees': 20, 'metrics': {'auc_pr': 0.6796402894156596, 'auc_roc': 0.8757719392859618, 'f0_5': 0.6417282423660131, 'precision': 0.6499919152552913, 'recall'

[RF] depth=5, numTrees=20 | val AUC-PR=0.5694, AUC-ROC=0.8377, F0.5=0.5662
[RF] depth=5, numTrees=30 | val AUC-PR=0.5678, AUC-ROC=0.8310, F0.5=0.5734
  -> early stop on numTrees for depth=5
[RF] depth=8, numTrees=20 | val AUC-PR=0.6142, AUC-ROC=0.8525, F0.5=0.6121
[RF] depth=8, numTrees=30 | val AUC-PR=0.6130, AUC-ROC=0.8507, F0.5=0.6085
  -> early stop on numTrees for depth=8
[RF] depth=10, numTrees=20 | val AUC-PR=0.6364, AUC-ROC=0.8599, F0.5=0.6249
[RF] depth=10, numTrees=30 | val AUC-PR=0.6357, AUC-ROC=0.8584, F0.5=0.6224
  -> early stop on numTrees for depth=10

Best RF config on val:
{'maxDepth': 10, 'numTrees': 20, 'metrics': {'auc_pr': 0.636412025003994, 'auc_roc': 0.8599396284860885, 'f0_5': 0.6249493724262329, 'precision': 0.643160930363743, 'recall': 0.5613673173509668}}

### Train/Test Final RF

In [0]:
# ------------------------------------------------------------
# Step 1 - pull the winning hyperparameters out of the validation search
# ------------------------------------------------------------
best_depth, best_trees = best_cfg["maxDepth"], best_cfg["numTrees"]

print(
    "Using best RF config from validation:",
    f"  maxDepth  = {best_depth}",
    f"  numTrees  = {best_trees}",
    sep="\n",
)

# ------------------------------------------------------------
# Step 2 - build the final estimator from those hyperparameters.
#          It is fitted from scratch on the training frame below.
# ------------------------------------------------------------
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

rf_final = RandomForestClassifier(
    featuresCol="features",
    labelCol=label_col,
    maxDepth=best_depth,           # selected on the validation set
    numTrees=best_trees,           # selected on the validation set
    maxBins=64,                    # same value as in the grid search
    featureSubsetStrategy="sqrt",  # set explicitly rather than left at the default
    seed=42,                       # fixed seed for repeatable fits
)

# ------------------------------------------------------------
# Step 3 - reuse the same VectorAssembler so the features are
#          assembled exactly as during the search
# ------------------------------------------------------------
rf_final_pipeline = Pipeline(stages=[assembler, rf_final])

# ------------------------------------------------------------
# Step 4 - fit on the training frame only
# ------------------------------------------------------------
rf_final_model = rf_final_pipeline.fit(df_train_undersampled)

# ------------------------------------------------------------
# Step 5 - score the 2019 test split
# ------------------------------------------------------------
rf_test_pred = rf_final_model.transform(df_test_2019)

# ------------------------------------------------------------
# Step 6 - PR-AUC and ROC-AUC from the raw scores, then F0.5 /
#          precision / recall from the hard predictions
# ------------------------------------------------------------
rf_test_auc_pr, rf_test_auc_roc = (
    evaluator.evaluate(rf_test_pred) for evaluator in (evaluator_pr, evaluator_roc)
)
rf_test_f05, rf_test_prec, rf_test_rec = compute_fbeta(
    rf_test_pred, label_col, prediction_col="prediction"
)

# ------------------------------------------------------------
# Step 7 - report the test-set performance
# ------------------------------------------------------------
print(
    "===== Final Random Forest performance on TEST 2019 =====",
    f"AUC-PR   = {rf_test_auc_pr:.4f}",
    f"AUC-ROC  = {rf_test_auc_roc:.4f}",
    f"F0.5     = {rf_test_f05:.4f}",
    f"Precision= {rf_test_prec:.4f}",
    f"Recall   = {rf_test_rec:.4f}",
    sep="\n",
)


Using best RF config from validation:
  maxDepth  = 15
  numTrees  = 20
===== Final Random Forest performance on TEST 2019 =====
AUC-PR   = 0.6639
AUC-ROC  = 0.8711
F0.5     = 0.6376
Precision= 0.6474
Recall   = 0.6013
