# Modeling Pipeline (Experimental)

## Environment Setup

### Imports

In [0]:
import random
from datetime import timedelta
from typing import List, Dict, Tuple, Any, Union, Callable, Optional

import mlflow
import numpy as np
import pyspark.sql.functions as F
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials, space_eval
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    MultilayerPerceptronClassifier
)
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.sql import DataFrame
from xgboost.spark import SparkXGBClassifier

### Data and Variables

In [0]:
# Variables and directories
# Base path of the source datasets (not referenced again in the visible cells).
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022"
# Base path for this team's storage; checkpoints and interim parquet files are read from below it.
team_BASE_DIR = f"dbfs:/student-groups/Group_4_1"
# Spark checkpoints are written under the team directory.
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/checkpoints")
# File-name suffix selecting which joined dataset is loaded below.
period = "_1y" # one of the following values ("", "_3m", "_6m", "_1y")
# Passed to time_series_cv_folds as the number of folds.
k = 5 # cv folds
# Passed to time_series_cv_folds as the overlap fraction.
overlap = 0.2 # cv overlap

# Datasets
# Joined/cleaned/feature-engineered table for the chosen period; every later cell works from this frame.
df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined{period}_cleaned_engineered_timefeat.parquet")

In [0]:
# Directory Inspection
# List the contents of the join-checkpoint directory that the dataset above is read from.
display(dbutils.fs.ls(f"{team_BASE_DIR}/interim/join_checkpoints/"))

## STEP : Feature Selection and Preparation

In [0]:
# Column names selected as model inputs; `model_tuner` reads this module-level list.
features = [
    "ORIGIN",
    "DEST",
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK"
]

# Name of the target column; used as the classifiers' label column and for class weighting.
label = "outcome"

## STEP : Time-series CV split

In [0]:
def train_test_split_timeseries(df, time_col: str, test_fraction: float = 0.2, verbose: bool = True):
    """
    Hold out the most recent part of a PySpark DataFrame as a test set.

    The cut-off is placed `test_fraction` of the observed time span (in whole
    days, truncated) before the latest value of `time_col`. Rows strictly
    before the cut-off form the training set; rows at or after it form the
    test set.

    Args:
        df (DataFrame): Input PySpark DataFrame.
        time_col (str): Timestamp column name (must be sortable).
        test_fraction (float): Fraction of the time span given to the test set.
        verbose (bool): Print the date boundaries and row counts of each split.

    Returns:
        (train_df, test_df): Tuple of train and test DataFrames.
    """
    # One aggregation yields both ends of the observed time range.
    bounds = df.selectExpr(f"min({time_col})", f"max({time_col})").first()
    earliest, latest = bounds

    span_days = (latest - earliest).days
    holdout_days = int(span_days * test_fraction)
    cutoff = latest - timedelta(days=holdout_days)

    timestamp = F.col(time_col)
    train_df = df.filter(timestamp < cutoff)
    test_df = df.filter(timestamp >= cutoff)

    if not verbose:
        return train_df, test_df

    # Each count() below triggers a Spark job; they only run in verbose mode.
    print(f"📅 Total date range: {earliest.date()} → {latest.date()} ({span_days} days)")
    print(f"✅ Train: {earliest.date()} → {cutoff.date()} ({train_df.count():,} rows)")
    print(f"🧪 Test: {cutoff.date()} → {latest.date()} ({test_df.count():,} rows)")
    return train_df, test_df


In [0]:
def time_series_cv_folds(
    df,
    time_col: str,
    k: int=3,
    blocking: bool=False,
    overlap: float=0.0,
    verbose: bool=False
):
    """
    Split a time-series PySpark DataFrame into k train/validation folds.

    Every fold is a training window immediately followed by a validation
    window of `chunk_size` days. Consecutive folds are shifted by
    `(1 - overlap) * chunk_size` days, so neighbouring validation windows
    share a fraction `overlap` of their length. The chunk size is chosen so
    that k such folds span the whole observed date range.

    Args:
        df (DataFrame): PySpark DataFrame with a timestamp column.
        time_col (str): Name of the timestamp column.
        k (int): Number of folds (must be >= 1).
        blocking (bool): If True, each training window starts at the fold's own
            offset (fixed-length window); if False, training always starts at
            the earliest date (expanding window).
        overlap (float): Fraction of overlap between consecutive validation
            windows, in [0, 1) (e.g. 0.2 = 20% overlap).
        verbose (bool): Whether to print the time splits (triggers row counts).

    Returns:
        List of (train_df, val_df) tuples. May contain fewer than k folds if a
        validation window would start at or after the last date.

    Raises:
        ValueError: If `k` < 1, `overlap` is outside [0, 1), or `time_col`
            has no non-null values.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if not 0.0 <= overlap < 1.0:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")

    # Get time boundaries (single aggregation for both ends)
    min_date, max_date = df.select(F.min(time_col), F.max(time_col)).first()
    if min_date is None or max_date is None:
        raise ValueError(f"Column '{time_col}' has no non-null values to split on")
    n_days = (max_date - min_date).days + 1

    # k folds shifted by (1 - overlap) chunks, each two chunks wide, cover
    # (k - 1) * (1 - overlap) + 2 = k + 1 - overlap * (k - 1) chunks in total.
    total_width = k + 1 - overlap * (k - 1)
    chunk_size = int(np.ceil(n_days / total_width))

    if verbose:
        print(f"Splitting data into {k} folds with {overlap*100:.0f}% overlap")
        print(f"Min date: {min_date}, Max date: {max_date}")
        print(f"{chunk_size:,} days per fold")
        print("************************************************************")

    folds = []
    for i in range(k):
        # Start of this fold's window. The shift must use the same
        # (1 - overlap) factor as the chunk sizing above; otherwise the
        # later folds run past the end of the data.
        window_offset = int(i * (1 - overlap) * chunk_size)
        train_start_offset = window_offset if blocking else 0
        train_end_offset = window_offset + chunk_size
        val_start_offset = train_end_offset
        val_end_offset = val_start_offset + chunk_size

        # Compute actual timestamps
        train_start = min_date + timedelta(days=train_start_offset)
        train_end = min_date + timedelta(days=train_end_offset)
        val_start = min_date + timedelta(days=val_start_offset)
        val_end = min_date + timedelta(days=val_end_offset)

        if val_start >= max_date:
            break
        if val_end > max_date:
            # Clip to just past the last observation so it stays included.
            val_end = max_date + timedelta(days=1)

        # Apply filters (half-open intervals: start inclusive, end exclusive)
        train_df = df.filter((F.col(time_col) >= train_start) & (F.col(time_col) < train_end))
        val_df = df.filter((F.col(time_col) >= val_start) & (F.col(time_col) < val_end))

        if verbose:
            print(f"Fold {i + 1}:")
            print(f"  TRAIN: {train_start.date()} → {train_end.date()} ({train_df.count():,} rows)")
            print(f"  VAL:   {val_start.date()} → {val_end.date()} ({val_df.count():,} rows)")
            print("------------------------------------------------------------")

        folds.append((train_df, val_df))

    return folds

In [0]:
def add_class_weights(df, label_col: str):
    """
    Add a `weight` column that up-weights the positive class.

    Rows whose label equals 1 receive weight `neg / pos` (count of label == 0
    divided by count of label == 1); every other row receives 1.0. The counts
    use the same `== 1` comparison as the weight assignment, so both always
    agree on which rows are positive.

    Args:
        df (DataFrame): Input PySpark DataFrame.
        label_col (str): Name of the label column (expected values 0 and 1).

    Returns:
        (df_weighted, pos_weight): The DataFrame with the added `weight`
        column and the weight applied to positive rows.

    Raises:
        ValueError: If either class has no rows, since the ratio would be
            undefined or meaningless.
    """
    is_positive = F.col(label_col) == 1
    is_negative = F.col(label_col) == 0

    # Single aggregation; rows with null or unexpected labels fall into neither count.
    counts = df.agg(
        F.sum(F.when(is_positive, 1).otherwise(0)).alias("pos"),
        F.sum(F.when(is_negative, 1).otherwise(0)).alias("neg"),
    ).first()
    pos = counts["pos"] or 0  # sum() is null on an empty DataFrame
    neg = counts["neg"] or 0

    if pos == 0 or neg == 0:
        raise ValueError(
            f"Cannot compute class weights for '{label_col}': "
            f"found {neg} negative and {pos} positive rows"
        )

    pos_weight = float(neg) / pos

    df_weighted = df.withColumn("weight", F.when(is_positive, pos_weight).otherwise(1.0))
    return df_weighted, pos_weight

In [0]:
def cv_eval(predictions: DataFrame, label_col="outcome", prediction_col="prediction", metric:str="F2", probability_col="probability"):
    """
    Score a DataFrame of model predictions.

    Args:
        predictions (DataFrame): Transformed DataFrame holding the label,
            prediction and (for "pr") probability columns.
        label_col (str): Name of the true-label column.
        prediction_col (str): Name of the predicted-label column (used by "F2").
        metric (str): "F2" for the F-measure (beta=2) of label 1.0, rounded to
            4 decimals, or "pr" for the area under the precision-recall curve.
        probability_col (str): Name of the probability vector column (used by
            "pr"); element 1 is taken as the positive-class score.

    Returns:
        The requested score.

    Raises:
        ValueError: If `metric` is not "F2" or "pr".
    """
    # Only the metrics object that is actually needed is built.
    if metric == "F2":
        # Both values are cast to float before being handed to MulticlassMetrics.
        prediction_and_label = predictions.select(prediction_col, label_col).rdd.map(
            lambda row: (float(row[prediction_col]), float(row[label_col]))
        )
        f2 = MulticlassMetrics(prediction_and_label).fMeasure(label=1.0, beta=2.0)
        return np.round(f2, 4)

    if metric == "pr":
        score_and_label = predictions.select(label_col, probability_col).rdd.map(
            lambda row: (float(row[probability_col][1]), float(row[label_col]))
        )
        return BinaryClassificationMetrics(score_and_label).areaUnderPR

    raise ValueError(f"Unsupported metric: {metric!r} (expected 'F2' or 'pr')")

In [0]:
def model_tuner(
    model_name: str,
    model_params: Dict[str, Any],
    stages,
    folds: List[Tuple[DataFrame, DataFrame]],
    mlflow_run_name: str = "/Users/m.bakr@berkeley.edu/flight_delay_tuning",
    metric: str = "F2",
    verbose: bool = True,
    features_col: str = "features"
) -> Dict[str, Union[float, str, Dict[str, Any]]]:
    """
    Evaluate one model configuration with time-series cross-validation.

    A pipeline made of the preprocessing `stages` followed by the classifier is
    fitted on each fold's training set and scored on its validation set with
    `cv_eval`. Per-fold scores, their average, and the parameters are logged
    to a single MLflow run.

    Args:
        model_name (str): One of ['logreg', 'rf', 'mlp', 'xgb'].
        model_params (Dict[str, Any]): Parameters to apply to the model.
        stages: Pipeline stages run before the classifier; they are expected to
            produce the vector column named by `features_col`. May be empty or None.
        folds (List of (train_df, val_df)): Time-aware CV folds. The frames are
            expected to carry the `weight` column for models that use it.
        mlflow_run_name (str): Name given to the MLflow run.
        metric (str): Metric passed to `cv_eval` ("F2" or "pr").
        verbose (bool): Whether to print per-fold and average scores.
        features_col (str): Name of the assembled feature vector column.

    Returns:
        Dict with the model name, its parameters, the metric name and the
        average score. The average is stored under both "avg_score" and
        "avg_f2_score"; the latter is kept for existing callers regardless of
        the metric used.

    Raises:
        ValueError: If `model_name` is unsupported or `folds` is empty.
    """

    # Model factory
    model_factory = {
        "logreg": LogisticRegression,
        "rf": RandomForestClassifier,
        "mlp": MultilayerPerceptronClassifier,
        "xgb": SparkXGBClassifier
    }

    if model_name not in model_factory:
        raise ValueError(f"Unsupported model: {model_name}")
    if not folds:
        raise ValueError("folds must contain at least one (train_df, val_df) pair")

    # Column wiring differs per estimator: SparkXGBClassifier uses snake_case
    # parameter names, and MultilayerPerceptronClassifier has no weight column.
    if model_name == "xgb":
        column_args = {
            "features_col": features_col,
            "label_col": label,
            "weight_col": "weight",  # Handles imbalance
        }
    else:
        column_args = {"featuresCol": features_col, "labelCol": label}
        if model_name != "mlp":
            column_args["weightCol"] = "weight"  # Handles imbalance

    model = model_factory[model_name](**column_args, **model_params)

    # Preprocessing must run first so the classifier sees the assembled features.
    pipeline = Pipeline(stages=list(stages or []) + [model])

    scores = []

    with mlflow.start_run(run_name=mlflow_run_name):
        for i, (train_df, val_df) in enumerate(folds):
            fitted_model = pipeline.fit(train_df)
            preds = fitted_model.transform(val_df)
            # Keyword arguments: cv_eval's second positional parameter is the label column.
            score = float(cv_eval(preds, label_col=label, metric=metric))
            scores.append(score)

            if verbose:
                print(f"[Fold {i+1}] {metric} Score: {score:.4f}")

            mlflow.log_metric(f"{metric}_fold_{i+1}", score)

        avg_score = float(np.mean(scores))
        mlflow.log_param("model", model_name)
        mlflow.log_params(model_params)
        mlflow.log_metric(f"avg_{metric}_score", avg_score)

        if verbose:
            print(f"✅ Average {metric} Score: {avg_score:.4f} | Model: {model_name}")

    return {
        "model": model_name,
        "params": model_params,
        "metric": metric,
        "avg_score": avg_score,
        "avg_f2_score": avg_score
    }

In [0]:


def make_hyperopt_objective(
    model_name: str,
    folds: List[Tuple[DataFrame, DataFrame]],
    stages: Optional[List] = None,
    param_space_converter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
    mlflow_experiment_name: str = "Hyperopt_Universal_Tuning",
    verbose: bool = True,
    metric: str = "F2"
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    """
    Creates a Hyperopt-compatible objective function for any PySpark classifier.

    Args:
        model_name (str): One of 'logreg', 'rf', 'mlp', 'xgb'.
        folds (List of (train_df, val_df)): Time-series CV folds.
        stages (List, optional): Preprocessing pipeline stages forwarded to
            `model_tuner`. Defaults to no extra stages.
        param_space_converter (Callable, optional): Converts a Hyperopt sample
            into model params. When omitted, the sample is used as-is.
        mlflow_experiment_name (str): Accepted for interface compatibility;
            it is not currently applied (runs are named `hyperopt_<model_name>`).
        verbose (bool): Logging toggle.
        metric (str): Metric forwarded to `model_tuner` and optimised.

    Returns:
        Callable that can be passed as fn to hyperopt.fmin(). It returns the
        negated average score as "loss", so minimising it maximises the score.
    """
    # Normalise once so every trial gets a real list (model_tuner concatenates it).
    pipeline_stages = list(stages) if stages else []
    to_model_params = param_space_converter if param_space_converter is not None else dict

    def objective(sampled_params: Dict[str, Any]) -> Dict[str, Any]:
        # Convert sampled param space to Spark-friendly params
        model_params = to_model_params(sampled_params)

        result = model_tuner(
            model_name=model_name,
            model_params=model_params,
            stages=pipeline_stages,
            folds=folds,
            mlflow_run_name=f"hyperopt_{model_name}",
            metric=metric,
            verbose=verbose
        )

        return {
            "loss": -result["avg_f2_score"],  # Minimize the negated average score
            "status": STATUS_OK,
            "params": result["params"]
        }

    return objective


## Testing

### Testing Time Series CV function

In [0]:
# Hold out the most recent 20% of the time span (by scheduled UTC departure) as the test set.
train_df, test_df = train_test_split_timeseries(
    df=df,
    time_col="sched_depart_utc",
    test_fraction=0.2,
    verbose=True
)

In [0]:
# Add the `weight` column to the training data before it is cut into folds, so every fold carries it.
train_df, pos_weight = add_class_weights(train_df, label_col=label)

In [0]:
# Testing Time Series CV function
# Build the CV folds from the training split only, using the k / overlap settings defined at the top.
folds = time_series_cv_folds(
    train_df,
    time_col="sched_depart_utc",
    k=k,
    overlap=overlap,
    blocking=True,
    verbose=True
)

## Experiments

### Random Forest Experiment

In [0]:
# Define random search param grid: 10 independently sampled configurations
param_grid = [
    {
        "numTrees": random.choice([50, 100, 200]),
        "maxDepth": random.choice([5, 10, 15]),
        "featureSubsetStrategy": random.choice(["auto", "sqrt", "log2"]),
    }
    for _ in range(10)
]

# Define other stages
# NOTE(review): this list is empty; the classifiers read an assembled feature
# vector column, so feature-preparation stages presumably belong here - confirm.
stages = []

# Run custom tuner: model_tuner evaluates ONE configuration per call and
# returns a result dict, so score every sampled configuration and keep the best.
search_results = [
    model_tuner(
        model_name="rf",
        model_params=params,
        stages=stages,
        folds=folds,
        mlflow_run_name="/Users/m.bakr@berkeley.edu/flight_delay_tuning",
        verbose=True,
    )
    for params in param_grid
]

best_result = max(search_results, key=lambda result: result["avg_f2_score"])
best_model = best_result["model"]
best_params = best_result["params"]
best_score = best_result["avg_f2_score"]

print("Best F2 Score:", best_score)
print("Best Params:", best_params)

In [0]:
# Define Hyperopt search space
rf_space = {
    "numTrees": hp.choice("numTrees", [50, 100, 200]),
    "maxDepth": hp.quniform("maxDepth", 5, 15, 1),
    "featureSubsetStrategy": hp.choice("featureSubsetStrategy", ["auto", "sqrt", "log2"])
}

def rf_param_mapper(sampled: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a Hyperopt sample into RandomForestClassifier keyword arguments."""
    return {
        "numTrees": int(sampled["numTrees"]),
        "maxDepth": int(sampled["maxDepth"]),  # quniform yields floats
        "featureSubsetStrategy": sampled["featureSubsetStrategy"]
    }

objective = make_hyperopt_objective(
    model_name="rf",
    folds=folds,
    stages=stages,
    param_space_converter=rf_param_mapper,
    mlflow_experiment_name="RF_Hyperopt_Flight_Delay"
)

trials = Trials()

best = fmin(
    fn=objective,
    space=rf_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)

# fmin reports hp.choice parameters as option indices; decode them back into
# the actual values before printing.
best_rf_params = rf_param_mapper(space_eval(rf_space, best))

print("Best Hyperopt Config:", best_rf_params)


### Logistic Regression

In [0]:
# Search space: regularisation strength and L1/L2 mixing ratio
logreg_space = {
    "regParam": hp.uniform("regParam", 0.0, 0.5),
    "elasticNetParam": hp.uniform("elasticNetParam", 0.0, 1.0)
}

def logreg_param_mapper(sampled):
    """Turn a Hyperopt sample into LogisticRegression keyword arguments."""
    return {
        "regParam": sampled["regParam"],
        "elasticNetParam": sampled["elasticNetParam"],
        "maxIter": 100  # fixed, not tuned
    }

logreg_obj = make_hyperopt_objective(
    model_name="logreg",
    folds=folds,
    stages=stages,  # required by make_hyperopt_objective; same stages as the RF cells
    param_space_converter=logreg_param_mapper,
    mlflow_experiment_name="LogReg_Hyperopt",
    verbose=True
)

best_logreg = fmin(
    fn=logreg_obj,
    space=logreg_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

print("Best Logistic Regression params:", best_logreg)

### XGBoost

In [0]:
# Search space; the labels are only Hyperopt sample names and are mapped to
# estimator parameter names in xgb_param_mapper.
xgb_space = {
    "eta": hp.uniform("eta", 0.01, 0.3),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "num_round": hp.quniform("num_round", 50, 200, 10)
}

def xgb_param_mapper(sampled):
    """Turn a Hyperopt sample into SparkXGBClassifier keyword arguments."""
    return {
        # The Spark estimator follows the scikit-learn style names:
        # learning_rate (eta) and n_estimators (number of boosting rounds).
        "learning_rate": sampled["eta"],
        "max_depth": int(sampled["max_depth"]),  # quniform yields floats
        "subsample": sampled["subsample"],
        "colsample_bytree": sampled["colsample_bytree"],
        "n_estimators": int(sampled["num_round"]),
        # No "objective" entry: SparkXGBClassifier chooses it from the label
        # column and does not accept a custom one.
        "eval_metric": "logloss",
        "num_workers": 2,
        "verbosity": 0
    }
xgb_obj = make_hyperopt_objective(
    model_name="xgb",
    folds=folds,
    stages=stages,  # required by make_hyperopt_objective; same stages as the RF cells
    param_space_converter=xgb_param_mapper,
    mlflow_experiment_name="XGBoost_Hyperopt",
    verbose=True
)

best_xgb = fmin(
    fn=xgb_obj,
    space=xgb_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

print("Best XGBoost params:", best_xgb)

### MLP

In [0]:
mlp_space = {
    "hidden_layers": hp.choice("hidden_layers", [[64, 32], [128, 64], [100, 50]]),
    "stepSize": hp.uniform("stepSize", 0.01, 0.3),
    "maxIter": hp.choice("maxIter", [100, 200]),
    "blockSize": hp.choice("blockSize", [64, 128])
}

# Size of the network's input layer; it must equal the length of the assembled
# feature vector. An existing notebook-level `input_dim` is respected.
# NOTE(review): the fallback assumes one vector slot per entry in `features`;
# set `input_dim` explicitly if the stages expand columns (e.g. one-hot encoding).
input_dim = globals().get("input_dim", len(features))

def mlp_param_mapper(sampled):
    """Turn a Hyperopt sample into MultilayerPerceptronClassifier keyword arguments."""
    return {
        # list(...) because Hyperopt may hand the chosen option back as a tuple;
        # the trailing 2 is the size of the output layer.
        "layers": [input_dim] + list(sampled["hidden_layers"]) + [2],
        "stepSize": sampled["stepSize"],
        "maxIter": int(sampled["maxIter"]),
        "blockSize": int(sampled["blockSize"])
    }

mlp_obj = make_hyperopt_objective(
    model_name="mlp",
    folds=folds,
    stages=stages,  # required by make_hyperopt_objective; same stages as the RF cells
    param_space_converter=mlp_param_mapper,
    mlflow_experiment_name="MLP_Hyperopt",
    verbose=True
)

best_mlp = fmin(
    fn=mlp_obj,
    space=mlp_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

# fmin reports hp.choice parameters as option indices; decode them back into
# the actual values before printing.
print("Best MLP params:", space_eval(mlp_space, best_mlp))