# Modeling Hyperparameter Tuning Pipeline

## Environment Setup

### Imports

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import itertools
import random

from hyperopt import Trials, fmin, hp, space_eval, tpe

from flightdelays.features import select_features
from flightdelays.modeling.train import *

### Data and Variables

In [0]:
# Storage locations
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022"
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/checkpoints")

# Dataset slice and cross-validation settings
period = ""  # one of the following values ("", "_3m", "_6m", "_1y")
k = 5  # cv folds
overlap = 0.2  # cv overlap

# Datasets
# NOTE(review): every documented `period` value is either empty or starts with "_",
# so the file name below always contains "joined__..." (double underscore).
# Confirm against the directory listing that this matches the stored files.
joined_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr_v2.parquet"
df = spark.read.parquet(joined_path)

In [0]:
# Show what is stored under the join-checkpoint directory the dataset is read from
join_checkpoint_dir = f"{team_BASE_DIR}/interim/join_checkpoints/"
display(dbutils.fs.ls(join_checkpoint_dir))

## Step 1: Feature Selection and Preparation

In [0]:
# Split the dataset's columns into the two groups used by the rest of the notebook.
# presumably both are lists of column names — confirm in flightdelays.features.select_features
numeric_cols, categorical_cols = select_features(df)
# Combined raw feature list. Note that the Random Forest preprocessing cell later
# rebinds `features` to the numeric columns plus the one-hot "_vec" columns.
features = numeric_cols + categorical_cols
# Name of the target column passed to the weighting and tuning helpers below
label = "outcome"

## Step 2: Train Test Split

In [0]:
# Date-based train/test split on the scheduled departure timestamp.
# presumably rows from `test_start` up to `max_date` form the test set and earlier
# rows the training set — confirm in train_test_split_timeseries.
train_df, test_df = train_test_split_timeseries(
    df=df,
    time_col="sched_depart_utc",
    split_method="date",
    test_start="2019-01-01",
    max_date="2020-01-01",
    verbose=True
)

In [0]:
# Attach class weights to the training split, using `label` as the target column.
# NOTE(review): this rebinds train_df from itself, so re-running only this cell feeds
# the already-processed frame back into add_class_weights; re-run the split cell first.
# `pos_weight` is not referenced again in the cells shown in this notebook.
train_df, pos_weight = add_class_weights(train_df, label_col=label)

## Step 3: Time-series CV split

In [0]:
# Columns that are kept in every fold in addition to the model features:
# the label and the timestamp used for the time-based split.
keep_me = ["outcome", "sched_depart_utc"]
filter_cols = [*keep_me, *numeric_cols, *categorical_cols]
# The keys are plain strings containing a literal "{i}" (they are not f-strings).
# presumably time_series_cv_folds fills in the fold index and renames the matching
# per-fold column to the generic name on the right — confirm in its implementation.
rename_map = {
        "daily_{i}": "daily",
        "weekly_{i}": "weekly",
        "yearly_{i}": "yearly",
        "holidays_{i}": "holidays",
        "train_{i}": "pagerank"
    }
# Build the time-series CV folds from the training split, using the fold count (k)
# and overlap set in the configuration cell. `downsample` is passed as the
# sampling function.
folds = time_series_cv_folds(
    train_df,
    time_col="sched_depart_utc",
    k=k,
    overlap=overlap,
    blocking=True,
    keep_cols= filter_cols,
    sampling_fn=downsample,
    rename_seasonals=True,
    rename_map=rename_map,
    verbose=True
)

## Step 4: Experiments

### Random Forest Experiment

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler

# Preprocessing stages, collected in the order they must run
stages = []

# 1. Index, then one-hot encode, every categorical column
for column in categorical_cols:
    index_col = column + "_index"
    vec_col = column + "_vec"
    indexer = StringIndexer(inputCol=column, outputCol=index_col, handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=index_col, outputCol=vec_col, handleInvalid="keep")
    stages.extend([indexer, encoder])

# 2. Model inputs: the numeric columns plus the encoded categorical vectors
categorical_vec_columns = [name + "_vec" for name in categorical_cols]
features = numeric_cols + categorical_vec_columns

# 3. Assemble all inputs into a single "features" vector
# NOTE(review): handleInvalid="skip" — per the Spark ML docs, rows with invalid
# (null/NaN) inputs are filtered out; confirm this silent row loss is intended.
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="skip")

# 4. Min-max scale the assembled vector into "features_final"
scaler = MinMaxScaler(inputCol="features", outputCol="features_final")

stages.extend([assembler, scaler])

In [0]:
# Random search over the Random Forest grid.
# Candidates are drawn WITHOUT replacement from a seeded generator so that
#   (a) Restart & Run All evaluates the same configurations every time, and
#   (b) no configuration is cross-validated twice (with 27 combinations and 10
#       independent draws, duplicates were the likely outcome).
RANDOM_SEARCH_SEED = 42
N_RANDOM_CONFIGS = 10  # number of candidate configurations to evaluate

rf_grid_values = {
    "numTrees": [50, 100, 200],
    "maxDepth": [5, 10, 15],
    "featureSubsetStrategy": ["auto", "sqrt", "log2"],
}

# Every combination of the grid values, one dict per configuration
all_rf_configs = [
    dict(zip(rf_grid_values, combo))
    for combo in itertools.product(*rf_grid_values.values())
]
# A private Random instance keeps the global random state untouched
random.Random(RANDOM_SEARCH_SEED).shuffle(all_rf_configs)
param_grid = all_rf_configs[:N_RANDOM_CONFIGS]  # slicing caps at the grid size

# Run custom tuner
best_model, best_params, best_score = model_tuner(
    model_class=RandomForestClassifier,
    param_grid_list=param_grid,
    folds=folds,
    experiment_name="/Users/m.bakr@berkeley.edu/flight_delay_tuning",
    verbose=True
)

print("Best F2 Score:", best_score)
print("Best Params:", best_params)

In [0]:
# Hyperopt search space for the Random Forest.
# numTrees and featureSubsetStrategy are picked from fixed option lists (hp.choice);
# maxDepth is drawn with hp.quniform and cast to int by rf_param_mapper below.
rf_space = {
    "numTrees": hp.choice("numTrees", [20, 40, 60]),
    "maxDepth": hp.quniform("maxDepth", 5, 12, 1),
    "featureSubsetStrategy": hp.choice("featureSubsetStrategy", ["auto", "sqrt", "log2"])
}

def rf_param_mapper(sampled: Dict[str, Any]) -> Dict[str, Any]:
    """Turn one sampled point of the RF search space into estimator parameters.

    Args:
        sampled: Mapping with the keys "numTrees", "maxDepth" and
            "featureSubsetStrategy".

    Returns:
        A new dict with "numTrees" and "maxDepth" converted to ``int`` and
        "featureSubsetStrategy" passed through unchanged.

    Raises:
        KeyError: If one of the three keys is missing from ``sampled``.
    """
    integer_keys = ("numTrees", "maxDepth")
    params = {key: int(sampled[key]) for key in integer_keys}
    params["featureSubsetStrategy"] = sampled["featureSubsetStrategy"]
    return params

# Build the Hyperopt objective for the Random Forest over the CV folds, reusing the
# preprocessing stages and reading the scaled "features_final" vector.
objective = make_hyperopt_objective(
    model_name="rf",
    folds=folds,
    stages=stages,
    features="features_final",
    label=label,
    param_space_converter=rf_param_mapper,
    mlflow_experiment_name="RF_Hyperopt_Flight_Delay_mas_test"
)

# Keep the Trials handle so every evaluation can be inspected after the search
trials = Trials()

best = fmin(
    fn=objective,
    space=rf_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials
)

# fmin reports hp.choice parameters as the INDEX of the chosen option (e.g.
# numTrees=1 means the second entry, 40), not the value itself. space_eval maps the
# raw result back onto the search space; rf_param_mapper then applies the int casts.
best_rf_params = rf_param_mapper(space_eval(rf_space, best))
print("Best Hyperopt Config:", best_rf_params)


### Logistic Regression

In [0]:
# Hyperopt search space for logistic regression: both parameters are drawn with
# hp.uniform — regParam between 0.0 and 0.5, elasticNetParam between 0.0 and 1.0.
logreg_space = {
    "regParam": hp.uniform("regParam", 0.0, 0.5),
    "elasticNetParam": hp.uniform("elasticNetParam", 0.0, 1.0)
}

def logreg_param_mapper(sampled):
    """Turn one sampled point of the LogReg search space into estimator parameters.

    Args:
        sampled: Mapping with the keys "regParam" and "elasticNetParam".

    Returns:
        A new dict carrying both sampled values unchanged plus a fixed
        "maxIter" of 100.

    Raises:
        KeyError: If either sampled key is missing.
    """
    tuned_keys = ("regParam", "elasticNetParam")
    params = {key: sampled[key] for key in tuned_keys}
    params["maxIter"] = 100  # not tuned; held constant for every trial
    return params

# Hyperopt objective for logistic regression over the CV folds.
# NOTE(review): unlike the Random Forest objective, no stages/features/label are
# passed here — confirm make_hyperopt_objective's defaults are what is intended.
logreg_obj = make_hyperopt_objective(
    model_name="logreg",
    folds=folds,
    param_space_converter=logreg_param_mapper,
    mlflow_experiment_name="LogReg_Hyperopt",
    verbose=True,
)

# Named Trials object so the individual evaluations stay inspectable afterwards
logreg_trials = Trials()
best_logreg = fmin(
    fn=logreg_obj,
    space=logreg_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=logreg_trials,
)

print("Best Logistic Regression params:", best_logreg)

### XGBoost

In [0]:
# Hyperopt search space for XGBoost.
# eta, subsample and colsample_bytree are drawn with hp.uniform; max_depth and
# num_round use hp.quniform and are cast to int by xgb_param_mapper below.
xgb_space = {
    "eta": hp.uniform("eta", 0.01, 0.3),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "num_round": hp.quniform("num_round", 50, 200, 10)
}

def xgb_param_mapper(sampled):
    """Turn one sampled point of the XGBoost search space into model parameters.

    Args:
        sampled: Mapping with the keys "eta", "max_depth", "subsample",
            "colsample_bytree" and "num_round".

    Returns:
        A new dict with the sampled values ("max_depth" and "num_round" converted
        to ``int``) followed by the fixed settings "objective", "eval_metric",
        "num_workers" and "verbosity".

    Raises:
        KeyError: If one of the sampled keys is missing.
    """
    tuned = {
        "eta": sampled["eta"],
        "max_depth": int(sampled["max_depth"]),
        "subsample": sampled["subsample"],
        "colsample_bytree": sampled["colsample_bytree"],
        "num_round": int(sampled["num_round"]),
    }
    # Settings held constant across all trials
    fixed = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "num_workers": 2,
        "verbosity": 0,
    }
    return {**tuned, **fixed}
# Hyperopt objective for XGBoost over the CV folds.
# NOTE(review): unlike the Random Forest objective, no stages/features/label are
# passed here — confirm make_hyperopt_objective's defaults are what is intended.
xgb_obj = make_hyperopt_objective(
    model_name="xgb",
    folds=folds,
    param_space_converter=xgb_param_mapper,
    mlflow_experiment_name="XGBoost_Hyperopt",
    verbose=True,
)

# Named Trials object so the individual evaluations stay inspectable afterwards
xgb_trials = Trials()
best_xgb = fmin(
    fn=xgb_obj,
    space=xgb_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=xgb_trials,
)

print("Best XGBoost params:", best_xgb)

### MLP

In [0]:
# Hyperopt search space for the MLP.
# hidden_layers, maxIter and blockSize are picked from fixed option lists
# (hp.choice); stepSize is drawn with hp.uniform between 0.01 and 0.3.
# NOTE(review): Hyperopt may hand list-valued options such as [64, 32] back as
# tuples when sampling — verify that the mapper's list concatenation handles that.
mlp_space = {
    "hidden_layers": hp.choice("hidden_layers", [[64, 32], [128, 64], [100, 50]]),
    "stepSize": hp.uniform("stepSize", 0.01, 0.3),
    "maxIter": hp.choice("maxIter", [100, 200]),
    "blockSize": hp.choice("blockSize", [64, 128])
}

def mlp_param_mapper(sampled):
    """Turn one sampled point of the MLP search space into estimator parameters.

    The layer specification is built as ``[input_dim, *hidden_layers, 2]``.
    ``input_dim`` is read from the notebook's global namespace; it is not defined
    in any cell shown here, so it must be set before the search is started
    (presumably to the size of the assembled feature vector — TODO confirm).

    Args:
        sampled: Mapping with the keys "hidden_layers" (any sequence of layer
            sizes), "stepSize", "maxIter" and "blockSize".

    Returns:
        A new dict with the keys "layers", "stepSize", "maxIter" and "blockSize".

    Raises:
        KeyError: If one of the sampled keys is missing.
        NameError: If the global ``input_dim`` has not been defined.
    """
    # Hyperopt yields list-valued hp.choice options as tuples, and list + tuple
    # raises TypeError — normalise to a list before concatenating.
    hidden_layers = list(sampled["hidden_layers"])
    return {
        "layers": [input_dim] + hidden_layers + [2],
        "stepSize": sampled["stepSize"],
        "maxIter": int(sampled["maxIter"]),
        "blockSize": int(sampled["blockSize"]),
    }

# Hyperopt objective for the MLP over the CV folds.
# NOTE(review): unlike the Random Forest objective, no stages/features/label are
# passed here — confirm make_hyperopt_objective's defaults are what is intended.
mlp_obj = make_hyperopt_objective(
    model_name="mlp",
    folds=folds,
    param_space_converter=mlp_param_mapper,
    mlflow_experiment_name="MLP_Hyperopt",
    verbose=True
)

# Named Trials object so the individual evaluations stay inspectable afterwards
mlp_trials = Trials()

best_mlp = fmin(
    fn=mlp_obj,
    space=mlp_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=mlp_trials
)

# fmin reports hp.choice parameters (hidden_layers, maxIter, blockSize) as the INDEX
# of the chosen option, not the value. space_eval maps the raw result back onto the
# search space so the printed configuration shows the actual settings.
best_mlp_config = space_eval(mlp_space, best_mlp)
print("Best MLP params:", best_mlp_config)