# Feature Engineering Pipeline (Experimental)

In [0]:
from pyspark.sql import functions as F

In [0]:
# Run features.py so the feature-engineering helpers used in the cells below are available in this notebook
%run /Workspace/Users/m.bakr@berkeley.edu/261-Final-Project/flightdelays/features.py

In [0]:
# Storage locations: the team root, and the directory under it that holds feature outputs
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
BASE_FEATURE_PATH = f"{team_BASE_DIR}/features"

# Read the joined flights/weather checkpoint (2015–2021) that later cells build features on
joined_checkpoint_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather_v1.parquet"
flights = spark.read.parquet(joined_checkpoint_path)

In [0]:
# DANGER: recursively deletes everything under the feature directory so the
# features can be recreated. This runs unconditionally every time the cell executes.
features_root = f"{BASE_FEATURE_PATH}/"
dbutils.fs.rm(features_root, recurse=True)

In [0]:
# Binary label stored as a double: 1.0 when DEP_DELAY is at least 15 or the
# flight is marked cancelled, 0.0 otherwise.
departed_late = F.col("DEP_DELAY") >= 15
was_cancelled = F.col("CANCELLED") == 1
outcome_flag = F.when(departed_late | was_cancelled, 1).otherwise(0)
flights = flights.withColumn("outcome", outcome_flag.cast("double"))

# Mark the labelled DataFrame for caching
flights.cache()

In [0]:
# Feature steps run in order; each call returns the DataFrame handed to the next one.
flights = add_local_time_features(flights, time_col="sched_depart_date_time", test_mode=True)

# The remaining steps all take the same keyword arguments, so apply them in a loop.
for feature_step in (
    compute_and_join_pagerank_metrics,
    generate_lagged_delay_aggregates,
    add_prophet_features_per_airport,
):
    flights = feature_step(flights, base_path=BASE_FEATURE_PATH, test_mode=True)

# Persist the resulting feature set for model training
full_features_path = f"{BASE_FEATURE_PATH}/full_features_test"
save_features(flights, full_features_path)

In [0]:
# Show what currently exists directly under the feature directory
feature_dir_listing = dbutils.fs.ls(f"{BASE_FEATURE_PATH}/")
display(feature_dir_listing)

In [0]:
# Quick check of the `holidays` package: is 1 January 2025 in the US holiday calendar?
import datetime
import holidays

us_holidays = holidays.US()
new_years_day = datetime.date(2025, 1, 1)
new_years_day in us_holidays


In [0]:
# NOTE(review): this repeats the Prophet step and the save already performed by the
# earlier cell that applies all feature steps. On a top-to-bottom run, `flights`
# reaches this cell with that step already applied — confirm the repeat is intended.
prophet_step_kwargs = {"base_path": BASE_FEATURE_PATH, "test_mode": True}
flights = add_prophet_features_per_airport(flights, **prophet_step_kwargs)

# Write the feature set to the same location as before
save_features(flights, f"{BASE_FEATURE_PATH}/full_features_test")

In [0]:
# Read the saved feature set back and show only the rows where is_holiday_week is True
flights_full_feat_test = spark.read.parquet(f"{BASE_FEATURE_PATH}/full_features_test")
holiday_week_rows = flights_full_feat_test.where(F.col("is_holiday_week") == True)
display(holiday_week_rows)