# Feature Engineering Pipeline (Experimental)

In [0]:
import builtins

from pyspark.sql import functions as F

In [0]:
# Load the pipeline: %run executes the team's features.py from the workspace path below.
# NOTE(review): the path points into one user's workspace folder — confirm it resolves for other collaborators.
%run /Workspace/Users/m.bakr@berkeley.edu/261-Final-Project/flightdelays/features.py

In [0]:
# Base DBFS directory used to build the data paths below.
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# Read the joined checkpoint (parquet) into the working Spark DataFrame `df`.
joined_checkpoint_path = team_BASE_DIR + "/interim/join_checkpoints/joined__timefeat_seasfeat_cleaned_pr_v2.parquet"
df = spark.read.parquet(joined_checkpoint_path)


In [0]:
# "features" directory under the team base path; handed to the feature helpers as `base_path`.
BASE_FEATURE_PATH = "/".join([team_BASE_DIR, "features"])


In [0]:
from flightdelays.features import *

In [0]:
# Binary label stored as double: 1.0 when DEP_DELAY is at least 15 or CANCELLED equals 1, else 0.0.
is_late_departure = F.col("DEP_DELAY") >= 15
is_cancelled = F.col("CANCELLED") == 1
outcome_flag = F.when(is_late_departure | is_cancelled, 1).otherwise(0)
df = df.withColumn("outcome", outcome_flag.cast("double"))

# Mark the labelled frame for caching (kept as the cell's last expression).
df.cache()

In [0]:
# Column-name prefixes to leave out of the preview below.
cols_to_exclude = ['daily', 'weekly', 'yearly', 'holidays', 'train', 'test', 'mean_dep_delay', 'prop_delayed']

# str.startswith accepts a tuple of prefixes, so one call covers every excluded prefix.
excluded_prefixes = tuple(cols_to_exclude)
selected_cols = [name for name in df.columns if not name.startswith(excluded_prefixes)]

df_selected = df.select(selected_cols)
display(df_selected)

In [0]:
# Run the project's compute_and_join_pagerank_metrics helper on the labelled frame
# with test_mode switched off; the result is kept in `df_eng`.
df_eng = compute_and_join_pagerank_metrics(
    df,
    base_path=BASE_FEATURE_PATH,
    test_mode=False,
)


In [0]:
# Preview the PageRank-related columns on the frame returned by
# compute_and_join_pagerank_metrics. `flights` is not assigned anywhere above this
# cell, so a fresh run raised NameError here; `df_eng` is the frame built just before.
display(df_eng.select('YEAR','QUARTER','ORIGIN','pagerank','in_degree','out_degree'))

In [0]:
# Read the year=2015 partition of the saved airport_pagerank output for inspection.
# The location is derived from BASE_FEATURE_PATH instead of repeating the literal, so it
# follows any change to the base directory (it resolves to the same
# dbfs:/student-groups/Group_4_1/features/airport_pagerank/year=2015/ path as before).
test = spark.read.parquet(f"{BASE_FEATURE_PATH}/airport_pagerank/year=2015/")

In [0]:
# Show the id and graph-metric columns of the `test` frame.
pagerank_preview_cols = ['id', 'pagerank', 'in_degree', 'out_degree']
display(test.select(*pagerank_preview_cols))

In [0]:
# Apply feature steps.
# The chain starts from the labelled frame `df`: `flights` is not assigned anywhere
# earlier in the notebook, so feeding it into the first step raised NameError on a
# fresh run. Each helper takes the running frame and returns the next one.
# NOTE(review): confirm `df` (rather than a sampled subset) is the intended input.
flights = add_local_time_features(df, time_col="sched_depart_date_time", test_mode=True)
flights = compute_and_join_pagerank_metrics(flights, base_path=BASE_FEATURE_PATH, test_mode=True)
flights = generate_lagged_delay_aggregates(flights, base_path=BASE_FEATURE_PATH, test_mode=True)
flights = add_prophet_features_per_airport(flights, base_path=BASE_FEATURE_PATH, test_mode=True)

# Save final feature set for model training
save_features(flights, f"{BASE_FEATURE_PATH}/full_features_test")

In [0]:
# Directory inspection: list what is currently stored under the feature directory.
feature_dir_listing = dbutils.fs.ls(f"{BASE_FEATURE_PATH}/")
display(feature_dir_listing)

In [0]:
import holidays
import datetime

# Sanity check of the holidays package: is 1 January 2025 reported as a US holiday?
us_holidays = holidays.US()
new_years_day = datetime.date(2025, 1, 1)
new_years_day in us_holidays


In [0]:
# Render the `flights` frame for inspection.
display(flights)

In [0]:
# Daily mean of `outcome` for rows with ORIGIN == 'BOS', as a pandas frame with the
# `ds` (date) and `y` (value) columns that Prophet is fitted on further down.
# Uses the `F` alias imported at the top rather than bare col/to_date/avg, which are
# not imported explicitly anywhere in this notebook.
sdf = flights.filter(F.col("ORIGIN") == 'BOS').withColumn("ds", F.to_date("FL_DATE"))
df_pd = (
    sdf.groupBy("ds")
    .agg(F.avg("outcome").alias("y"))
    .orderBy("ds")
    .toPandas()
)

In [0]:
# Quick check that `df_pd` is a pandas DataFrame.
isinstance(df_pd, pd.DataFrame)

In [0]:
# Convert the `ds` column to pandas datetimes.
df_pd["ds"] = df_pd["ds"].pipe(pd.to_datetime)

In [0]:
# US holiday calendar for the years 2014 through 2019, plus its dates as a plain list.
holiday_years = range(2014, 2020)
us_holidays = holidays.US(years=holiday_years)
holiday_dates = [day for day in us_holidays.keys()]

In [0]:
# The same holiday dates as pandas Timestamps, so they can be subtracted from `ds` values.
us_holidays_ts = list(map(pd.Timestamp, us_holidays))


In [0]:
# For every date, the list of absolute day distances to each holiday in `us_holidays_ts`.
# `builtins.abs` replaces `__builtins__.abs`: `__builtins__` is a CPython implementation
# detail that may be a module or a dict depending on where the code runs, whereas the
# `builtins` module is the documented way to reach a builtin by name.
df_pd["holidays"] = df_pd["ds"].apply(
    lambda d: [builtins.abs((d - h).days) for h in us_holidays_ts]
)


In [0]:
# True when the date lies within 3 days (either side) of any holiday in `us_holidays_ts`.
# `builtins.abs` replaces the implementation-specific `__builtins__.abs`, and the
# entries of `us_holidays_ts` are already Timestamps, so they are not wrapped again.
df_pd["is_holiday_week"] = df_pd["ds"].apply(
    lambda d: any(builtins.abs((d - h).days) <= 3 for h in us_holidays_ts)
)

In [0]:
# Count how many dates were flagged True vs False in `is_holiday_week`.
df_pd['is_holiday_week'].value_counts()

In [0]:
# Fit Prophet on `df_pd` with weekly, yearly and daily seasonality switched on and the
# US country holidays added.
# NOTE(review): `df_pd` is built with one row per `ds` date — confirm that a daily
# (within-day) seasonality term is wanted on such a series.
prophet_settings = dict(
    weekly_seasonality=True,
    yearly_seasonality=True,
    daily_seasonality=True,
    holidays_prior_scale=10,
)
model = Prophet(**prophet_settings)
model.add_country_holidays(country_name='US')
model.fit(df_pd)


In [0]:
# Predict over the frame from make_future_dataframe(periods=0) (no extra future rows),
# then left-join the columns of `df_pd` onto the forecast by `ds`.
future = model.make_future_dataframe(periods=0)
forecast = model.predict(future)
merged = forecast.merge(df_pd, on="ds", how="left")

In [0]:
# Display the merged forecast/observation frame.
merged

In [0]:
airport_list = ["JFK", "ORD"]
# NOTE(review): nothing in this cell appends to result_frames — confirm whether the
# per-airport `prophet_df` frames are meant to be collected here.
result_frames = []

# The loop below refers to `base_path` and `spark_df`, neither of which is assigned
# anywhere above in the notebook; bind them to the notebook's own objects.
base_path = BASE_FEATURE_PATH
spark_df = flights


def _dbfs_path_exists(path):
    """Return True if `path` can be listed with dbutils.fs.ls, False if it is missing.

    pathlib is not usable for this check: a "dbfs:/..." URI is not a local filesystem
    path, so Path(path).exists() is always False and the cached output is never read.
    """
    try:
        dbutils.fs.ls(path)
        return True
    except Exception as exc:
        # A missing path is reported as a wrapped java.io.FileNotFoundException;
        # any other failure is re-raised rather than treated as "not cached".
        if "java.io.FileNotFoundException" in str(exc):
            return False
        raise


for airport in airport_list:
        print(f"🔮 Prophet for airport: {airport}")
        save_path = f"{base_path}/prophet_outputs/airport={airport}/"

        if _dbfs_path_exists(save_path):
            prophet_df = spark.read.parquet(save_path)
            print(f"✅ Loaded cached Prophet features for {airport}")
        else:
            # Daily mean of `outcome` for this origin airport, as a pandas frame (ds, y).
            sdf = spark_df.filter(F.col("ORIGIN") == airport).withColumn("ds", F.to_date("FL_DATE"))
            df_pd = sdf.groupBy("ds").agg(F.avg("outcome").alias("y")).orderBy("ds").toPandas()

            if df_pd.shape[0] < 90:
                print(f"⚠️ Not enough data for Prophet at {airport} — skipping.")
                continue
            df_pd["ds"] = pd.to_datetime(df_pd["ds"])

            # holidays.US() created without `years` holds no dates until a year is looked
            # up, so iterating it yields nothing. Build the calendar for the years present
            # in the data, padded by one year each side so the +/-3 day window at the
            # edges still sees neighbouring holidays.
            first_year = int(df_pd["ds"].dt.year.min())
            last_year = int(df_pd["ds"].dt.year.max())
            holiday_ts = [
                pd.Timestamp(h)
                for h in holidays.US(years=range(first_year - 1, last_year + 2))
            ]

            # `builtins.abs` is used explicitly because a wildcard import of Spark SQL
            # functions can shadow the builtin `abs` in the notebook namespace.
            df_pd["holidays"] = df_pd["ds"].apply(
                lambda d: [builtins.abs((d - h).days) for h in holiday_ts]
            )
            df_pd["is_holiday_week"] = df_pd["ds"].apply(
                lambda d: any(builtins.abs((d - h).days) <= 3 for h in holiday_ts)
            )

            model = Prophet(
                weekly_seasonality=True,
                yearly_seasonality=True,
                daily_seasonality=True,
                holidays_prior_scale=10
            )
            model.add_country_holidays(country_name='US')
            model.fit(df_pd)

            # In-sample prediction only (periods=0), joined back to the observed values.
            future = model.make_future_dataframe(periods=0)
            forecast = model.predict(future)
            merged = pd.merge(forecast, df_pd, how="left", on="ds")
            merged["residual"] = merged["y"] - merged["yhat"]
            # Each row is keyed to the date 7 days after the date it was computed for.
            # NOTE(review): presumably a lag to avoid leaking same-day information —
            # confirm; is_holiday_week below is aligned to `ds`, so it is shifted too.
            merged["FL_DATE"] = merged["ds"] + pd.Timedelta(days=7)
            # Extract only past dates & components
            prophet_features = merged[["trend", "FL_DATE", "weekly", "daily", "yearly", "residual", "additive_terms", "multiplicative_terms", "is_holiday_week"]].copy()
            prophet_features["ORIGIN"] = airport
            # Re-align the flag on the forecast dates and treat any unmatched date as False.
            prophet_features["is_holiday_week"] = df_pd.set_index("ds")["is_holiday_week"].reindex(forecast["ds"]).fillna(False).values

            # prophet_features has no `ds` column (FL_DATE was selected above), so it is
            # handed to Spark as-is.
            final = spark.createDataFrame(prophet_features)
            final.write.mode("overwrite").parquet(save_path)
            print(f"💾 Saved Prophet output for {airport}")
            prophet_df = final
