## Testing

In [0]:
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import time 

In [0]:
# Base DBFS folder for Group_4_1.
folder_path = "dbfs:/student-groups/Group_4_1"

# Which "*_clean.parquet" extract to load. Swap in 'parquet_airlines_data_1y'
# for the 1-year extract; the 3-month extract is the active choice.
dataset = 'parquet_airlines_data_3m'

df = spark.read.parquet("{}/interim/{}_clean.parquet".format(folder_path, dataset))

In [0]:
# Base DBFS folder for the team (plain string: nothing to interpolate).
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# Tell Spark where to put checkpoint files.
spark.sparkContext.setCheckpointDir("{}/modeling_checkpoints".format(team_BASE_DIR))

period = "1y"  # one of the following values ("", "3m", "6m", "1y")

# Joined flights + weather checkpoint for the selected period.
ydf = spark.read.parquet(
    "{}/interim/join_checkpoints/joined_flights_weather_{}_v1.parquet".format(team_BASE_DIR, period)
)

In [0]:
# Preview the DataFrame loaded from the "*_clean.parquet" extract.
display(df)

In [0]:
# Number of non-null TAIL_NUM records per tail number, least frequent first.
tail_freq = (
    df.groupBy("TAIL_NUM")
      .agg(F.count("TAIL_NUM").alias("count"))
      .orderBy("count")
)
display(tail_freq)

In [0]:
# Narrow view of one Delta aircraft (carrier "DL", tail "D942DN"), in chronological order.
dd = (
    df.select(
        df.origin_iata,
        df.dest_iata,
        df.CRS_ELAPSED_TIME,
        df.DEP_DELAY,
        df.ORIGIN_CITY_NAME,
        df.DEST_CITY_NAME,
        df.dep_datetime,
        df.arr_datetime,
    )
    .filter((df.OP_UNIQUE_CARRIER == "DL") & (df.TAIL_NUM == "D942DN"))
    .orderBy(df.TAIL_NUM, df.FL_DATE, df.dep_datetime)
)

# EX1: one Delta plane, 2 days, 9 trips (inactive alternative):
# dx = df.where(df.OP_UNIQUE_CARRIER=="DL").where(df.TAIL_NUM=="D942DN").orderBy(df.TAIL_NUM, df.FL_DATE, df.dep_datetime)
# EX2 (active): all records for tail number N78008, in chronological order.
dx = (
    df.filter(df.TAIL_NUM == "N78008")
      .orderBy(df.TAIL_NUM, df.FL_DATE, df.dep_datetime)
)
display(dx)


From prior flight we would want: 
- prior_origin "ORIGIN" / prior_origin_name "ORIGIN_CITY_NAME"
- prior_departure "dep_datetime" 
- prior_is_delay

Nice to have
- has it arrived (prior flight's arr_datetime <= expected departure time minus 2 hrs)

Assumptions:
- The tail number is assigned to a flight at least 2 hours before its expected departure
- The prior flight is unique and consistent with respect to tail number and airport location
- The prior flight arrived within 24 hours, and the prior destination must match the current departure airport

Addressing Data Leakage:
- The prior flight's departure time must be at least 2 hours before the current flight's expected departure. Using the actual departure time instead because the expected departure time is not available here



In [0]:
## DO NOT PURSUE JOIN METHOD. 
# CANNOT PINPOINT JOIN ON "PRIOR" FLIGHT WITH VARIABLE TIME BASED COMPONENT. WOULD NEED TO POST FILTER EXTRANEOUS JOIN ROWS 

# should use current flight estimated departure time 

# result = dx.alias("cf").join(dx.alias("pf"),[col('cf.tail_num') == col('pf.tail_num'), 
#                                              col('cf.ORIGIN') == col('pf.DEST'),
#                                              col('cf.FL_DATE')==col('pf.FL_DATE')
#                                              ],'left')\
#     .select("cf.tail_num","cf.ORIGIN_CITY_NAME", "cf.DEST_CITY_NAME", "cf.dep_datetime", "cf.arr_datetime", col("pf.ORIGIN_CITY_NAME").alias("PRIOR_ORIGIN"))

# result = dx.alias("cf").select("cf.tail_num","cf.ORIGIN_CITY_NAME", "cf.DEST_CITY_NAME", "cf.dep_datetime", "cf.arr_datetime")


# display(result)

In [0]:
# Example 1: one Delta plane based in Atlanta, 2 days, 9 trips
## TODO: more conditions are needed to handle edge cases

dxex1 = (
    df.filter((df.OP_UNIQUE_CARRIER == "DL") & (df.TAIL_NUM == "D942DN"))
      .orderBy(df.TAIL_NUM, df.FL_DATE, df.dep_datetime)
)

# Every lag() below reads the previous departure of the same tail number.
_ex1_window = Window.partitionBy("tail_num").orderBy("dep_datetime")

resultex1 = (
    dxex1
    .withColumn("priorflight_origin", F.lag("ORIGIN_CITY_NAME").over(_ex1_window))
    .withColumn("priorflight_deptime", F.lag("dep_datetime").over(_ex1_window))
    .withColumn("priorflight_arrtime", F.lag("arr_datetime").over(_ex1_window))
    # "Present moment" is 2 hours before this flight's departure.
    .withColumn(
        "presentmoment",
        (F.col("dep_datetime") - F.expr("INTERVAL 2 HOURS")).cast("timestamp"),
    )
    # Keep prior-flight times only when they fall at or before the present moment.
    .withColumn(
        "priorflight_deptime_final",
        F.when(
            F.col("priorflight_deptime") <= F.col("presentmoment"),
            F.col("priorflight_deptime"),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "priorflight_arrtime_final",
        F.when(
            F.col("priorflight_arrtime") <= F.col("presentmoment"),
            F.col("priorflight_arrtime"),
        ).otherwise(F.lit(None)),
    )
    .select(
        "tail_num",
        "ORIGIN_CITY_NAME",
        "DEST_CITY_NAME",
        "priorflight_origin",
        "presentmoment",
        "dep_datetime",
        "arr_datetime",
        "priorflight_deptime",
        "priorflight_arrtime",
        "priorflight_deptime_final",
        "priorflight_arrtime_final",
    )
)

display(resultex1)


In [0]:
start_time = time.time()

# Example 2: repetitive NJ, TX, HI routes. Some flights are missing: the airplane
# shows up at locations with no record of how it got there.

dx = (
    df.filter(df.TAIL_NUM == "N78008")
      .orderBy(df.TAIL_NUM, df.FL_DATE, df.dep_datetime)
)

# Every lag() below reads the previous departure of the same tail number.
_ex2_window = Window.partitionBy("tail_num").orderBy("dep_datetime")
# The prior leg must have ended at this flight's origin ...
_ex2_same_airport = F.col("ORIGIN_CITY_NAME") == F.col("priorflight_dest")
# ... and must have arrived no earlier than 26 hours before this departure.
_ex2_recent_arrival = F.col("priorflight_arrtime") >= F.col("dayprior_presentmoment")

result = (
    dx
    .withColumn("priorflight_origin", F.lag("ORIGIN_CITY_NAME").over(_ex2_window))
    .withColumn("priorflight_dest", F.lag("DEST_CITY_NAME").over(_ex2_window))
    .withColumn("priorflight_deptime", F.lag("dep_datetime").over(_ex2_window))
    .withColumn("priorflight_arrtime", F.lag("arr_datetime").over(_ex2_window))
    .withColumn(
        "presentmoment",
        (F.col("dep_datetime") - F.expr("INTERVAL 2 HOURS")).cast("timestamp"),
    )
    .withColumn(
        "dayprior_presentmoment",
        (F.col("dep_datetime") - F.expr("INTERVAL 26 HOURS")).cast("timestamp"),
    )
    .withColumn(
        "priorflight_deptime_final",
        F.when(
            (F.col("priorflight_deptime") <= F.col("presentmoment"))
            & _ex2_same_airport
            & _ex2_recent_arrival,
            F.col("priorflight_deptime"),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "priorflight_arrtime_final",
        F.when(
            (F.col("priorflight_arrtime") <= F.col("presentmoment"))
            & _ex2_same_airport
            & _ex2_recent_arrival,
            F.col("priorflight_arrtime"),
        ).otherwise(F.lit(None)),
    )
    .select(
        "tail_num",
        "ORIGIN_CITY_NAME",
        "DEST_CITY_NAME",
        "priorflight_origin",
        # "presentmoment", "dayprior_presentmoment"
        "dep_datetime",
        "arr_datetime",
        # "priorflight_deptime", "priorflight_arrtime",
        "priorflight_deptime_final",
        "priorflight_arrtime_final",
    )
)

display(result)


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")

In [0]:
## RUN RESULT FOR ENTIRE 3MONTHS

start_time = time.time()

dx = df

# Every lag() below reads the previous departure of the same tail number.
_full_window = Window.partitionBy("tail_num").orderBy("dep_datetime")
# The prior leg must have ended at this flight's origin ...
_full_same_airport = F.col("ORIGIN_CITY_NAME") == F.col("priorflight_dest")
# ... and must have arrived no earlier than 26 hours before this departure.
_full_recent_arrival = F.col("priorflight_arrtime") >= F.col("dayprior_presentmoment")

result = (
    dx
    .withColumn("priorflight_origin", F.lag("ORIGIN_CITY_NAME").over(_full_window))
    .withColumn("priorflight_dest", F.lag("DEST_CITY_NAME").over(_full_window))
    .withColumn("priorflight_deptime", F.lag("dep_datetime").over(_full_window))
    .withColumn("priorflight_arrtime", F.lag("arr_datetime").over(_full_window))
    .withColumn(
        "presentmoment",
        (F.col("dep_datetime") - F.expr("INTERVAL 2 HOURS")).cast("timestamp"),
    )
    .withColumn(
        "dayprior_presentmoment",
        (F.col("dep_datetime") - F.expr("INTERVAL 26 HOURS")).cast("timestamp"),
    )
    .withColumn(
        "priorflight_deptime_final",
        F.when(
            (F.col("priorflight_deptime") <= F.col("presentmoment"))
            & _full_same_airport
            & _full_recent_arrival,
            F.col("priorflight_deptime"),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "priorflight_arrtime_final",
        F.when(
            (F.col("priorflight_arrtime") <= F.col("presentmoment"))
            & _full_same_airport
            & _full_recent_arrival,
            F.col("priorflight_arrtime"),
        ).otherwise(F.lit(None)),
    )
    .select(
        "tail_num",
        "ORIGIN_CITY_NAME",
        "DEST_CITY_NAME",
        "priorflight_origin",
        # "presentmoment", "dayprior_presentmoment"
        "dep_datetime",
        "arr_datetime",
        # "priorflight_deptime", "priorflight_arrtime",
        "priorflight_deptime_final",
        "priorflight_arrtime_final",
    )
)

# display(result)

# Spark transformations are lazy: without an action, the timer below would only
# measure query-plan construction. Writing to the "noop" sink runs the full
# plan (every selected column is consumed) but stores nothing, so the reported
# time reflects the real cost of the window computation over the whole dataset.
result.write.format("noop").mode("overwrite").save()

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")



In [0]:
## SPOT CHECK 3 MONTH RESULTS
# limit() keeps the sample as a Spark DataFrame, which display() renders as a
# table; head(20) would instead collect a Python list of Row objects onto the
# driver, which display() is not guaranteed to accept.
display(result.limit(20))

## Implementing

In [0]:
# Inactive alternative: the 'joined_1y_weather_cleaned_test' checkpoint.
# folder_path = "dbfs:/student-groups/Group_4_1"
# dataset = 'joined_1y_weather_cleaned_test'
# data1 = spark.read.parquet(f"{folder_path}/interim/join_checkpoints/{dataset}.parquet")

# Active input: the 'joined_1y_weather_cleaned_combo' join checkpoint.
data1 = spark.read.parquet(
    "dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_weather_cleaned_combo.parquet"
)
display(data1)

In [0]:
# List the column names available in data1.
data1.columns


In [0]:
## Notes

start_time = time.time()

def add_prior_cloumns(df):
    """Add features describing the previous flight of the same aircraft.

    The "prior flight" of a row is the row immediately before it when flights
    are partitioned by ``TAIL_NUM`` and ordered by ``sched_depart_utc``.

    A prior flight is treated as valid when its ``DEST`` equals this flight's
    ``ORIGIN`` and its scheduled departure is no earlier than
    ``two_hours_prior_depart_UTC`` minus 24 hours. The ``*_final`` columns also
    require the relevant prior-flight time to be at or before
    ``two_hours_prior_depart_UTC`` (the leakage cut-off); otherwise they are
    null.

    ``priorflight_depdelay_final`` (and therefore ``priorflight_isdelayed``)
    uses the same cut-off as ``priorflight_deptime_final``. Previously the
    delay of a prior flight scheduled after the cut-off was still exposed.

    Args:
        df: Spark DataFrame containing at least ``TAIL_NUM``,
            ``sched_depart_utc``, ``two_hours_prior_depart_UTC``, ``ORIGIN``,
            ``DEST``, ``DEP_DELAY``, ``DISTANCE``, ``CRS_ELAPSED_TIME``,
            ``AIR_TIME``, ``TAXI_IN`` and ``TAXI_OUT``.

    Returns:
        The input DataFrame with the prior-flight columns appended.
    """
    tail_window = Window.partitionBy("TAIL_NUM").orderBy("sched_depart_utc")
    cutoff = F.col("two_hours_prior_depart_UTC")

    # Prior leg ended at this flight's origin and is not older than the look-back limit.
    WhenConditions = (F.col("ORIGIN") == F.col("priorflight_dest")) & (
        F.col("priorflight_deptime") >= F.col("twentysix_hours_prior_depart_UTC")
    )
    # Same, plus: the prior leg's scheduled departure is at or before the cut-off.
    departed_by_cutoff = (F.col("priorflight_deptime") <= cutoff) & WhenConditions

    result_df = (
        df
        .withColumn("priorflight_origin", F.lag("ORIGIN").over(tail_window))
        .withColumn("priorflight_dest", F.lag("DEST").over(tail_window))
        # Look-back limit: 24 hours before the cut-off.
        .withColumn(
            "twentysix_hours_prior_depart_UTC",
            (cutoff - F.expr("INTERVAL 24 HOURS")).cast("timestamp"),
        )
        .withColumn("priorflight_deptime", F.lag("sched_depart_utc").over(tail_window))
        .withColumn("priorflight_depdelay", F.lag("DEP_DELAY").over(tail_window))
        .withColumn("priorflight_distance", F.lag("DISTANCE").over(tail_window))
        .withColumn("priorflight_crs_elapsed_time", F.lag("CRS_ELAPSED_TIME").over(tail_window))
        # Leakage guard: only expose the prior delay when the prior leg was
        # scheduled to depart at or before the cut-off.
        .withColumn(
            "priorflight_depdelay_final",
            F.when(departed_by_cutoff, F.col("priorflight_depdelay")).otherwise(F.lit(None)),
        )
        .withColumn(
            "priorflight_deptime_final",
            F.when(departed_by_cutoff, F.col("priorflight_deptime")).otherwise(F.lit(None)),
        )
        # 1 when the usable prior delay is >= 15, 0 when < 15, null when unknown.
        .withColumn(
            "priorflight_isdelayed",
            F.when(F.col("priorflight_depdelay_final") >= 15, 1)
            .when(F.col("priorflight_depdelay_final") < 15, 0)
            .otherwise(F.lit(None)),
        )
        .withColumn(
            "elapsed_time_calculated",
            (F.col("AIR_TIME") + F.col("TAXI_IN") + F.col("TAXI_OUT")).cast("int"),
        )
        # NOTE(review): this adds the elapsed minutes to the *scheduled*
        # departure and does not include DEP_DELAY -- confirm that is intended.
        # from_unixtime() yields a string, hence the cast to timestamp further down.
        .withColumn(
            "flight_arr_time_calc",
            F.expr("from_unixtime(unix_timestamp(`sched_depart_utc`) + (`elapsed_time_calculated` * 60))"),
        )
        .withColumn("priorflight_arr_time_calc", F.lag("flight_arr_time_calc").over(tail_window))
        .withColumn(
            "priorflight_arr_time_calc_final",
            F.when(
                (F.col("priorflight_arr_time_calc") <= cutoff) & WhenConditions,
                F.col("priorflight_arr_time_calc"),
            ).otherwise(F.lit(None)),
        )
        .withColumn(
            "priorflight_arrived",
            F.when(
                (F.col("priorflight_arr_time_calc_final") <= cutoff) & WhenConditions,
                1,
            ).otherwise(0),
        )
        # Fallback arrival estimate: usable prior departure + its CRS elapsed time.
        .withColumn(
            "priorflght_est_arr_time",
            F.expr("from_unixtime(unix_timestamp(`priorflight_deptime_final`) + (`priorflight_crs_elapsed_time` * 60))"),
        )
        # Prefer the calculated arrival; fall back to the estimate when only the departure is usable.
        .withColumn(
            "priorflght_est_arr_time_final",
            F.when(F.col("priorflight_arr_time_calc_final").isNotNull(), F.col("priorflight_arr_time_calc_final"))
            .when(F.col("priorflight_deptime_final").isNotNull(), F.col("priorflght_est_arr_time"))
            .otherwise(F.lit(None)),
        )
        .withColumn(
            "est_tail_turnaround_window_sec",
            F.col("sched_depart_utc").cast("long")
            - F.col("priorflght_est_arr_time_final").cast("timestamp").cast("long"),
        )
        .withColumn(
            "est_tail_turnaround_window_min",
            F.round(F.col("est_tail_turnaround_window_sec") / 60),
        )
    )

    return result_df

result = add_prior_cloumns(data1)


# NOTE: Spark transformations are lazy, so this interval only covers building
# the query plan; nothing has been executed yet at this point.
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")



In [0]:
## FORMAT
start_time = time.time()

def add_prior_cloumns(df):
    """Add features describing the previous flight of the same aircraft.

    The "prior flight" of a row is the row immediately before it when flights
    are partitioned by ``TAIL_NUM`` and ordered by ``sched_depart_utc``.

    A prior flight is treated as valid when its ``DEST`` equals this flight's
    ``ORIGIN`` and its scheduled departure is no earlier than
    ``two_hours_prior_depart_UTC`` minus 24 hours. The ``*_final`` columns also
    require the relevant prior-flight time to be at or before
    ``two_hours_prior_depart_UTC`` (the leakage cut-off); otherwise they are
    null.

    ``priorflight_depdelay_final`` (and therefore ``priorflight_isdelayed``)
    uses the same cut-off as ``priorflight_deptime_final``. Previously the
    delay of a prior flight scheduled after the cut-off was still exposed.

    Args:
        df: Spark DataFrame containing at least ``TAIL_NUM``,
            ``sched_depart_utc``, ``two_hours_prior_depart_UTC``, ``ORIGIN``,
            ``DEST``, ``DEP_DELAY``, ``DISTANCE``, ``CRS_ELAPSED_TIME``,
            ``AIR_TIME``, ``TAXI_IN`` and ``TAXI_OUT``.

    Returns:
        The input DataFrame with the prior-flight columns appended.
    """
    tail_window = Window.partitionBy("TAIL_NUM").orderBy("sched_depart_utc")
    cutoff = F.col("two_hours_prior_depart_UTC")

    # Prior leg ended at this flight's origin and is not older than the look-back limit.
    WhenConditions = (F.col("ORIGIN") == F.col("priorflight_dest")) & (
        F.col("priorflight_deptime") >= F.col("twentysix_hours_prior_depart_UTC")
    )
    # Same, plus: the prior leg's scheduled departure is at or before the cut-off.
    departed_by_cutoff = (F.col("priorflight_deptime") <= cutoff) & WhenConditions

    result_df = (
        df
        # Airplane origin at t-1.
        .withColumn("priorflight_origin", F.lag("ORIGIN").over(tail_window))
        # Airplane destination for the t-1 -> t transition.
        .withColumn("priorflight_dest", F.lag("DEST").over(tail_window))
        # Constraint for "not too long ago": 24 hours before the cut-off.
        .withColumn(
            "twentysix_hours_prior_depart_UTC",
            (cutoff - F.expr("INTERVAL 24 HOURS")).cast("timestamp"),
        )
        # Scheduled departure (UTC) of the t-1 leg.
        .withColumn("priorflight_deptime", F.lag("sched_depart_utc").over(tail_window))
        .withColumn("priorflight_depdelay", F.lag("DEP_DELAY").over(tail_window))
        .withColumn("priorflight_distance", F.lag("DISTANCE").over(tail_window))
        .withColumn("priorflight_crs_elapsed_time", F.lag("CRS_ELAPSED_TIME").over(tail_window))
        # Leakage guard: only expose the prior delay when the prior leg was
        # scheduled to depart at or before the cut-off.
        .withColumn(
            "priorflight_depdelay_final",
            F.when(departed_by_cutoff, F.col("priorflight_depdelay")).otherwise(F.lit(None)),
        )
        # Look-back limit <= prior departure <= cut-off, and origin == prior dest; else null.
        .withColumn(
            "priorflight_deptime_final",
            F.when(departed_by_cutoff, F.col("priorflight_deptime")).otherwise(F.lit(None)),
        )
        # 1 when the usable prior delay is >= 15, 0 when < 15, null when unknown.
        .withColumn(
            "priorflight_isdelayed",
            F.when(F.col("priorflight_depdelay_final") >= 15, 1)
            .when(F.col("priorflight_depdelay_final") < 15, 0)
            .otherwise(F.lit(None)),
        )
        # Built from realised values (AIR_TIME, TAXI_IN, TAXI_OUT) of the flight itself.
        .withColumn(
            "elapsed_time_calculated",
            (F.col("AIR_TIME") + F.col("TAXI_IN") + F.col("TAXI_OUT")).cast("int"),
        )
        # NOTE(review): this adds the elapsed minutes to the *scheduled*
        # departure and does not include DEP_DELAY -- confirm that is intended.
        # from_unixtime() yields a string, hence the cast to timestamp further down.
        .withColumn(
            "flight_arr_time_calc",
            F.expr("from_unixtime(unix_timestamp(`sched_depart_utc`) + (`elapsed_time_calculated` * 60))"),
        )
        .withColumn("priorflight_arr_time_calc", F.lag("flight_arr_time_calc").over(tail_window))
        # Look-back limit <= prior departure, calculated arrival <= cut-off, origin == prior dest; else null.
        .withColumn(
            "priorflight_arr_time_calc_final",
            F.when(
                (F.col("priorflight_arr_time_calc") <= cutoff) & WhenConditions,
                F.col("priorflight_arr_time_calc"),
            ).otherwise(F.lit(None)),
        )
        .withColumn(
            "priorflight_arrived",
            F.when(
                (F.col("priorflight_arr_time_calc_final") <= cutoff) & WhenConditions,
                1,
            ).otherwise(0),
        )
        # Fallback arrival estimate: usable prior departure + its CRS elapsed time.
        .withColumn(
            "priorflght_est_arr_time",
            F.expr("from_unixtime(unix_timestamp(`priorflight_deptime_final`) + (`priorflight_crs_elapsed_time` * 60))"),
        )
        # Prefer the calculated arrival; fall back to the estimate when only the departure is usable.
        .withColumn(
            "priorflght_est_arr_time_final",
            F.when(F.col("priorflight_arr_time_calc_final").isNotNull(), F.col("priorflight_arr_time_calc_final"))
            .when(F.col("priorflight_deptime_final").isNotNull(), F.col("priorflght_est_arr_time"))
            .otherwise(F.lit(None)),
        )
        .withColumn(
            "est_tail_turnaround_window_sec",
            F.col("sched_depart_utc").cast("long")
            - F.col("priorflght_est_arr_time_final").cast("timestamp").cast("long"),
        )
        .withColumn(
            "est_tail_turnaround_window_min",
            F.round(F.col("est_tail_turnaround_window_sec") / 60),
        )
    )

    return result_df

result = add_prior_cloumns(data1)


# NOTE: Spark transformations are lazy, so this interval only covers building
# the query plan; nothing has been executed yet at this point.
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")



In [0]:
# Spot check: every row for tail number 250NV, ordered by flight date.
display(
    result.filter(F.col("TAIL_NUM") == "250NV")
          .orderBy(F.col("FL_DATE"))
)

In [0]:
# Spot check: every row for tail number N984TW, ordered by flight date.
display(
    result.filter(F.col("TAIL_NUM") == "N984TW")
          .orderBy(F.col("FL_DATE"))
)

### Checkpoint results

In [0]:
# Row count of the feature-augmented DataFrame before it is written out.
result.count()

In [0]:
# Destination of the checkpoint that holds the prior-flight columns.
output_path = "dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_weather_cleaned_combo_pfd.parquet"

# Replace any existing checkpoint at that location.
result.write.mode("overwrite").parquet(output_path)

In [0]:
# Read the checkpoint back to confirm it was written.
test = spark.read.parquet(
    "dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_weather_cleaned_combo_pfd.parquet"
)

In [0]:
# Row count of the re-loaded checkpoint; compare with result.count() above.
test.count()