# Setup

[Project Plan](https://docs.google.com/spreadsheets/d/1E4A3SaTAEjh9owH4SBUMv987bktwrW4Q6TXCZ5LJ6Xg/edit?gid=1115838130#gid=1115838130)

[Very Useful Resources™](https://bcourses.berkeley.edu/courses/1540263/pages/additional-very-useful-resources?module_item_id=17267995)

[FP Module](https://bcourses.berkeley.edu/courses/1540263/pages/mids-261-final-project-dataset-and-cluster?module_item_id=17267994)

[FP Phase II Assignment](https://bcourses.berkeley.edu/courses/1540263/assignments/8834546)

In [0]:
from pyspark.sql.functions import col
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pyspark.sql.functions import col,isnan, when, count, concat_ws, countDistinct, collect_set

# Banner printed once the import cell has run.
print(
    "Your very own PHASE 2 legend is about to unfold! "
    "A world of dreams and adventures with PHASE 2 awaits! \n"
    "Let's go!"
)

In [0]:
# Base DBFS directory whose contents are listed below.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"

# Note the other possible samples we can use, like 1 day.
display(dbutils.fs.ls(data_BASE_DIR))

In [0]:
# Quarter 1 OTPW DataFrame; the first CSV row supplies the column names.
qdf_otpw = spark.read.csv("dbfs:/mnt/mids-w261/OTPW_3M_2015.csv", header=True)


In [0]:
# Year 1 OTPW DataFrame, loaded from a directory of CSV files with a header row.
ydf_otpw = spark.read.csv("dbfs:/mnt/mids-w261/OTPW_12M/OTPW_12M/", header=True)



In [0]:
# Row count of the 12-month frame; count() is a Spark action, so this runs a job.
ydf_otpw.count()

In [0]:
# For rows missing HourlyPrecipitation, flag whether BackupName is missing too.
# NOTE(review): this only builds the DataFrame; no action is called here.
(
    qdf_otpw
    .where(F.col('HourlyPrecipitation').isNull())
    .select(F.col('BackupName').isNull())
)

In [0]:
# Unique values of origin_type.
qdf_otpw.select('origin_type').dropDuplicates().show()

In [0]:
# Unique values of origin_region.
display(qdf_otpw.select('origin_region').dropDuplicates())

# Basic EDA

In [0]:
# Per-carrier means of departure delay, arrival delay and distance.
display(
    qdf_otpw
    .groupBy('OP_UNIQUE_CARRIER')
    .agg(*[
        F.avg(source_col).alias(f'Avg_{source_col}')
        for source_col in ('DEP_DELAY', 'ARR_DELAY', 'DISTANCE')
    ])
)

In [0]:
# Mean departure delay per flight date, in date order.
display(
    qdf_otpw
    .groupBy('FL_DATE')
    .agg(F.avg('DEP_DELAY').alias('Avg_DEP_DELAY'))
    .sort('FL_DATE')
)

In [0]:
# Preview DEP_HOUR (DEP_TIME divided by 100, truncated to int) next to DEP_TIME.
display(
    qdf_otpw
    .select(
        (F.col("DEP_TIME") / 100).cast("int").alias("DEP_HOUR"),
        'DEP_TIME',
    )
    .limit(10)
)


In [0]:
# Daily mean departure delay, collected to pandas with FL_DATE parsed to datetime.
qdf_delays = (
    qdf_otpw
    .groupBy('FL_DATE')
    .agg(F.avg('DEP_DELAY').alias('Avg_DEP_DELAY'))
    .sort('FL_DATE')
    .toPandas()
    .assign(FL_DATE=lambda frame: pd.to_datetime(frame['FL_DATE']))
)

# Line plot with a point marker for each day.
plt.figure(figsize=(12, 6))
plt.plot(qdf_delays['FL_DATE'], qdf_delays['Avg_DEP_DELAY'], 'o-')
plt.title('Time Series of Average Departure Delays')
plt.xlabel('Date')
plt.ylabel('Average Departure Delay (minutes)')
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.grid(True)

plt.show()

# Bird's-eye view

In [0]:
from pyspark.sql.functions import col

# Template for box plots: compare a weather feature between delayed and
# on-time departures to see what might be predictive of delay.

# qdf_otpw is read from CSV with only the header option, so its columns are
# strings. Cast the wind speed to a number so the box plot can compute
# quartiles; values that do not parse become null and are dropped together
# with the rows that were already null.
wind_df = (
    qdf_otpw
    .withColumn("HourlyWindSpeed", F.col("HourlyWindSpeed").cast("double"))
    .filter(F.col("HourlyWindSpeed").isNotNull())
    .withColumn("DEP_DEL15", F.col("DEP_DEL15").cast("string"))
    .select("HourlyWindSpeed", "DEP_DEL15")
    .toPandas()
)

# DataFrame.boxplot(by=...) opens its own figure unless it is handed an Axes,
# which would leave a separately created figure blank and ignore its figsize.
# Create the Axes first and pass it in.
fig, ax = plt.subplots(figsize=(12, 6))
wind_df.boxplot(column="HourlyWindSpeed", by="DEP_DEL15", ax=ax)

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Departure Delay (Binary: 0 = No Delay, 1 = Delayed)")
ax.set_ylabel("Hourly Wind Speed")
ax.set_title("Boxplot of Hourly Wind Speed by Departure Delay")
fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." super-title
plt.show()

# Maybe need to take wind direction relative to the route into account?
# Why are there 50 mph records that are not delayed (or cancelled, since
# DEP_DEL15 is null for cancelled flights)?

- Feature ideas: lagged delay times / expected arrival times for each record based on aircraft tracking; lagged weather at the origin; etc.

## Nulls exploration

In [0]:
# One aggregate per column: how many rows are NaN or null in that column.
null_or_nan_exprs = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in qdf_otpw.columns
]

# Collect the single result row, transpose so each column becomes a row
# (counts land in pandas column 0), and list the largest counts first.
qdf_nulls = (
    qdf_otpw
    .select(null_or_nan_exprs)
    .toPandas()
    .T
    .sort_values(by=0, ascending=False)
)

In [0]:
# First 50 columns (largest counts) among those with at least one null/NaN.
qdf_nulls.loc[qdf_nulls[0] > 0].head(50)

In [0]:
# Last 50 columns (smallest counts) among those with at least one null/NaN.
qdf_nulls.loc[qdf_nulls[0] > 0].tail(50)

### Hourly Cols

In [0]:
# For rows with null values in columns of interest such as the hourly dew
# point temperature: are there other non-null rows for the same airport and
# time that we could use / match to fill them in?
display(
    qdf_otpw
    .withColumn("DEP_HOUR", (F.col("DEP_TIME") / 100).cast("int"))
    .groupBy('FL_DATE', 'DEP_HOUR', 'origin_station_name')
    .agg(
        F.count(F.when(F.col('HourlyDewPointTemperature').isNull(), 1)).alias('Null_Count'),
        # count(column) counts only the non-null values of that column
        F.count('HourlyDewPointTemperature').alias('Non_Null_Count'),
    )
    .orderBy('FL_DATE', 'origin_station_name', 'DEP_HOUR')
    .where(F.col('Null_Count') > 0)
)

# Most of the time it seems like yes.
# But why is just 1 row null and the others non-null for these; is it because
# of flight characteristics?
# Or row 620 - Detroit - 10 null, 27 non-null at 8 am on Jan 20.
# Are these the same rows where other hourly columns are null? I.e. if the dew
# point is null, are the other hourly weather values null as well?


In [0]:

# Try to find patterns for when some columns are null: restrict to rows where
# HourlyPrecipitation is null and count the nulls in every other column.
agg_expressions = [
    count(when(col(other_col).isNull(), 1)).alias(other_col)
    for other_col in qdf_otpw.columns
    if other_col != "HourlyPrecipitation"
]

null_counts = (
    qdf_otpw
    .where(col("HourlyPrecipitation").isNull())
    .agg(*agg_expressions)
)

display(null_counts)

# Note: MonthlyTotalLiquidPrecipitation only has 159028 missing records when
# HourlyPrecipitation is null, a small portion of the 1401363 records where
# MonthlyTotalLiquidPrecipitation is missing overall.


In [0]:
# Confirm the columns are all null on the same records: restrict to rows where
# MonthlyTotalLiquidPrecipitation is null and count nulls in every other column.
agg_expressions = [
    count(when(col(other_col).isNull(), 1)).alias(other_col)
    for other_col in qdf_otpw.columns
    if other_col != "MonthlyTotalLiquidPrecipitation"
]

display(
    qdf_otpw
    .where(col("MonthlyTotalLiquidPrecipitation").isNull())
    .agg(*agg_expressions)
)


In [0]:
# Which report types do the rows with missing HourlyPrecipitation come from?
display(
    qdf_otpw
    .where(F.col("HourlyPrecipitation").isNull())
    .groupBy('REPORT_TYPE')
    .count()  # adds a column named 'count' with the rows per group
    .sort(F.col('count').desc())
)

#   FM-16 = SPECI Aviation selected special weather report
#   FM-12 = SYNOP Report of surface observation from a fixed land station

In [0]:
# Share of records with a null HourlyPrecipitation, per origin station.

# All records per station.
total_counts = qdf_otpw.groupBy('origin_station_name').agg(F.count('*').alias('total_count'))

# Records per station where HourlyPrecipitation is null.
null_counts = (
    qdf_otpw
    .filter(F.col("HourlyPrecipitation").isNull())
    .groupBy('origin_station_name')
    .agg(F.count('*').alias('null_count'))
)

# The inner join keeps only stations that have at least one null record.
# Station distance and type are attached for context. The sort is applied
# last because a join does not guarantee that the row order of its inputs
# is preserved, so sorting before the final join could be lost.
percent_nulls = (
    null_counts
    .join(total_counts, on="origin_station_name")
    .withColumn("percentage_null", (F.col("null_count") / F.col("total_count")) * 100)
    .join(
        qdf_otpw.select('origin_station_name', 'origin_station_dis', 'origin_type').distinct(),
        on="origin_station_name",
    )
    .orderBy(F.desc("percentage_null"))
)

display(percent_nulls)

# Some stations have entirely null hourly records :( which suggests that some
# stations are not reporting hourly weather data?
# But it does not seem to be due to the distance to the station.


In [0]:

# Among rows missing HourlyPrecipitation, count how many have a null vs. a
# non-null BackupName.
display(
    qdf_otpw
    .where(col('HourlyPrecipitation').isNull())
    .agg(
        count(when(col('BackupName').isNull(), 1)).alias('Null Count'),
        # count(column) counts only the non-null values of that column
        count(col('BackupName')).alias('Valid Count'),
    )
)



In [0]:
#how many missing records per day per airport, is there at least 1 per hour? maybe some null to avoid duplicates?