In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline

In [0]:
team_BASE_DIR = f"dbfs:/student-groups/Group_4_1"


In [0]:
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")
df_full=spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_1y_cleaned_engineered.parquet")


In [0]:
# data time period (used to build the seasonality model file names)
period = "1y" # one of the following values ("", "3m", "6m", "1y")

# number of cross-validation folds
k = 5
# overlap value; only used here as part of the CV seasonality model file name
overlap = 0.2


# compute seasonality?
# (False if the seasonality models for this CV split setup have already been saved out;
#  set to False here because they were already computed by a teammate)
compute_seasonality = False

# define train/test split date: rows before this date are train, rows on/after it are test
min_test_dt = "2019-10-01"

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

In [0]:
# Label is 1.0 when the flight departed at least 15 minutes late or was cancelled, else 0.0.
is_delayed_or_cancelled = (col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1)
outcome_label = F.when(is_delayed_or_cancelled, 1).otherwise(0).cast("double")

# Add the hour of the departure timestamp and the modeling label in one pass.
df_full = df_full.withColumns(
    {
        "dep_hour_utc": F.hour(col(dep_utc_varname)),
        "outcome": outcome_label,
    }
)

In [0]:
# split into train and test

# rows whose dep_utc_varname value is before min_test_dt form the training set
df_train = df_full.filter(F.col(dep_utc_varname) < min_test_dt)
df_train.cache()
# rows whose dep_utc_varname value is on or after min_test_dt form the test set
df_test = df_full.filter(F.col(dep_utc_varname) >= min_test_dt)
df_test.cache()

In [0]:
def get_seasonality_data(df, fold, k):
  """
  Attach the columns of a saved seasonality model to ``df``.

  The model parquet is read from ``{team_BASE_DIR}/interim/``. When ``fold``
  is 'full' the model saved for the whole training period is used; otherwise
  the file for CV fold ``fold`` of ``k`` (with the notebook-level ``overlap``)
  is used.

  The model is left-joined on ORIGIN, DAY_OF_WEEK == dow and
  dep_hour_utc == hour, so rows of ``df`` without a match are kept. The
  model's own ORIGIN column is dropped from the result.
  """
  fn_model = (
      f"seasonality_model_{period}_train.parquet"
      if fold == 'full'
      else f"seasonality_model_{period}_cv{fold}of{k}_overlap{overlap}.parquet"
  )
  seasonality = spark.read.parquet(f"{team_BASE_DIR}/interim/{fn_model}")

  same_origin = df["ORIGIN"] == seasonality["ORIGIN"]
  same_dow = df["DAY_OF_WEEK"] == seasonality["dow"]
  same_hour = df["dep_hour_utc"] == seasonality["hour"]

  matched = df.join(seasonality, same_origin & same_dow & same_hour, how="left")
  return matched.drop(seasonality["ORIGIN"])


# CODE BELOW DERIVED FROM DEMO 11 NOTEBOOK
import pyspark.sql.functions as f


def upsample(train_df,verbose=False):
  '''Upsamples the outcome == 1 rows of train_df to balance classes.

  Rows with outcome == 1 are sampled with replacement using the fraction
  (count of outcome == 0) / (count of outcome == 1), then unioned with all
  outcome == 0 rows.

  Args:
    train_df: Spark DataFrame with an "outcome" column holding 0 / 1.
    verbose: currently unused; kept so existing callers keep working.

  Returns:
    The resampled DataFrame (outcome == 0 rows first, then the resampled
    outcome == 1 rows). If train_df has no outcome == 1 rows there is
    nothing to resample and train_df is returned unchanged.
  '''
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  # Without any positive rows the ratio below would divide by zero.
  if delay_count == 0:
    return train_df

  resample_fraction = non_delay_count / delay_count

  train_non_delay = train_df.filter(f.col('outcome') == 0)
  train_delay_resampled = train_df.filter(f.col('outcome') == 1).sample(
      withReplacement=True, fraction=resample_fraction, seed=42)
  return train_non_delay.union(train_delay_resampled)


def downsample(train_df,verbose=False):
  '''Downsamples the outcome == 0 rows of train_df to balance classes.

  Rows with outcome == 0 are sampled without replacement using the fraction
  (count of outcome == 1) / (count of outcome == 0), then unioned with all
  outcome == 1 rows.

  Args:
    train_df: Spark DataFrame with an "outcome" column holding 0 / 1.
    verbose: currently unused; kept so existing callers keep working.

  Returns:
    The resampled DataFrame (outcome == 1 rows first, then the sampled
    outcome == 0 rows). If train_df has no outcome == 0 rows there is
    nothing to downsample and train_df is returned unchanged.
  '''
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  # Without any negative rows the ratio below would divide by zero.
  if non_delay_count == 0:
    return train_df

  # Sampling without replacement only accepts fractions in [0, 1]; cap the
  # ratio so a train set where outcome == 1 is the majority keeps every
  # outcome == 0 row instead of failing.
  keep_fraction = min(1.0, delay_count / non_delay_count)

  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0).sample(
      withReplacement=False, fraction=keep_fraction, seed=42)
  return train_delay.union(train_non_delay)

def cv_eval(preds):
  """
  Score a transformed prediction DataFrame.

  Input: DataFrame with 'prediction', 'outcome' and 'probability' columns
  Output: tuple (F2, pr) where F2 is the beta=2 F-measure for label 1.0
          rounded to 4 decimals and pr is the area under the PR curve
  """
  # (prediction, outcome) rows feed the multiclass metrics
  prediction_and_label = preds.select(['prediction', 'outcome']).rdd
  # (probability of class 1, outcome) pairs feed the binary metrics
  score_and_label = preds.select('outcome','probability').rdd.map(
      lambda row: (float(row['probability'][1]), float(row['outcome'])))

  multiclass = MulticlassMetrics(prediction_and_label)
  binary = BinaryClassificationMetrics(score_and_label)

  f2_score = np.round(multiclass.fMeasure(label=1.0, beta=2.0), 4)
  area_under_pr = binary.areaUnderPR
  return f2_score, area_under_pr

def timeSeriesSplitCV(df, pipeline, cv_info, sampling=None, metric='f2', verbose=True, dep_utc_varname=dep_utc_varname):
  '''
  Perform time series split k-fold cross validation.

  For fold i, rows of df whose dep_utc_varname value lies in
  [cv_info["train_min"][i], cv_info["train_max"][i]) form the train set and
  rows in [cv_info["test_min"][i], cv_info["test_max"][i]) form the dev set.
  The train set is optionally resampled, both sets get the fold's seasonality
  columns (missing 'daily' / 'weekly' values filled with 0), the pipeline is
  fit on train and scored on dev with cv_eval.

  Args:
    df: Spark DataFrame holding all folds.
    pipeline: object with fit(train_df) returning a model with transform().
    cv_info: fold boundaries; len(cv_info) is used as the number of folds
      (presumably a pandas DataFrame with one row per fold -- TODO confirm).
    sampling: 'down', 'up' or None (no resampling of the train set).
    metric: 'f2' or 'pr'; selects which cv_eval score is tracked.
    verbose: print date range and row count of each fold's train/dev set.
    dep_utc_varname: name of the departure timestamp column.

  Returns:
    The average of the per-fold scores.

  Raises:
    ValueError: if metric is neither 'f2' nor 'pr'.
  '''
  # Fail fast: an unknown metric would otherwise only surface after a full fit.
  if metric not in ('f2', 'pr'):
    raise ValueError(f"metric must be 'f2' or 'pr', got {metric!r}")
  # cv_eval returns (F2, pr); pick the position of the requested score
  metric_idx = 0 if metric == 'f2' else 1

  k = len(cv_info)

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):

    # DataFrames cached for this fold; released once the fold is scored
    cached_dfs = []

    # Create train set
    train_df = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    cached_dfs.append(train_df)

    # Create dev set
    dev_df = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()
    cached_dfs.append(dev_df)

    # Apply sampling on train if selected
    if sampling=='down':
      train_df = downsample(train_df).cache()
      cached_dfs.append(train_df)
    elif sampling=='up':
      train_df = upsample(train_df).cache()
      cached_dfs.append(train_df)

    #print info on train and dev set for this fold
    if verbose:
      print('    TRAIN set for fold {} goes from {} to {}, count is {:,} flights ({})'.format((i+1),
                                                                                      train_df.agg({dep_utc_varname:'min'}).collect()[0][0],
                                                                                      train_df.agg({dep_utc_varname:'max'}).collect()[0][0],
                                                                                      train_df.count(),
                                                                                      sampling + '-sampled' if sampling else 'no sampling'))
      print('    DEV set for fold {} goes from {} to {}, count is {:,} flights'.format((i+1),
                                                                                      dev_df.agg({dep_utc_varname:'min'}).collect()[0][0],
                                                                                      dev_df.agg({dep_utc_varname:'max'}).collect()[0][0],
                                                                                      dev_df.count()))

    # TODO: remove once feat engineering applied outside
    seasonal_fill = {name: 0 for name in ['daily','weekly']}
    train_df = get_seasonality_data(train_df, i, k).fillna(seasonal_fill)
    dev_df = get_seasonality_data(dev_df, i, k).fillna(seasonal_fill)

    # Fit params on the model
    model = pipeline.fit(train_df)
    dev_pred = model.transform(dev_df)
    score = cv_eval(dev_pred)[metric_idx]
    scores.append(score)
    print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}')
    print('------------------------------------------------------------')

    # All actions for this fold are done; free the cached data before the next fold
    for cached_df in cached_dfs:
      cached_df.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  return avg_score

In [0]:

df_train_downsampled = downsample(df_train).cache()
df_train_seasonal = get_seasonality_data(df_train_downsampled, 'full', k).cache()
df_test_seasonal = get_seasonality_data(df_test, 'full', k).cache()


In [0]:
import mlflow


# Baseline evaluation

In [0]:

base_run = 'runs:/5b1caaedee434f1fa84c4b0723099f5e/model'
base_model = mlflow.spark.load_model(base_run)
base_pred = base_model.transform(df_test_seasonal)
# get f2 score
base_score = cv_eval(base_pred)[0]
print(base_score) #sanity check

In [0]:
display(base_pred)

## False negatives

In [0]:
#false negs
base_pred.filter((F.col('outcome')==1) &(F.col('prediction')==0)).groupBy('CANCELLED').count().show()

In [0]:

base_pred.filter((F.col('outcome')==1) &(F.col('prediction')==0)).groupBy('priorflight_cancelled_true').count().show()

In [0]:
import geohash
from geohash import bbox
import geopandas as gpd

In [0]:
import seaborn as sns

In [0]:


# Count test rows per (outcome, prediction) cell and tag the two error types.
is_false_negative = (F.col('outcome') == 1) & (F.col('prediction') == 0)
is_false_positive = (F.col('outcome') == 0) & (F.col('prediction') == 1)
error_label = (
    F.when(is_false_negative, 'False Negative')
     .when(is_false_positive, 'False Positive')
     .otherwise('Other')
)
confusion_df = (
    base_pred.groupBy('outcome', 'prediction')
    .count()
    .orderBy('outcome', 'prediction')
    .withColumn('label', error_label)
)
confusion_pd = confusion_df.toPandas()

# Reshape to outcome x prediction and draw the counts as a heatmap.
confusion_matrix = confusion_pd.pivot(index='outcome', columns='prediction', values='count').fillna(0)
sns.heatmap(confusion_matrix, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.xlabel('Prediction')
plt.ylabel('Outcome')
plt.show()

In [0]:
confusion_df = base_pred.groupBy('outcome', 'prediction').count().orderBy('outcome', 'prediction').toPandas()
confusion_df


## By location

In [0]:
import plotly.express as px
import plotly.graph_objects as go
import geohash
import geopandas as gpd
from shapely.geometry import Polygon
import json

In [0]:

# Tag each baseline prediction with its confusion-matrix cell.
# Rows matching none of the four conditions get null (there is no otherwise()).
actual_pos, actual_neg = F.col("outcome") == 1, F.col("outcome") == 0
pred_pos, pred_neg = F.col("prediction") == 1, F.col("prediction") == 0

base_pred = base_pred.withColumn(
    "classification",
    F.when(actual_pos & pred_pos, "true_positive")
     .when(actual_neg & pred_neg, "true_negative")
     .when(actual_neg & pred_pos, "false_positive")
     .when(actual_pos & pred_neg, "false_negative"),
)

In [0]:
base_pred.groupBy('classification').count().show()

In [0]:
base_pred.groupBy('CANCELLED', 'classification').count().show()

In [0]:
# One row per (CANCELLED, classification) pair with its row count.
confusion_metrics_by_cancellation = base_pred.groupBy('CANCELLED', 'classification').count()

# Spread the classification values into columns, one row per CANCELLED value.
pivot_confusion_metrics = (
    confusion_metrics_by_cancellation
    .groupBy('CANCELLED')
    .pivot('classification')
    .sum('count')
)

# Sort descending, false positives first, then the remaining cells as tie-breakers.
sort_priority = ['false_positive', 'false_negative', 'true_positive', 'true_negative']
highest_confusion_metrics_by_cancellaton = pivot_confusion_metrics.orderBy(
    *[F.col(name).desc() for name in sort_priority]
)

display(highest_confusion_metrics_by_cancellaton)

In [0]:
# One row per (ORIGIN, classification) pair with its row count for the baseline model.
confusion_metrics_by_origin = base_pred.groupBy('ORIGIN', 'classification').count()

# Spread the classification values into columns, one row per ORIGIN.
pivot_confusion_metrics = (
    confusion_metrics_by_origin
    .groupBy('ORIGIN')
    .pivot('classification')
    .sum('count')
)

# Sort descending, false positives first, then the remaining cells as tie-breakers.
sort_priority = ['false_positive', 'false_negative', 'true_positive', 'true_negative']
highest_confusion_metrics_by_origin = pivot_confusion_metrics.orderBy(
    *[F.col(name).desc() for name in sort_priority]
)

display(highest_confusion_metrics_by_origin)

In [0]:
ghdf=base_pred.filter((F.col('outcome')==1) &(F.col('prediction')==0)).groupBy('geohash').count().orderBy(F.col('count').desc()).toPandas()
ghdf.dropna(subset=['geohash'], inplace=True)

In [0]:
def geohash_to_polygon(gh):
    """Return a closed rectangular Polygon built from the bounding box of geohash ``gh``."""
    bounds = geohash.bbox(gh)
    west, east = bounds['w'], bounds['e']
    south, north = bounds['s'], bounds['n']
    # Ring order: SW -> SE -> NE -> NW -> back to SW to close the polygon
    ring = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),
    ]
    return Polygon(ring)

# Convert your DataFrame into a GeoDataFrame with polygons
ghdf["geometry"] = ghdf["geohash"].apply(geohash_to_polygon)
gdf = gpd.GeoDataFrame(ghdf, geometry="geometry")
gdf.set_crs(epsg=4326, inplace=True)

In [0]:
geojson = json.loads(gdf.set_index('geohash').to_json())  # Set geohash as index first

In [0]:
# Colour scale spans the observed per-geohash counts.
fn_count_range = (gdf['count'].min(), gdf['count'].max())

# One shaded geohash cell per row; `locations` values are matched to the GeoJSON feature ids.
fig = px.choropleth_mapbox(
    ghdf,
    geojson=geojson,
    locations='geohash',
    color='count',
    color_continuous_scale="YlOrRd",
    range_color=fn_count_range,
    mapbox_style="open-street-map",
    zoom=3,
    center={"lat": 37.6, "lon": -95.6},  # roughly the centre of the US
    opacity=0.3,
    labels={'count': '# False Negatives'},
)

# Hide cell outlines
fig.update_traces(marker_line_width=0)

# Layout: margins and colour bar
colorbar_style = dict(title='Count of FN', thickness=20, len=0.5)
fig.update_layout(
    margin=dict(r=20, t=40, l=20, b=20),
    coloraxis_colorbar=colorbar_style,
)

fig.show()

In [0]:
confusion_pd

# Added features evaluation

In [0]:
feats_run = 'runs:/b734b75c14d44c05a96782540f3a4868/model'

feats_model = mlflow.spark.load_model(feats_run)
feats_pred = feats_model.transform(df_test_seasonal) #df for analysis
# get f2 score
feats_score = cv_eval(feats_pred)[0]
print(feats_score) #sanity check

In [0]:

# Tag each added-features prediction with its confusion-matrix cell.
# Rows matching none of the four conditions get null (there is no otherwise()).
actual_pos, actual_neg = F.col("outcome") == 1, F.col("outcome") == 0
pred_pos, pred_neg = F.col("prediction") == 1, F.col("prediction") == 0

feats_pred = feats_pred.withColumn(
    "classification",
    F.when(actual_pos & pred_pos, "true_positive")
     .when(actual_neg & pred_neg, "true_negative")
     .when(actual_neg & pred_pos, "false_positive")
     .when(actual_pos & pred_neg, "false_negative"),
)

In [0]:
# Keep only the join keys plus each model's prediction columns. feats_pred and
# base_pred are both transforms of df_test_seasonal, so joining them whole would
# duplicate every other column name and make later references to those columns
# (e.g. F.col('ORIGIN')) ambiguous and the parquet write fail on duplicate names.
# The flight-level columns are joined back from df_test_seasonal further down.
join_keys = ['TAIL_NUM', 'sched_depart_utc']

feats_pred_renamed = feats_pred.select(
    *join_keys,
    F.col('prediction').alias('AF_prediction'),
    F.col('classification').alias('AF_classification'),
)

base_pred_renamed = base_pred.select(
    *join_keys,
    F.col('prediction').alias('BL_prediction'),
    F.col('classification').alias('BL_classification'),
)

result = feats_pred_renamed.join(base_pred_renamed, join_keys)

display(result)

In [0]:
comparison = result.groupBy("BL_classification", "AF_classification").count()
display(comparison)

In [0]:
feats_pred.groupBy('classification').count().show()

In [0]:
# One row per (ORIGIN, classification) pair with its row count for the added-features model.
confusion_metrics_by_origin = feats_pred.groupBy('ORIGIN', 'classification').count()

# Spread the classification values into columns, one row per ORIGIN.
pivot_confusion_metrics = (
    confusion_metrics_by_origin
    .groupBy('ORIGIN')
    .pivot('classification')
    .sum('count')
)

# Sort descending, false positives first, then the remaining cells as tie-breakers.
sort_priority = ['false_positive', 'false_negative', 'true_positive', 'true_negative']
highest_confusion_metrics_by_origin = pivot_confusion_metrics.orderBy(
    *[F.col(name).desc() for name in sort_priority]
)

display(highest_confusion_metrics_by_origin)

# Added interactions evaluation

In [0]:
interacts_run = 'runs:/7da4c9d4d4cd4c418c56268729962bda/model'

# Load model
interacts_model = mlflow.spark.load_model(interacts_run)

# Perform inference via model.transform()
interacts_pred = interacts_model.transform(df_test_seasonal)

interacts_score = cv_eval(interacts_pred)[0]
print(interacts_score) #sanity check

In [0]:

# Tag each added-interactions prediction with its confusion-matrix cell.
# Rows matching none of the four conditions get null (there is no otherwise()).
actual_pos, actual_neg = F.col("outcome") == 1, F.col("outcome") == 0
pred_pos, pred_neg = F.col("prediction") == 1, F.col("prediction") == 0

interacts_pred = interacts_pred.withColumn(
    "classification",
    F.when(actual_pos & pred_pos, "true_positive")
     .when(actual_neg & pred_neg, "true_negative")
     .when(actual_neg & pred_pos, "false_positive")
     .when(actual_pos & pred_neg, "false_negative"),
)

In [0]:
interacts_pred.groupBy('classification').count().show()

In [0]:
# Keep only the join keys and this model's prediction columns. interacts_pred is a
# transform of df_test_seasonal; joining it whole drags every input and model-output
# column into `result`, which yields duplicate column names as soon as more than one
# such frame is joined and makes later references to those columns ambiguous.
interacts_pred_renamed = interacts_pred.select(
    'TAIL_NUM',
    'sched_depart_utc',
    F.col('prediction').alias('INT_prediction'),
    F.col('classification').alias('INT_classification'),
)


result = result.join(interacts_pred_renamed, ['TAIL_NUM', 'sched_depart_utc'])

display(result)

In [0]:
comparison = result.groupBy("AF_classification", "INT_classification").count()
display(comparison)

# Added regularization evaluation

In [0]:

reg_run= 'runs:/d6c4162bb8fd4818bbb714c2c711d628/model'

# Load model
reg_model = mlflow.spark.load_model(reg_run)

# Perform inference via model.transform()
reg_preds=reg_model.transform(df_test_seasonal)
reg_score = cv_eval(reg_preds)[0]
print(reg_score)

In [0]:

# Tag each regularized-model prediction with its confusion-matrix cell.
# Rows matching none of the four conditions get null (there is no otherwise()).
actual_pos, actual_neg = F.col("outcome") == 1, F.col("outcome") == 0
pred_pos, pred_neg = F.col("prediction") == 1, F.col("prediction") == 0

reg_preds = reg_preds.withColumn(
    "classification",
    F.when(actual_pos & pred_pos, "true_positive")
     .when(actual_neg & pred_neg, "true_negative")
     .when(actual_neg & pred_pos, "false_positive")
     .when(actual_pos & pred_neg, "false_negative"),
)

In [0]:
reg_preds.groupBy('classification').count().show()

In [0]:
# Keep only the join keys and this model's prediction columns. reg_preds is a
# transform of df_test_seasonal; joining it whole drags every input and model-output
# column into `result`, which yields duplicate column names as soon as more than one
# such frame is joined and makes later references to those columns ambiguous.
reg_pred_renamed = reg_preds.select(
    'TAIL_NUM',
    'sched_depart_utc',
    F.col('prediction').alias('REG_prediction'),
    F.col('classification').alias('REG_classification'),
)


result = result.join(reg_pred_renamed, ['TAIL_NUM', 'sched_depart_utc'])

display(result)

In [0]:
comparison = result.groupBy("INT_classification", "REG_classification").count()
display(comparison)

In [0]:
result=result.join(df_test_seasonal, ['TAIL_NUM', 'sched_depart_utc'])

In [0]:


output_path = "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/model_comparisons_1y.parquet"
(
    result.write
    .mode("overwrite")
    .parquet(output_path)
)




In [0]:
summary = result.filter(F.col('priorflight_dest') != F.col('ORIGIN')) \
    .groupBy('BL_classification', 'AF_classification', 'INT_classification', 'REG_classification') \
    .count()

display(summary)

In [0]:
missing_rte = df_train.filter(F.col('priorflight_dest') != F.col('ORIGIN'))
missing_rte.count()

In [0]:
df_train.count()

In [0]:
display(missing_rte.groupBy('outcome').count())

In [0]:
# NOTE(review): these counts look hand-copied from the missing_rte groupBy('outcome')
# output above -- confirm, and which count belongs to which outcome; they go stale
# if the data or the train/test split changes.
103889/(72752+103889)

In [0]:
display(df_train.groupBy('outcome').count())

In [0]:
# NOTE(review): these counts look hand-copied from the df_train groupBy('outcome')
# output above -- confirm, and which count belongs to which outcome; they go stale
# if the data or the train/test split changes.
1165275/(4388943+1165275)

In [0]:
# checkpoint() returns a new, checkpointed DataFrame and leaves the original untouched,
# so the result must be re-bound for later cells to benefit from the truncated lineage.
result = result.checkpoint()

In [0]:
summary = result.filter(F.col('CANCELLED') == 1) \
    .groupBy('BL_classification', 'AF_classification', 'INT_classification', 'REG_classification') \
    .count()


In [0]:
display(summary)

In [0]:
output_path = "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/model_comparisons_1y.parquet"

df=spark.read.parquet(output_path)

In [0]:
display(df)

In [0]:
import mlflow



In [0]:
reg_run= 'runs:/d6c4162bb8fd4818bbb714c2c711d628/model'
reg_model = mlflow.spark.load_model(reg_run)
reg_pred_train = reg_model.transform(df_train_seasonal)
# get f2 score
reg_train_score = cv_eval(reg_pred_train)[0]
print(reg_train_score) #sanity check

In [0]:
reg_pred_train

In [0]:
reg_pred_test = reg_model.transform(df_test_seasonal)
# get f2 score
reg_test_score = cv_eval(reg_pred_test)[0]
print(reg_test_score) #sanity check

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Define a UDF to extract the second value (probability of class 1)
def extract_positive_class_probability(vector):
    """Return the element at index 1 of ``vector`` (the class '1' probability) as a Python float.

    The explicit float() matters: indexing a numpy-backed vector yields a numpy
    scalar, which a UDF declared with DoubleType() cannot serialize.
    """
    return float(vector[1])

# Register the UDF
extract_prob_udf = udf(extract_positive_class_probability, DoubleType())

# Apply the UDF to extract probabilities and create a new column
reg_pred_train = reg_pred_train.withColumn("positive_class_prob", extract_prob_udf(reg_pred_train["probability"]))
reg_pred_test = reg_pred_test.withColumn("positive_class_prob", extract_prob_udf(reg_pred_test["probability"]))




In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Define a UDF to extract the second value (probability of class 1)
def extract_positive_class_probability(vector):
    """Return the element at index 1 of ``vector`` as a Python float.

    ``vector`` is assumed to be a DenseVector or SparseVector whose index 1
    holds the probability of class 1.
    """
    positive_prob = vector[1]
    # Convert explicitly so the value is returned as a plain float
    return float(positive_prob)

# Register the UDF
extract_prob_udf = udf(extract_positive_class_probability, DoubleType())

# Apply the UDF to extract probabilities and create a new column
reg_pred_train = reg_pred_train.withColumn("positive_class_prob", extract_prob_udf(reg_pred_train["probability"]))
reg_pred_test = reg_pred_test.withColumn("positive_class_prob", extract_prob_udf(reg_pred_test["probability"]))

display(reg_pred_test.select('positive_class_prob'))

In [0]:
display(reg_pred_test.select('prediction','probability','positive_class_prob'))

In [0]:
# NOTE(review): toPandas() collects every row and every column of reg_pred_train
# (including the model's vector columns) onto the driver. The next cell builds a
# much narrower pandas frame from three columns; consider whether this full copy
# is needed, or limit/select before converting.
reg_pred_train_pandas = reg_pred_train.toPandas()
display(reg_pred_train_pandas)

In [0]:
train_preds=reg_pred_train.select('prediction','outcome','positive_class_prob').toPandas()
test_preds=reg_pred_test.select('prediction','outcome','positive_class_prob').toPandas()


In [0]:
def plot_residuals_binary(y_true, y_pred_prob, title="Residuals"):
    """Plot a 30-bin histogram (with KDE) of ``y_true - y_pred_prob``.

    A dashed red vertical line marks zero residual. The figure is shown
    immediately; nothing is returned.
    """
    prob_residuals = y_true - y_pred_prob
    plt.figure(figsize=(6, 4))
    sns.histplot(prob_residuals, kde=True, bins=30, color="blue")
    plt.axvline(0, color='red', linestyle='--')
    plt.xlabel("Residuals (True - Predicted Probability)")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()

# Plot residuals using the predicted probabilities (after extracting them)
plot_residuals_binary(train_preds['outcome'], train_preds['positive_class_prob'], title="Train Set Residuals")
plot_residuals_binary(test_preds['outcome'], test_preds['positive_class_prob'], title="Test Set Residuals")


In [0]:
def plot_residuals(y_true, y_pred, title="Residuals"):
    """Plot a 30-bin histogram (with KDE) of ``y_true - y_pred``.

    A dashed red vertical line marks zero residual. The figure is shown
    immediately; nothing is returned.
    """
    label_residuals = y_true - y_pred
    plt.figure(figsize=(6, 4))
    sns.histplot(label_residuals, kde=True, bins=30, color="blue")
    plt.axvline(0, color='red', linestyle='--')
    plt.xlabel("Residuals (True - Predicted)")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()

plot_residuals(train_preds['outcome'], train_preds['prediction'], title="Train Set Residuals")
plot_residuals(test_preds['outcome'], test_preds['prediction'], title="Test Set Residuals")

# Distribution of features on train vs. test sets
def plot_feature_distribution(train_data, test_data, feature_name):
    """Overlay density histograms (with KDE) of ``feature_name`` for train (blue) and test (red)."""
    plt.figure(figsize=(6, 4))
    series_specs = (
        (train_data, 'blue', 'Train'),
        (test_data, 'red', 'Test'),
    )
    for data, color, label in series_specs:
        sns.histplot(data[feature_name], kde=True, color=color, label=label, stat="density")
    plt.title(f"Feature Distribution: {feature_name}")
    plt.xlabel(f"{feature_name}")
    plt.ylabel("Density")
    plt.legend()
    plt.show()