# Catboost Model
Trying out CatBoost on the training data set, with cross validation.

In [0]:
import pyspark
from pyspark.sql.types import StringType, BooleanType, IntegerType
import pyspark.sql.functions as F

import airporttime
from datetime import datetime, timedelta

import numpy as np

In [0]:
from pyspark.sql import SQLContext
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import udf
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
from pyspark.ml.feature import Bucketizer
from pyspark.ml import Pipeline
from sklearn.metrics import confusion_matrix

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from sparkdl.xgboost import XgboostRegressor
import catboost_spark

#### Configure access to the Azure Blob storage container that holds the data, for quick access when datasets are huge

In [0]:
# The name of your container created in https://portal.azure.com
blob_container = "w261-scrr"
# The name of your Storage account created in https://portal.azure.com
storage_account = "midsw261rv"

# The name of the scope created in your local computer using the Databricks CLI
secret_scope = "w261scrr"
# The name of the secret key created in your local computer using the Databricks CLI
secret_key = "w261scrrkey"

mount_path = "/mnt/mids-w261"
# Base wasbs:// URL that every parquet path in this notebook is built from.
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

In [0]:
# Register the SAS credential for the blob container on the Spark session so
# that the wasbs:// paths built from blob_url can be read. The value is fetched
# from the Databricks secret scope at run time instead of being hard-coded here.
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
%run "../libs/weather_aggregation"

In [0]:
%run "../libs/time_based_features"

In [0]:
%run "../libs/transform"

In [0]:
%run "../libs/model_helper_functions"

#### Import joined data

In [0]:
# Joined training data set, read from blob storage.
# NOTE(review): df_train is not referenced by any later cell visible in this notebook.
df_train = spark.read.parquet(f"{blob_url}/join_full_0329")

In [0]:
# Joined test data set, read from blob storage.
# NOTE(review): df_test is not referenced by any later cell visible in this notebook.
df_test = spark.read.parquet(f"{blob_url}/test_full_join_0404")

### Cross Validation

In [0]:
# Load the five cross validation folds from blob storage.
# Fold i is stored as cv_train_0402_split<i> (training part) and
# cv_val_0402_split<i> (validation part); both lists end up index-aligned.

df_train_split, df_val_split = [], []

for i in range(5):
  cv_train_str = f"cv_train_0402_split{i}"
  cv_val_str = f"cv_val_0402_split{i}"

  df_train_split += [spark.read.parquet(f"{blob_url}/{cv_train_str}")]
  df_val_split += [spark.read.parquet(f"{blob_url}/{cv_val_str}")]



In [0]:
# A little bit of preprocessing needed
def preprocess(df):
  """Fill missing weather values, add target-mean encodings and cast calendar fields.

  Steps, in order:
    1. Nulls in the ceiling-height and horizontal-visibility medians are
       replaced with 999999.
    2. Nulls in the rain, snow, wind, temperature and pressure columns are
       replaced with 0.
    3. An ORIGIN_DEST_COMBO column ("<ORIGIN>-<DEST>") is added.
    4. ORIGIN, DEST and ORIGIN_DEST_COMBO are passed to target_mean_encoding
       (helper defined outside this notebook) with DEP_DEL15 as the target.
    5. DAY_OF_WEEK and MONTH are cast to strings, and the three raw
       ORIGIN / DEST / ORIGIN_DEST_COMBO columns are dropped.

  Args:
    df: Spark DataFrame containing at least the columns named above.

  Returns:
    The transformed Spark DataFrame.

  NOTE(review): the encoding is computed from whichever frame is passed in.
  When this is called on a validation/test frame the encoding is presumably
  derived from that frame's own DEP_DEL15 labels, which would leak the target.
  Confirm against target_mean_encoding's implementation.
  """
  df = df.fillna(999999, subset=['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median'])
  df = df.fillna(0, subset=['AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration', 'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean'])

  # F.col is used (rather than a bare `col`) because this notebook only imports
  # pyspark.sql.functions as F; a bare `col` would resolve only if one of the
  # %run helper notebooks happened to leak it into the namespace.
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  df = target_mean_encoding(df, col=['ORIGIN', 'DEST','ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  # String-typed columns are what the later dtype check treats as categorical.
  df = df.withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType())) \
         .withColumn("MONTH", F.col("MONTH").cast(StringType())) \
         .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')

  return df

In [0]:
# Model inputs: flights + weather + time based attribute
selected_cols = [
    'DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST',
    'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration',
    'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean',
    'PREV_DEP_DEL15',
]

# Push the first training fold through preprocess() so the column lists below
# reflect the schema after preprocessing.
df_temp = preprocess(df_train_split[0].select(*selected_cols))

# Label, categorical and numerical column names for the pipeline.
labelCol = ['DEP_DEL15']

# String-typed columns are categorical; everything else is numeric.
categoricalColumns = [name for name, dtype in df_temp.dtypes if dtype == 'string']
numericCols = [name for name, dtype in df_temp.dtypes if dtype != 'string']

# The label must not be fed to the model as a feature.
numericCols.remove(labelCol[0])

In [0]:
def processTrainAndTestData(train, test, sampling=None):
    """Preprocess one train/validation pair and run it through the feature pipeline.

    Both frames go through preprocess(). The training frame is then optionally
    resampled, the pipeline from getRegressionPipeline() is fitted on the
    training frame only, and the fitted pipeline is applied to both frames.

    Args:
        train: Spark DataFrame holding the training part of the fold.
        test: Spark DataFrame holding the validation part of the fold.
        sampling: 'over' to oversample or 'under' to undersample the training
            frame; any other value leaves it unchanged.

    Returns:
        (train, test): the transformed frames, each restricted to 'features'
        followed by the columns of the preprocessed training frame.
    """
    cv_train, cv_val = preprocess(train), preprocess(test)

    # Only the training frame is ever resampled.
    if sampling == 'under':
        print("Using undersampling")
        cv_train = undersampling(cv_train)
    elif sampling == 'over':
        print("Using oversampling")
        cv_train = oversampling(cv_train)

    fitted_pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol).fit(cv_train)

    output_cols = ['features', *cv_train.columns]

    ml_train = fitted_pipeline.transform(cv_train).select(output_cols)
    ml_val = fitted_pipeline.transform(cv_val).select(output_cols)
    return ml_train, ml_val


In [0]:
def trainAndTestCatboost(train_splits, test_splits, selected_cols, hyper_params = None, is_data_processed = False):
  """Fit and score a CatBoost model on every cross validation fold.

  For fold i the model is fitted on train_splits[i] and evaluated on
  test_splits[i]. Per-fold precision, recall and F-beta(0.5) are printed as
  reported by getMetrics(), followed by their averages across folds.

  Args:
    train_splits: sequence of training DataFrames, one per fold.
    test_splits: sequence of validation DataFrames, index-aligned with
      train_splits (must contain at least as many entries).
    selected_cols: not used by this function; kept so existing callers that
      pass it positionally continue to work.
    hyper_params: forwarded unchanged to the model helper.
    is_data_processed: when True the fold is fitted with execCatboostModel,
      otherwise with execCatboostModelUnprocessed.

  Returns:
    (model, pred, metricsArray): the model and predictions of the last fold,
    and a float array of shape (n_folds, 3) with one
    [precision, recall, fmeasure] row per fold.

  Raises:
    ValueError: if train_splits is empty, or test_splits has fewer folds than
      train_splits. Both are checked before any model is trained.
  """
  train_splits = list(train_splits)

  # Fail fast: without this an empty input only failed at the final return,
  # and a short test_splits only failed after earlier folds had been trained.
  if not train_splits:
    raise ValueError("train_splits must contain at least one cross validation fold")
  if len(test_splits) < len(train_splits):
    raise ValueError(
      "test_splits has {:d} folds but train_splits has {:d}".format(len(test_splits), len(train_splits)))

  fold_metrics = []
  model, pred = None, None

  for i, train in enumerate(train_splits):
    test = test_splits[i]

    print("############################")
    print("Validation Set {:d}".format(i+1))
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))

    if is_data_processed:
      model, pred = execCatboostModel(train, test, hyper_params = hyper_params)
    else:
      model, pred = execCatboostModelUnprocessed(train, test, hyper_params = hyper_params)

    precision, recall, fmeasure = getMetrics(pred)

    print("Precision is {:.3f}".format(precision))
    print("Recall is {:.3f}".format(recall))
    print("F beta(0.5) score is {:.3f}".format(fmeasure))

    fold_metrics.append([precision, recall, fmeasure])

  # Metrics are fractions, so the array is built as float rather than int.
  metricsArray = np.array(fold_metrics, dtype=float)
  avgArray = np.mean(metricsArray, axis=0)

  print("############################")
  print("Average of Cross validation")
  print("Average Precision is {:.3f}".format(avgArray[0]))
  print("Average Recall is {:.3f}".format(avgArray[1]))
  print("Average F beta(0.5) score is {:.3f}".format(avgArray[2]))

  return model, pred, metricsArray

  

In [0]:
## Pre-process cv splits for test and train
def preprocessCVSplitData(cv_train, cv_test, sampling=None):
  """Run processTrainAndTestData over every cross validation fold.

  Args:
    cv_train: sequence of training DataFrames, one per fold.
    cv_test: sequence of validation DataFrames, indexed in step with cv_train.
    sampling: passed straight through to processTrainAndTestData.

  Returns:
    (train_splits, test_splits): two lists holding the processed training
    and validation frame of each fold, in fold order.
  """
  processed_train, processed_test = [], []

  for fold_idx, fold_train in enumerate(cv_train):
    fold_val = cv_test[fold_idx]
    ml_train, ml_val = processTrainAndTestData(fold_train, fold_val, sampling)
    processed_train.append(ml_train)
    processed_test.append(ml_val)

  return processed_train, processed_test

### Create train and test data sets with no sampling, under sampling and over sampling

In [0]:
# Get preprocessed train and test splits (under sampling)
# Only the training part of each fold is undersampled; the validation part is
# preprocessed and transformed but not resampled.
train_splits_under, test_splits_under = preprocessCVSplitData(df_train_split, df_val_split, sampling='under')

In [0]:
# Get preprocessed train and test splits (over sampling)
# Only the training part of each fold is oversampled; the validation part is
# preprocessed and transformed but not resampled.
train_splits_over, test_splits_over = preprocessCVSplitData(df_train_split, df_val_split, sampling='over')

In [0]:
# Get preprocessed train and test splits (no sampling)
# sampling=None skips both resampling branches, so the training folds are used as loaded.
train_splits, test_splits = preprocessCVSplitData(df_train_split, df_val_split, sampling=None)

In [0]:
# Catboost complains if you don't set this before running (should be set to the number of cores available at time of running)
# NOTE(review): the key written here is "spark.tasks.cpus" (plural "tasks"). Spark's
# documented per-task CPU setting is "spark.task.cpus" — confirm which key
# catboost_spark actually reads, and whether it can be changed on a running session
# or has to be set in the cluster configuration instead.
spark.conf.set("spark.tasks.cpus", 28)

### Run the default CatBoost classifier, using only the default hyperparameters, on the undersampled data sets

In [0]:
# defaults 
# An empty dict means no hyper parameter overrides are supplied.
# is_data_processed=True makes trainAndTestCatboost fit each fold with execCatboostModel.
hyper_params={}
trainAndTestCatboost(train_splits_under, test_splits_under, selected_cols, hyper_params=hyper_params, is_data_processed=True)

### Now run with number of iterations set to 10 (default is 1000). The learning rate automatically adjusts to account for this

In [0]:
## Tweaking hyper params - decrease number of iterations to 10
# Same undersampled folds as the default run; only 'iterations' is overridden.
hyper_params = {    
 
  'iterations': 10
}
trainAndTestCatboost(train_splits_under, test_splits_under, selected_cols, hyper_params=hyper_params, is_data_processed=True)

### Attempting to run on the oversampled data

In [0]:
## Over sampling 
# Same 10-iteration setting as the undersampled run, applied to the oversampled folds.
# NOTE: the markdown that follows this cell records that this run ended in an error.
hyper_params = {    
 
  'iterations': 10
}
trainAndTestCatboost(train_splits_over, test_splits_over, selected_cols, hyper_params=hyper_params, is_data_processed=True)

I've left the above error in as it's a good example of where Catboost and Spark/Databricks don't seem to play nicely

## Summary
The results of this model are poor compared with some of the other models we have tried. It is also quite difficult to run (it frequently errors out due to cluster nodes), and the documentation is poor (it is not up to date with the latest major release).