In [1]:
# default libs
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# plotting libs

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

import datashader as ds
import colorcet as cc

# ML lib
        
import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of a dataframe to the smallest dtype that fits.

    Every column whose dtype is one of the plain numpy int/float types is
    inspected. Integer columns are narrowed to int8/int16/int32 and float
    columns to float16/float32 when the column's min and max lie inside the
    target type's representable range (bounds inclusive). All other columns
    are left untouched.

    Note: the range check only guards against overflow. Casting floats to
    float16/float32 can still round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the resulting memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with narrowed dtypes.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, smallest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or min/max are NaN): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

---
# Loading and feature engineering

## Loading

We load all three datasets and put the airport info into the `flights` sets.

In [3]:
airports = pd.read_csv('/kaggle/input/flight-delays-prediction-challeng2021/airports.csv')
flights_train = pd.read_csv('/kaggle/input/flight-delays-prediction-challeng2021/flights_train.csv')
flights_test = pd.read_csv('/kaggle/input/flight-delays-prediction-challeng2021/flights_test.csv')

# Two views of the airport coordinates, keyed by the flight column they join on.
airports_origin = airports[['IATA_CODE','LATITUDE','LONGITUDE']].rename(columns = {'IATA_CODE' : 'ORIGIN_AIRPORT'})
airports_arrive = airports[['IATA_CODE','LATITUDE','LONGITUDE']].rename(columns = {'IATA_CODE' : 'DESTINATION_AIRPORT'})


def _add_airport_coords(flights):
    """Return `flights` with the origin and arrival airport coordinates attached."""
    # Left joins keep every flight even when its airport code is absent from
    # airports.csv. The default inner join would silently drop such rows,
    # which for the test set means rows with no prediction.
    with_origin = flights.merge(airports_origin, on = 'ORIGIN_AIRPORT', how = 'left').rename(
        columns = {'LATITUDE' : 'LATITUDE_origin', 'LONGITUDE' : 'LONGITUDE_origin'}
    )
    return with_origin.merge(airports_arrive, on = 'DESTINATION_AIRPORT', how = 'left').rename(
        columns = {'LATITUDE' : 'LATITUDE_arrival', 'LONGITUDE' : 'LONGITUDE_arrival'}
    )


# make airports nice
flights_train = _add_airport_coords(flights_train)
flights_test = _add_airport_coords(flights_test)


### Fix Airports

In [4]:
# Show the airports.csv rows that have no latitude.
airports[airports.LATITUDE.isna()]

In [5]:
# Coordinates for the airports whose LATITUDE/LONGITUDE are missing.
# Rows are matched on IATA_CODE rather than on hard-coded row labels, so the
# patch still hits the right airports if the row order of the file changes.
# NOTE(review): the flight frames were merged with the airport coordinates in
# an earlier cell, so this patch only changes `airports` itself.
missing_coords = {
    "ECP": (30.3548543, -85.8017021),
    "PBG": (44.6520597, -73.470109),
    "UST": (29.95439, -81.3450803),
}
for iata_code, (latitude, longitude) in missing_coords.items():
    airports.loc[airports["IATA_CODE"] == iata_code, ["LATITUDE", "LONGITUDE"]] = [latitude, longitude]

In [6]:
# Collect every airport code used as origin or destination in the train AND
# test flights (the test frame was previously never consulted: the train
# columns were added twice).
# NOTE: this rebinds `airports` from the airports.csv frame to a list of codes.
airport_code_set = set()
for flights in (flights_train, flights_test):
    airport_code_set.update(flights.ORIGIN_AIRPORT.unique())
    airport_code_set.update(flights.DESTINATION_AIRPORT.unique())

airports = list(airport_code_set)

In [7]:
# Columns requested from the ASOS service; used to build an empty frame
# when there is nothing to download.
weather_columns = [
    "station",
    "valid",
    "tmpc",
    "sknt",
    "p01m",
    "vsby",
    "gust",
    "skyc1",
    "skyc2",
    "skyc3",
    "wxcodes",
    "ice_accretion_6hr",
    "snowdepth"
]

try:
    # Prefer the cached weather dataset when it is attached to the notebook.
    weather = pd.read_csv("../input/weather-data-2015-for-all-us-airports/weather_for_all_airports.csv")
except FileNotFoundError:
    # `airports` is the airports.csv frame right after loading, but an earlier
    # cell rebinds it to a plain list of codes; accept both.
    station_codes = airports["IATA_CODE"] if isinstance(airports, pd.DataFrame) else airports

    # Download one CSV per station and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0 and copies on every call).
    station_frames = []
    for code in station_codes:
        url = f"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station={code}&data=tmpc&data=sknt&data=p01m&data=vsby&data=gust&data=skyc1&data=skyc2&data=skyc3&data=wxcodes&data=ice_accretion_6hr&data=snowdepth&year1=2015&month1=1&day1=1&year2=2015&month2=8&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=empty&trace=T&direct=no&report_type=1&report_type=2"
        station_frames.append(pd.read_csv(url))
    weather = pd.concat(station_frames) if station_frames else pd.DataFrame(columns=weather_columns)
    weather.to_csv("./weather_data.csv")

In [8]:
# Number of distinct stations present in the weather data.
len(weather["station"].unique())

In [9]:
# Parse the observation timestamps and snap them to the nearest full hour.
weather["valid"] = pd.to_datetime(weather["valid"]).dt.round('1h')

In [10]:
weather = weather.drop(columns=["Unnamed: 0","Unnamed: 0.1"])

In [11]:
# Downcast the weather dtypes. NOTE: the result is bound to `weahter` (sic);
# reduce_mem_usage returns the frame it was given, so at this point `weahter`
# and `weather` are the same object.
weahter = reduce_mem_usage(weather)

In [12]:
# Row count before de-duplication.
len(weahter["station"])

In [13]:
# Key that identifies one observation slot: rounded timestamp followed by station code.
weahter["MATCH"] = weahter["valid"].astype(str) + weahter["station"]

In [14]:
# Remove rows that are exact duplicates of an earlier row.
weahter = weahter[~weahter.duplicated()]
# Keep only the first row per MATCH key. The result is bound to `weather`;
# `weahter` still holds the rows with repeated keys.
weather = weahter[~weahter["MATCH"].duplicated()]
# NOTE(review): this result is not assigned, so `weather` keeps its MATCH column
# and the line only displays the frame. The later column clean-up lists
# "MATCH_departure"/"MATCH_arrival", so downstream code appears to rely on that.
weather.drop(columns="MATCH")

In [15]:
# Clean the weather data a bit further.
# All steps are applied to `weather`, the de-duplicated frame that is merged
# into the flights later. Previously the trace/float fixes were applied to
# `weahter` only and never reached the merged data.
# Work on an explicit copy: `weather` was produced by boolean indexing.
weather = weather.copy()

# 'T' marks a trace amount in the download; treat it (and missing ice values) as 0.
ice_accretion = weather["ice_accretion_6hr"].fillna(0)
weather["ice_accretion_6hr"] = ice_accretion.mask(ice_accretion == 'T', 0).astype(float)

precipitation = weather["p01m"]
weather["p01m"] = precipitation.mask(precipitation == 'T', 0).astype(float)

weather["sknt"] = weather["sknt"].fillna(0)
weather["vsby"] = weather["vsby"].fillna(1)

In [16]:
# Row count of `weahter` after the exact-duplicate rows were removed.
len(weahter["station"])

## capturing important info

In this section we gather some information about the data set we think is important for feature engineering :)

### airport and airline rankings

There are a TON of airports, which makes it computationally expensive to include the origin and destination airports as categorical features. Keeping only the most frequent airports and one-hot encoding those is one option. What might be nicer (and cheaper) is to rank the airports by their average arrival delay and feed that rank into the model.

In [17]:
def _rank_by_mean_delay(key):
    """Map each value of `key` to its position when sorted by mean ARRIVAL_DELAY, lowest first."""
    ordered = (
        flights_train
        .groupby(key)
        .ARRIVAL_DELAY
        .mean()
        .sort_values(ascending=True)
        .index
        .tolist()
    )
    return {label: position for position, label in enumerate(ordered)}


orig_rank = _rank_by_mean_delay('ORIGIN_AIRPORT')
dest_rank = _rank_by_mean_delay('DESTINATION_AIRPORT')
airl_rank = _rank_by_mean_delay('AIRLINE')

### different time columns

In [18]:
# Columns holding a time of day; prep_df converts these to minutes.
clock_time_columns = [
    'SCHEDULED_DEPARTURE', # planned leaving time
    'DEPARTURE_TIME',      # actual leaving time
    'WHEELS_OFF',          # moment the airplane leaves the ground
    'SCHEDULED_ARRIVAL',   # planned arrival
]
# Columns holding a duration or difference rather than a time of day.
relative_time_columns = [
    'TAXI_OUT',            # minutes between departure and wheels off
    'SCHEDULED_TIME',      # planned flight (+ground?) time
    'ARRIVAL_DELAY'        # difference between scheduled arrival and actual arrival
]

### outliers

harms performance

In [19]:
# Outlier band: mean arrival delay plus/minus `stds` standard deviations.
stds = 4
arrival_delay = flights_train["ARRIVAL_DELAY"]

# summary statistics of the target
data_mean = arrival_delay.mean()
data_std = arrival_delay.std()

# cut-off distance from the mean and the resulting band
cut_off = data_std * stds
lower = data_mean - cut_off
upper = data_mean + cut_off

# rows outside the band on either side
x_upper = flights_train[arrival_delay > upper]
x_lower = flights_train[arrival_delay < lower]
total = flights_train.shape[0]

print("\n")
print(f'There are {x_upper.shape[0]} ({(x_upper.shape[0]/total*100):.2f}%) observations above {upper:.2f} ({stds} standard deviations).')
print(f'There are {x_lower.shape[0]} ({(x_lower.shape[0]/total*100):.2f}%) observations below {lower:.2f} ({stds} standard deviations).')

## Data prep

In [20]:
def prep_df(df_in, rm_outliers = False):
    """
    Build the modelling features for a flights frame and return them as a new frame.

    Steps, in order:
      * drop the airport coordinate columns, FLIGHT_NUMBER and TAIL_NUMBER;
      * cast every float64 column to int64 and convert the columns listed in
        `clock_time_columns` from an hours*100 + minutes value to minutes;
      * add SCHEDULED_DEPARTURE_DATETIME / SCHEDULED_ARRIVAL_DATETIME and drop YEAR;
      * add the delay and time-difference features;
      * insert A_RANK, D_RANK and O_RANK at column position 3;
      * optionally keep only rows whose ARRIVAL_DELAY lies strictly between
        the module-level `lower` and `upper`.

    Relies on the module-level names `clock_time_columns`, `orig_rank`,
    `dest_rank`, `airl_rank`, and (when rm_outliers is set) `lower`/`upper`.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Flights frame that already carries the merged airport coordinate columns.
    rm_outliers : bool, default False
        Filter rows on ARRIVAL_DELAY using `lower`/`upper`.

    Returns
    -------
    pandas.DataFrame
        The prepared frame.
    """

    # drop useless columns
    out = df_in.drop(columns=[
        # O/A airports and distance are known
        'LATITUDE_origin',
        'LONGITUDE_origin',
        'LATITUDE_arrival',
        'LONGITUDE_arrival',
        # most date stuff is useless
        # 'YEAR', # only 2015 but needed to convert to datetime
        # 'MONTH', # no repetition with test
        # probably useless columns
        'FLIGHT_NUMBER', # garbage
        'TAIL_NUMBER', # also uninteresting
    ])

    for x in out.columns:
        # set float columns to int
        # NOTE(review): this cast presumably fails if a float column contains NaN — confirm inputs are complete
        if (out[x].dtype == 'float64'):
            out[x] = out[x].astype(np.int64)
        # convert clock-time columns: (value // 100) hours and (value % 100) minutes -> total minutes
        if x in clock_time_columns:
            out[x] = ((out[x] // 100)*60) + (out[x] % 100)
            
    # calendar date plus departure minutes; arrival = departure datetime + SCHEDULED_TIME minutes
    out["SCHEDULED_DEPARTURE_DATETIME"] = pd.to_datetime(out[["YEAR", "MONTH", "DAY"]]) + pd.to_timedelta(out["SCHEDULED_DEPARTURE"], unit="minutes")
    out["SCHEDULED_ARRIVAL_DATETIME"] = out["SCHEDULED_DEPARTURE_DATETIME"] + pd.to_timedelta(out["SCHEDULED_TIME"], unit="minutes")

    # YEAR was only needed to build the datetimes above
    out = out.drop(columns=["YEAR"])
    
#     # did it carry over days - useless
#     out["CROSS_DAY"] = np.where(out.SCHEDULED_DEPARTURE > out.SCHEDULED_ARRIVAL, 1, 0)
#     out["CROSS_DAY"] = out.CROSS_DAY.astype(np.bool_)
#     out["MULTI_DAY"] = np.where(out.SCHEDULED_TIME > 1440, 1, 0)
#     out["MULTI_DAY"] = out.MULTI_DAY.astype(np.bool_)

    # no account for days
    # df_out.insert(10, "DEPARTURE_DELAY", (df_out["DEPARTURE_TIME"] - df_out["SCHEDULED_DEPARTURE"]))
    # yes account for days but break some things on the way, still better than w/o probably
    out["DEPARTURE_DELAY"] = out.DEPARTURE_TIME - out.SCHEDULED_DEPARTURE
    # very few planes leave more than 1.5 hr early, so a delay below -90 is shifted by one day (1440 minutes)
    out["DEPARTURE_DELAY_COR"] = np.where(out.DEPARTURE_DELAY < -90, out.DEPARTURE_DELAY + 1440, out.DEPARTURE_DELAY)
    
    # differences between the minute-of-day columns; should help with reconstructing days
    out["TAXI_OUT2"]       = out.DEPARTURE_TIME      - out.WHEELS_OFF
    out["TAXI_OUT_COMP"]   = out.TAXI_OUT            - out.TAXI_OUT2
    out["TOTALTIME_LEFT"]  = out.DEPARTURE_TIME      - out.SCHEDULED_ARRIVAL
    out["TAKEOFF_DELAY"]   = out.SCHEDULED_DEPARTURE - out.WHEELS_OFF
    out["SCHEDULED_TIME2"] = out.SCHEDULED_DEPARTURE - out.SCHEDULED_ARRIVAL
    out["AIRTIME_LEFT"]    = out.WHEELS_OFF          - out.SCHEDULED_ARRIVAL

    # origin, destination and airline ranks; values missing from the rank dicts map to NaN
    out.insert(3, "O_RANK", out["ORIGIN_AIRPORT"].map(orig_rank))
    out.insert(3, "D_RANK", out["DESTINATION_AIRPORT"].map(dest_rank))
    out.insert(3, "A_RANK", out["AIRLINE"].map(airl_rank))
    
    # still need these
    #out = out.drop(columns = [
    #    "ORIGIN_AIRPORT",
    #    "DESTINATION_AIRPORT",
    #])
    
    # keep only rows strictly inside the (lower, upper) band of ARRIVAL_DELAY
    if(rm_outliers):
        out = out[out["ARRIVAL_DELAY"] < upper]
        out = out[out["ARRIVAL_DELAY"] > lower]
    return out

In [21]:
# Prepare and shrink the training set, then display it ordered by id
# (the sorted view is not stored).
ptrain = reduce_mem_usage(prep_df(flights_train, rm_outliers=False))
ptrain.sort_values("id")

In [22]:
# Prepare and shrink the test set, then display it ordered by id
# (the sorted view is not stored).
ptest = reduce_mem_usage(prep_df(flights_test))
ptest.sort_values("id")

## Align weather data with flights data

In [23]:
# Snap the scheduled departure/arrival datetimes to the full hour so they can be
# matched against the hourly-rounded weather timestamps.
# NOTE(review): the weather download requests UTC timestamps; confirm the flight
# schedule times are in the same time zone.
for flights in (ptrain, ptest):
    for column in ("SCHEDULED_DEPARTURE_DATETIME", "SCHEDULED_ARRIVAL_DATETIME"):
        flights[column] = flights[column].dt.round('1h')

In [24]:
# Attach the weather at the origin airport for the scheduled departure hour.
# NOTE(review): both suffixes are "_departure"; suffixes only apply to column
# names present on both sides — presumably none overlap at this first merge.
ptrain = ptrain.merge(
    weather,
    how="left",
    left_on=["SCHEDULED_DEPARTURE_DATETIME", "ORIGIN_AIRPORT"],
    right_on=["valid", "station"],
    suffixes=("_departure", "_departure"),
)

In [25]:
# Attach the weather at the origin airport for the scheduled departure hour.
# NOTE(review): both suffixes are "_departure"; suffixes only apply to column
# names present on both sides — presumably none overlap at this first merge.
ptest = ptest.merge(
    weather,
    how="left",
    left_on=["SCHEDULED_DEPARTURE_DATETIME", "ORIGIN_AIRPORT"],
    right_on=["valid", "station"],
    suffixes=("_departure", "_departure"),
)

In [26]:
# Attach the weather at the destination airport for the scheduled arrival hour.
# Weather columns already present get "_departure"; the new ones get "_arrival".
ptrain = ptrain.merge(
    weather,
    how="left",
    left_on=["SCHEDULED_ARRIVAL_DATETIME", "DESTINATION_AIRPORT"],
    right_on=["valid", "station"],
    suffixes=("_departure", "_arrival"),
)

In [27]:
# Attach the weather at the destination airport for the scheduled arrival hour.
# Weather columns already present get "_departure"; the new ones get "_arrival".
ptest = ptest.merge(
    weather,
    how="left",
    left_on=["SCHEDULED_ARRIVAL_DATETIME", "DESTINATION_AIRPORT"],
    right_on=["valid", "station"],
    suffixes=("_departure", "_arrival"),
)

In [58]:
# Free unused memory.
# Each name is deleted exactly once: deleting an already-deleted name raises
# NameError and would abort the cell.
del weather
del airports
del flights_train
del flights_test
del airports_origin
del airports_arrive

In [28]:
# Collect the distinct sky-condition codes seen in any of the six skyc columns.
# dict.fromkeys removes repeats while keeping first-seen order, so each code is
# listed (and later one-hot encoded) only once instead of once per column.
skyc_source_columns = [
    f"skyc{level}_{side}"
    for side in ("departure", "arrival")
    for level in (1, 2, 3)
]
skyc = list(dict.fromkeys(
    code
    for column in skyc_source_columns
    for code in ptrain[column].dropna().unique().tolist()
))
skyc

In [29]:
def _sky_mask(frame, side, value):
    """True where any of the three skyc columns of `side` equals `value`."""
    mask = frame[f"skyc1_{side}"] == value
    for level in (2, 3):
        mask = mask | (frame[f"skyc{level}_{side}"] == value)
    return mask


# One boolean column per sky-condition code and side, for both frames.
for i, sky_cond in enumerate(skyc):
    print(f"Doing {i+1}/{len(skyc)}")

    # Compute every mask before adding any column.
    train_arrival = _sky_mask(ptrain, "arrival", sky_cond)
    test_arrival = _sky_mask(ptest, "arrival", sky_cond)
    train_departure = _sky_mask(ptrain, "departure", sky_cond)
    test_departure = _sky_mask(ptest, "departure", sky_cond)

    ptrain[f"{sky_cond}_arrival"] = train_arrival
    ptrain[f"{sky_cond}_departure"] = train_departure

    ptest[f"{sky_cond}_departure"] = test_departure
    ptest[f"{sky_cond}_arrival"] = test_arrival

In [30]:
# Collect the distinct weather codes from the four code slots of each side.
# dict.fromkeys removes repeats while keeping first-seen order, so each code is
# handled once downstream.
# NOTE(review): the "0".."3" slot columns are not created in any visible cell;
# presumably they come with the cached weather file — confirm.
wxc_source_columns = [
    f"{slot}_{side}"
    for side in ("arrival", "departure")
    for slot in range(4)
]
wxc = list(dict.fromkeys(
    code
    for column in wxc_source_columns
    for code in ptrain[column].dropna().unique().tolist()
))

In [31]:
def _wx_mask(frame, side, value):
    """True where any of the four code-slot columns of `side` equals `value`."""
    mask = frame[f"0_{side}"] == value
    for slot in (1, 2, 3):
        mask = mask | (frame[f"{slot}_{side}"] == value)
    return mask


# One boolean column per selected weather code and side, for both frames.
# Only codes containing one of the `events` substrings are encoded.
events = ['TS']
codes_to_use = [code for code in wxc if any(event in code for event in events)]
for i, wxcode in enumerate(codes_to_use):
    print(f"Doing {i+1}/{len(codes_to_use)}")

    # Compute every mask before adding any column.
    train_arrival = _wx_mask(ptrain, "arrival", wxcode)
    test_arrival = _wx_mask(ptest, "arrival", wxcode)
    train_departure = _wx_mask(ptrain, "departure", wxcode)
    test_departure = _wx_mask(ptest, "departure", wxcode)

    ptrain[f"{wxcode}_departure"] = train_departure
    ptrain[f"{wxcode}_arrival"] = train_arrival

    ptest[f"{wxcode}_departure"] = test_departure
    ptest[f"{wxcode}_arrival"] = test_arrival

In [32]:
# Columns that were only needed for joining or one-hot encoding.
cols_to_remove = [
    # join keys
    "SCHEDULED_DEPARTURE_DATETIME",
    "SCHEDULED_ARRIVAL_DATETIME",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "station_departure",
    "valid_departure",
    "station_arrival",
    "valid_arrival",
    # raw weather codes and their slots
    "wxcodes_departure",
    "wxcodes_arrival",
    "0_arrival",
    "1_arrival",
    "2_arrival",
    "3_arrival",
    "0_departure",
    "1_departure",
    "2_departure",
    "3_departure",
    # raw sky-condition columns
    "skyc1_departure",
    "skyc2_departure",
    "skyc3_departure",
    "skyc1_arrival",
    "skyc2_arrival",
    "skyc3_arrival",
    # de-duplication keys
    "MATCH_departure",
    "MATCH_arrival",
]
ptrain, ptest = ptrain.drop(columns=cols_to_remove), ptest.drop(columns=cols_to_remove)

In [33]:
# Fill the NaNs caused by timestamps missing from the weather data.
# Each column is filled from itself. Previously every *_arrival column was
# overwritten with the filled *_departure column, which replaced the arrival
# weather with the departure weather.
weather_fill_values = {
    "tmpc": 0,
    "sknt": 0,
    "p01m": 0,
    "vsby": 10,
    "gust": 0,
    "ice_accretion_6hr": 0,
}
for flights in (ptrain, ptest):
    for side in ("departure", "arrival"):
        for measure, fill_value in weather_fill_values.items():
            column = f"{measure}_{side}"
            flights[column] = flights[column].fillna(fill_value)

In [None]:
# Arrival delay against departure delay, coloured by taxi-out time.
# The y column is DEPARTURE_DELAY as created by prep_df; no column named
# "DEPARTURE_DELAY2" is ever created. The horizontal line marks the -90 minute
# threshold used for DEPARTURE_DELAY_COR.
graph = sns.scatterplot(data=ptrain, x="ARRIVAL_DELAY", y="DEPARTURE_DELAY", hue="TAXI_OUT")
graph.axhline(-90)
graph

In [None]:
# Density plot of ARRIVAL_DELAY against OD_RANK rendered with datashader.
# NOTE(review): neither `temp` nor an 'OD_RANK' column is defined in any visible
# cell; this cell cannot run as-is on a fresh kernel.
cvs = ds.Canvas(plot_width=1000, plot_height=1000)  # auto range or provide the `bounds` argument
agg = cvs.points(temp, 'OD_RANK', 'ARRIVAL_DELAY')  # this is the histogram
img = ds.tf.set_background(ds.tf.shade(agg, how="log", cmap=cc.bmw), "white").to_pil()  # create a rasterized image
plt.imshow(img)
plt.axis('off')
plt.show()

In [38]:
# Display the prepared test frame.
ptest

In [34]:
# Persist the prepared frames (without the index) to the working directory.
ptest.to_csv("test_emiel_v6.csv",index=False)
ptrain.to_csv("train_emiel_v6.csv",index=False)

# Modeling

In [35]:
# number of sockets, i.e. available slots for physical processors
!lscpu | grep 'Socket(s):'


# number of cores per processor
!lscpu | grep 'Core(s) per socket'

# memory currently available
!cat /proc/meminfo | grep 'MemAvailable'

In [36]:
# Start (or connect to) an H2O cluster limited to 10G of memory.
# h2o.cluster().shutdown()
h2o.init(max_mem_size = "10G")

In [59]:
# Convert the prepared training frame to an H2O frame, then drop the pandas
# copy to free memory.
htrain = h2o.H2OFrame(ptrain)
del ptrain

In [60]:
# Mark AIRLINE and the one-hot weather flag columns as categorical.
if "AIRLINE" in htrain.columns:
    htrain["AIRLINE"] = htrain["AIRLINE"].asfactor()

    # Sky-condition and thunderstorm codes that have a flag column per side.
    weather_flag_codes = [
        "CLR", "OVC", "FEW", "SCT", "BKN", "VV ",
        "-TSRA", "TS", "+TSRA", "VCTS", "TSRA", "TSGSRA", "-TSGRRA",
        "-TSDZ", "-TSRAGS", "TSRAGR", "+TSRAGR", "-TSRAGR", "+TSRAGS",
        "-VCTSRA", "+TSGRRA", "VCTSRA", "+TSPL",
    ]
    # Expands to "<code>_departure", "<code>_arrival" for every code, in list order.
    weather_flag_columns = [
        f"{code}_{side}"
        for code in weather_flag_codes
        for side in ("departure", "arrival")
    ]
    htrain[weather_flag_columns] = htrain[weather_flag_columns].asfactor()
htrain

In [61]:
# Target column, and every other column except the row id as predictors.
y = "ARRIVAL_DELAY"
X = [column for column in htrain.columns if column not in (y, "id")]
print(y)
print(X)

# grid search

sucks

In [None]:
# Split into train / validation / hold-out parts: ratios 0.4 and 0.3, the rest goes to `test`.
train, valid, test = htrain.split_frame(ratios=[0.4, 0.3], seed=35)

---
# ntrees

The default is 50.
Results looked promising, but actually trying `ntrees = 500` badly overfit the data. No dice.

we stick with `ntrees = 150`

In [None]:
# Hyper-parameter grid: only the number of trees is varied.
# Other parameters that were considered and left disabled:
#     'learn_rate': [0.01, 0.1],
#     'max_depth': [3, 5, 9],
#     'sample_rate': [0.8, 1.0],
#     'col_sample_rate': [0.2, 0.5, 1.0],
#     'skip_drop': [0, 0.5, 1.0],
xgb_params_ntrees = {'ntrees': [30, 50, 100, 200, 300, 400, 500]}

# Train and validate a cartesian grid of XGBoost models
xgb_grid_ntrees = H2OGridSearch(
    model=H2OXGBoostEstimator,
    grid_id='xgb_grid_ntrees',
    hyper_params=xgb_params_ntrees,
)
xgb_grid_ntrees.train(x=X, y=y, training_frame=train, validation_frame=valid, seed=35)


In [None]:
# Grid results ordered by validation RMSE, best (lowest) first.
xgb_grid_ntrees_perf = xgb_grid_ntrees.get_grid(
    sort_by='RMSE',
    decreasing=False,
)
xgb_grid_ntrees_perf

In [None]:
# RMSE of every grid model on the hold-out `test` split.
for position, grid_model in enumerate(xgb_grid_ntrees_perf.models):
    test_rmse = grid_model.model_performance(test).rmse()
    print(f' #{position} rmse: {test_rmse}')

---
## skipdrop

Did absolutely nothing.

probably because we did not set `dart` as the booster haha

In [None]:
# Now let's evaluate the model performance on a test set
# NOTE(review): `xgb_grid_skipdrop_perf` is not defined in any visible cell; the
# cells that built the skip_drop grid appear to have been removed.
for i, m in enumerate(xgb_grid_skipdrop_perf.models):
    print(f' #{i} rmse: {m.model_performance(test).rmse()}')

---
## max depth

In [None]:
# Hyper-parameter grid: only the maximum tree depth is varied.
xgb_params_maxd = {'max_depth': [3, 5, 9]}

# Train and validate a cartesian grid of XGBoost models
xgb_grid_maxd = H2OGridSearch(
    model=H2OXGBoostEstimator,
    grid_id='xgb_grid_maxd',
    hyper_params=xgb_params_maxd,
)
xgb_grid_maxd.train(x=X, y=y, training_frame=train, validation_frame=valid, seed=35)

In [None]:
# Grid results ordered by validation RMSE, best (lowest) first.
xgb_grid_maxd_perf = xgb_grid_maxd.get_grid(
    sort_by='RMSE',
    decreasing=False,
)
xgb_grid_maxd_perf

In [None]:
# RMSE of every grid model on the hold-out `test` split.
for position, grid_model in enumerate(xgb_grid_maxd_perf.models):
    test_rmse = grid_model.model_performance(test).rmse()
    print(f' #{position} rmse: {test_rmse}')

---
# model training

In [None]:
# Split with ratio 0.8: `train` gets that share, `valid` the remainder.
train, valid = htrain.split_frame(ratios=[0.8], seed=35)

In [None]:
%%time

# Build and train the model:
# XGBoost with the DART booster, trained on the full `htrain` frame.
# NOTE(review): no validation_frame is passed, so the stopping_* settings
# presumably act on training metrics — confirm this is intended.
flight_xgb3 = H2OXGBoostEstimator(
    booster='dart',
    normalize_type="tree",
    seed=35,
    ntrees = 200,
    stopping_metric    = "RMSE",
    stopping_rounds    = 100,
    stopping_tolerance = 0.01,
)
flight_xgb3.train(
    x=X,
    y=y,
    training_frame=htrain
)

In [None]:
# Save the trained model; force=True overwrites an existing file.
h2o.save_model(model=flight_xgb3, path="", force=True)

In [None]:
# NOTE(review): `valid` is a split of `htrain`, and the model was trained on all
# of `htrain`, so these metrics are computed on rows the model has seen.
flight_xgb3.model_performance(valid)

# Predicting

In [None]:
# Convert the prepared test set to an H2O frame and make AIRLINE categorical.
# NOTE(review): the training frame also converts the weather flag columns with
# asfactor(); that is not repeated here — confirm the column types still match.
htest = h2o.H2OFrame(ptest)
htest["AIRLINE"] = htest["AIRLINE"].asfactor()
htest

In [None]:
# Predict the arrival delay for every test row.
pred = flight_xgb3.predict(htest)
pred

In [None]:
# Pair each test id with its predicted delay and write the submission ordered by id.
pred_df = pred.as_data_frame()
kaggle = (
    pd.DataFrame({"id": ptest["id"]})
    .assign(ARRIVAL_DELAY=pred_df["predict"])
    .sort_values(by="id")
)
kaggle.to_csv("xgb6_emiel_new_features.csv", index=False)

In [None]:
# Compare summary statistics of the training target with those of the predictions.
# `flights_train` has already been deleted by the memory clean-up cell at this
# point, so the target is read back from the H2O training frame instead.
train_delay = htrain["ARRIVAL_DELAY"].as_data_frame()["ARRIVAL_DELAY"]
compare = train_delay.describe().to_frame()
compare["PREDICTIONS"] = pred_df["predict"].describe()
# Object dtype so the count row can hold a placeholder string.
compare["DIFF"] = (compare["PREDICTIONS"] - compare["ARRIVAL_DELAY"]).astype(object)
# A difference of row counts is meaningless; .loc avoids chained assignment.
compare.loc["count", "DIFF"] = "-"
compare

--- 
# Is CatBoost the coolest thing ever?

In [None]:
# who knows
# also yes

import catboost as cb

In [None]:
# CatBoost baseline with RMSE loss.
# NOTE(review): X_train_val, y_train_val and X_test are not defined in any
# visible cell; this cell cannot run as-is on a fresh kernel.
cbr = cb.CatBoostRegressor(loss_function="RMSE")
cbr.fit(X_train_val, y_train_val)
# Predict once (the former duplicate predict call discarded its result).
eval_pred = cbr.predict(X_test)
# NOTE(review): index_label='id' writes the positional index 0..n-1 as the id
# column — confirm that matches the ids expected in the submission.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY']).to_csv("flight_result.csv", index_label='id')