In [None]:
# Imports 

# Locally created packages
import processing as prc
import configs as cfg

# Global python packages
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
import matplotlib
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [None]:
# Load the dataset from the CSV file at cfg.path, using its first column as the index
df = pd.read_csv(filepath_or_buffer=cfg.path, index_col=0)

In [None]:
# Name of the column indicating whether a landing is friction limited
fricLim = cfg.fricLim

# Name of the column holding the friction coefficient; copy it to a fixed response name
mu = cfg.mu
df['mu_allRaw'] = df[mu]

# Build a new index from the original index (kept as 'date') and the aerodrome,
# to serve as a unique key per landing.
# NOTE: `df` is rebound here, so this cell should be run once per load of the data.
df['date'] = df.index
df['key'] = df.index.astype(str) + '-' + df['aerodrome'].astype(str)
df = df.set_index('key')

# Remove possibly duplicated rows, keeping the first occurrence of each key
df_notoverlap = df[~df.index.duplicated(keep='first')]
print('Dropped {}'.format(len(df)-len(df_notoverlap)) + ' overlapping flight landings')

# Remove friction coefficients at or below 0.05, as these are improbable and have a
# high probability of being wrong measurements (NaN values pass this filter and are
# dropped further down)
df_notoverlap2 = df_notoverlap.loc[~(df_notoverlap['mu_allRaw'] <= 0.05), :]

# Remove landings which are not friction limited. The column is looked up through the
# configured name instead of a hard-coded attribute, so cfg.fricLim is actually honoured.
df_notoverlap2 = df_notoverlap2.loc[df_notoverlap2[fricLim] > 0, :]

# Drop unusable/unnecessary columns and make the categorical codes integers
df_new = df_notoverlap2.drop(cfg.unusable_columns, axis=1)
df_new['aerodrome'] = df_new['aerodrome'].astype(int)
df_new['direction'] = df_new['direction'].astype(int)

# Process wind variables with the project helper (per the original note it uses
# 'direction' to get wind in/out from a right angle), then drop 'direction'
df_new = prc.wind_pros(df_new)
df_new = df_new.drop('direction', axis=1)

# Drop rows with NaN values in the response variable
df_new = df_new.dropna(axis=0, how='any', subset=['mu_allRaw'])

# Create the explanatory matrix: remove the response variable and the unnecessary
# wind variables, then add absolute-value versions of 'tmp' and 'rwy'
X = df_new.drop(['mu_allRaw', 'across02_in', 'along02_in', 'dir10_in', 'ac_calc02_in'], axis=1)
X['tmp_abs'] = X['tmp'].abs()
X['rwy_abs'] = X['rwy'].abs()


# Set the response to be the friction coefficient
y = df_new['mu_allRaw']

# Set random state
state = cfg.state

In [None]:
# Random state taken from the project configuration
state = cfg.state

# 10-fold cross-validation splitter; rows are shuffled with the configured seed
skf = KFold(n_splits=10, shuffle=True, random_state=state)

# Per-fold stores, each keyed by the fold counter `num`
results_mse, results_rmse, results_mae = {}, {}, {}
results_tce = {}
results_params = {}
feature_important = {}

# One dict per category difference (0..4 and -1..-4), filled with per-fold counts
results = {diff: {} for diff in (0, 1, 2, 3, 4, -1, -2, -3, -4)}

# Fold counter, incremented at the end of every cross-validation iteration
num = 0

# Not referenced by the cells visible here; kept as-is in case other cells rely on them
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

In [None]:
# Perform 10-fold crossvalidation of XGBoost regression model

# Search space for the randomized hyper-parameter search (identical for every fold,
# so it is built once instead of once per iteration)
parameters = {'n_estimators': stats.randint(50, 250),
              'learning_rate': stats.uniform(0.01, 0.2),
              'subsample': stats.uniform(0.3, 0.7),
              'min_split_loss': stats.uniform(0, 0.4),
              'reg_lambda': stats.uniform(0, 10)
             }

# Bin edges (right-inclusive) and labels used to map a friction coefficient to a
# braking action category; shared by the observed and predicted values
category_bins = [-1, 0.05, 0.075, 0.1, 0.15, 0.2, 100]
category_labels = [0, 1, 2, 3, 4, 5]

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    # KFold yields positions, and y carries a non-integer index, so select by
    # position explicitly (integer keys on a label index are deprecated in pandas 2.x)
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print('Cv with state: ' + str(state))

    # Set up XGBoost model
    xgb_model = xgb.XGBRegressor(objective = 'reg:squarederror',seed = state, n_jobs = -1, importance_type = 'total_gain')

    print('Training model')

    # Set up randomized search cross validation (3 inner folds, 20 sampled candidates)
    clf_r = RandomizedSearchCV(xgb_model, parameters, n_jobs=-1,
                       cv=3, scoring='neg_mean_squared_error',
                       verbose=10, refit=True, random_state = state, n_iter=20)

    # Fit the search and predict with the refitted best estimator
    clf_r.fit(X_train, y_train)
    y_pred = clf_r.predict(X_test)

    # Bin the friction coefficient to braking action categories
    y_test_cat = pd.cut(x = y_test, bins = category_bins, right = True, labels = category_labels)
    y_pred_cat = pd.cut(x = pd.Series(y_pred), bins = category_bins, right = True, labels = category_labels)

    # Align both series on a fresh 0..n-1 index so they subtract element by element
    y_test_cat = y_test_cat.reset_index(drop = True).astype('int')
    y_pred_cat = y_pred_cat.reset_index(drop = True).astype('int')

    y_diff = y_test_cat-y_pred_cat
    y_diff_abs = abs(y_diff)

    # Fold metrics, each computed once. RMSE is derived from MSE rather than via the
    # `squared=False` argument, which newer scikit-learn releases no longer accept.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    tce = y_diff_abs.mean()

    # Print results
    print(' ')
    print('Test RMSE:',round(rmse,4))
    print('Test mean value:',round(y_test.mean(),4))
    print('Test RMSE / mean value:',round(rmse/y_test.mean(),4))
    print(' ')
    print('Test class error:',round(tce,4))

    # Count of each category difference in this fold. setdefault keeps the loop from
    # failing on a difference that was not pre-registered in `results` (e.g. 5).
    diff_counts = y_diff.value_counts()
    for i, count in diff_counts.items():
        print('{} : {:.0f} : {:.1f}%'.format(i, count, 100*count/len(y_test)))
        results.setdefault(i, {})[num] = count

    print('Mean Squared Error: ' + str(mse))

    best_parameters = clf_r.best_params_
    print('Best parameters: ' + str(best_parameters))

    # Store this fold's metrics, chosen parameters and feature importances
    results_mse[num] = mse
    results_mae[num] = mae
    results_rmse[num] = rmse
    results_tce[num] = tce
    results_params[num] = best_parameters
    feature_important[num] = clf_r.best_estimator_.feature_importances_

    num += 1

In [None]:
# Print results from all cross validations

def _fold_mean(per_fold):
    """Average the values of a {fold: value} dict; NaN if no fold was recorded."""
    values = list(per_fold.values())
    return sum(values) / len(values) if values else float('nan')


# Category-difference counts are totals over all folds. The original nine differences
# are printed first, in their original order, followed by any other difference that
# happens to be present in `results`.
listed_diffs = (0, 1, 2, 3, 4, -1, -2, -3, -4)
extra_diffs = sorted(d for d in results if d not in listed_diffs)
for diff in list(listed_diffs) + extra_diffs:
    print('{}: '.format(diff), sum(results[diff].values()))

# Error metrics are averaged over the folds: summing per-fold RMSE/MSE/MAE/class error
# would inflate the reported value by the number of folds.
print('rmse: ', _fold_mean(results_rmse))
print('mse: ', _fold_mean(results_mse))
print('mae: ', _fold_mean(results_mae))
print('Test class error: ', _fold_mean(results_tce))