## Import libraries

In [137]:
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


## Loading and normalizing data

In [138]:
# Read the pre-shuffled train and test files (train first, then test).
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('./train_shuffled_16features.csv', './test_shuffled_16features.csv')
]

In [139]:
# split features / target for both the train and the test file
# The positional slice below treats the LAST column as the target, so check that
# assumption explicitly instead of silently dropping a feature column.
assert train.columns[-1] == 'FUEL_CONSUMPTION', "FUEL_CONSUMPTION must be the last column of train"
assert test.columns[-1] == 'FUEL_CONSUMPTION', "FUEL_CONSUMPTION must be the last column of test"
# The scaler and every model are fit on train and applied to test by column
# position, so both files must share the same column layout.
assert list(train.columns) == list(test.columns), "train and test columns differ"

X = train.iloc[:, :-1]
y = train['FUEL_CONSUMPTION']

X_test = test.iloc[:, :-1]
y_test = test['FUEL_CONSUMPTION']


In [140]:
# normalize train data
# Fit the min-max scaler on the raw training features and transform from the raw
# frame (not from X itself), so re-running this cell never rescales values that
# were already normalized.
# NOTE(review): the scaler is fit before the train/validation split, so the
# validation rows contribute to the fitted min/max -- confirm this is acceptable.
norm = MinMaxScaler().fit(train.iloc[:, :-1])
X = norm.transform(train.iloc[:, :-1])

In [141]:
# normalize test data
# Reuse the scaler fitted on the training features. Transform from the raw test
# frame rather than from X_test itself so the cell is idempotent on re-run.
X_test = norm.transform(test.iloc[:, :-1])

In [142]:
# hold out 30% of the (normalized) training rows as a validation set; fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Linear regression as baseline

In [143]:
# linear regression as baseline: fit on the training split, report validation RMSE
# (LinearRegression is imported in the notebook's top import cell.)
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred = linear.predict(X_valid)
print("Linear Regression RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred=y_pred)))


Linear Regression RMSE 10.517711098444833


In [144]:
# baseline result on the separate test file (same fitted linear model)
y_pred = linear.predict(X_test)
baseline_test_mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Linear Regression RMSE", np.sqrt(baseline_test_mse))

Linear Regression RMSE 12.953305404984285


## Model & pipeline initialization

In [145]:
# initialize ML models with the best single model in autoML 1h training
#
# warm_start is deliberately left at its default on every estimator: these are
# module-level objects that get re-fitted whenever the training cell is re-run
# (or once per fold in cross validation), and a warm-started estimator reuses
# state from its previous fit instead of starting from scratch.

knn = KNeighborsRegressor(n_neighbors=2, weights='distance')

sgd = SGDRegressor(alpha=5.971194851970654e-05, epsilon=0.046386514888991696,
               eta0=0.015045576741766471, loss='squared_epsilon_insensitive',
               max_iter=128, penalty='l1', power_t=0.11270288932176514,
               random_state=42, tol=3.883427782042011e-05)

svr = SVR(C=40.141231303731544, cache_size=2021.4479166666667,
      epsilon=0.007539926872636399, gamma=0.2008913139792154,
      tol=0.01039576978136512, verbose=0)

linear_svr = LinearSVR(C=17.287648490892263, dual=False, epsilon=0.12424593618930425,
            loss='squared_epsilon_insensitive', random_state=42,
            tol=0.00021914796972665943)

dt = DecisionTreeRegressor(max_depth=13, min_samples_split=6, random_state=42)

et = ExtraTreesRegressor(max_features=0.9797793053686011, min_samples_split=4,
                      n_estimators=512, n_jobs=1, random_state=42)

rf = RandomForestRegressor(max_features=1.0, n_estimators=512, n_jobs=-1,
                        random_state=42)

nn = MLPRegressor(alpha=0.00014351289686054372, beta_1=0.999, beta_2=0.9,
               early_stopping=True, hidden_layer_sizes=(96,),
               learning_rate_init=0.006286073628770789, max_iter=64,
               n_iter_no_change=32, random_state=42, verbose=0)

# The base tree is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the first positional
# argument is the base estimator under either name.
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10),
                    learning_rate=0.2663713456553139, loss='exponential',
                    n_estimators=318, random_state=42)


In [146]:
def train_models(X_train, X_valid, y_train, y_valid):
    """Fit every module-level regressor and print its validation RMSE.

    Each model is fitted on (X_train, y_train), used to predict X_valid, and
    scored against y_valid. Results are printed one line per model, in the
    order listed below; nothing is returned.
    """
    # (printed label, estimator) pairs, in fitting/reporting order
    labelled_models = (
        ("KNN", knn),
        ("SGD", sgd),
        ("SVR", svr),
        ("Linear SVR", linear_svr),
        ("Extra Tree", et),
        ("Decision Tree", dt),
        ("Random Forest", rf),
        ("Neural Network", nn),
        ("Adaboost", ada),
    )

    for label, model in labelled_models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred=y_pred))
        print(f"{label} RMSE", rmse)


In [147]:
def test_data_result(X_test, y_test):
    """Print the RMSE of every module-level regressor on the given test data.

    The models are used as they are (no fitting happens here): each one
    predicts X_test and is scored against y_test. One line is printed per
    model, in the order listed below; nothing is returned.
    """
    # (printed label, estimator) pairs, in reporting order
    labelled_models = (
        ("KNN", knn),
        ("SGD", sgd),
        ("SVR", svr),
        ("Linear SVR", linear_svr),
        ("Extra Tree", et),
        ("Decision Tree", dt),
        ("Random Forest", rf),
        ("Neural Network", nn),
        ("Adaboost", ada),
    )

    for label, model in labelled_models:
        y_pred = model.predict(X_test)
        rmse = np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))
        print(f"{label} RMSE", rmse)

## Cross validation

In [148]:
# n_fold = 5
# folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

In [149]:
# def cv_classical_models(X, y, model):
    
#     RMSE_scores = []
#     MAE_scores = []
#     R2_scores = []
#     AR2_scores = []
    
#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
#         print('Fold', fold_n, 'started at', time.ctime())
#         regr = 0
#         X_train, X_valid = X[train_index], X[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]
        
#         if model == 'ADA':
#             regr = ada
#             regr.fit(X_train, y_train)

#         if model == 'SVR':
#             regr = svr
#             regr.fit(X_train, y_train)
        
#         if model == 'RF':
#             regr = rf
#             regr.fit(X_train, y_train)

#         if model == 'DT':
#             regr = dt
#             regr.fit(X_train, y_train)

#         if model == 'NN':
#             regr = nn
#             regr.fit(X_train, y_train)

#         if model == 'KNN':
#             regr = knn
#             regr.fit(X_train, y_train)

#         if model == 'SGD':
#             regr = sgd
#             regr.fit(X_train, y_train)
        
#         if model == 'Linear SVR':
#             regr = linear_svr
#             regr.fit(X_train, y_train)

#         if model == 'ET':
#             regr = et
#             regr.fit(X_train, y_train)      
        
#         y_pred_valid = regr.predict(X_valid).reshape(-1,)
#         RMSE_score = np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred_valid))
#         MAE_score = mean_absolute_error(y_valid,y_pred_valid)
#         R2_score = r2_score(y_valid, y_pred_valid)
#         RMSE_scores.append(RMSE_score)
#         MAE_scores.append(MAE_score)
#         R2_scores.append(R2_score)

#         n = X.shape[0]
#         k = X.shape[1]
#         AR2_score = 1-((1-R2_score)*(n-1)/(n-k-1))
#         AR2_scores.append(AR2_score)

#     print(model,' -- CV RMSE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(RMSE_scores), np.std(RMSE_scores)))
#     print(model,' -- CV MAE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(MAE_scores), np.std(MAE_scores)))
#     print(model,' -- CV R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(R2_scores), np.std(R2_scores)))
#     print(model,' -- CV Adjusted R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(AR2_scores), np.std(AR2_scores)))

In [150]:
# cv_classical_models(X, y, 'KNN')
# cv_classical_models(X, y, 'Linear SVR')
# cv_classical_models(X, y, 'SVR')
# cv_classical_models(X, y, 'SGD')
# cv_classical_models(X, y, 'DT')
# cv_classical_models(X, y, 'ET')
# cv_classical_models(X, y, 'RF')
# cv_classical_models(X, y, 'NN')
# cv_classical_models(X, y, 'ADA')

## Results

In [151]:
# fit all models on the training split and print their validation RMSE
train_models(X_train=X_train, X_valid=X_valid, y_train=y_train, y_valid=y_valid)

KNN RMSE 8.672881457237327
SGD RMSE 11.128287880438053
SVR RMSE 10.292517577092408
Linear SVR RMSE 10.518423373392075
Extra Tree RMSE 0.8351641015014807
Decision Tree RMSE 2.3105245627191198
Random Forest RMSE 1.9024522866466844




Neural Network RMSE 7.67641036172388
Adaboost RMSE 1.9941563001951765


In [152]:
# score the already-fitted models on the separate test file
test_data_result(X_test=X_test, y_test=y_test)

KNN RMSE 13.208204707403
SGD RMSE 13.383710559411073
SVR RMSE 12.811836842651838
Linear SVR RMSE 12.95094780841463
Extra Tree RMSE 3.215047198547246
Decision Tree RMSE 4.908793090422475
Random Forest RMSE 3.2667461322841067
Neural Network RMSE 10.171263172368052
Adaboost RMSE 3.4083184573829772


## Permutation feature importance

In [153]:
# nn
# (permutation_importance is imported in the notebook's top import cell.)
r = permutation_importance(nn, X_valid, y_valid,
                            n_repeats=30,
                            random_state=42)

# Features are visited from highest to lowest mean importance. A feature is
# printed only if its mean importance exceeds zero by more than two standard
# deviations. Column i of X_valid corresponds to train.columns[i] because the
# features were taken as train.iloc[:, :-1].
for i in r.importances_mean.argsort()[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    0.949 +/- 0.018
FUEL_LEVEL 0.039 +/- 0.007
ENGINE_LOAD 0.036 +/- 0.005
MAF      0.023 +/- 0.004
FUEL_RATE 0.020 +/- 0.004
THROTTLE_POS 0.005 +/- 0.001
INTAKE_MANIFOLD_PRESSURE 0.004 +/- 0.002


In [154]:
# decision tree
r = permutation_importance(dt, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    4.748 +/- 0.527
THROTTLE_POS 0.975 +/- 0.331
MAF      0.264 +/- 0.020
FUEL_RATE 0.172 +/- 0.012
ENGINE_LOAD 0.021 +/- 0.002


In [155]:
# extra trees
r = permutation_importance(et, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    4.209 +/- 0.323
FUEL_RATE 0.338 +/- 0.023
MAF      0.328 +/- 0.021
ENGINE_RPM 0.029 +/- 0.004
ENGINE_LOAD 0.017 +/- 0.002
THROTTLE_POS 0.006 +/- 0.001
TIMING_ADVANCE 0.003 +/- 0.001
FUEL_LEVEL 0.002 +/- 0.001
ENGINE_RUNTIME 0.000 +/- 0.000


In [156]:
# random forest
r = permutation_importance(rf, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    4.347 +/- 0.379
FUEL_RATE 0.211 +/- 0.021
MAF      0.208 +/- 0.020
ENGINE_RPM 0.037 +/- 0.006
THROTTLE_POS 0.033 +/- 0.009
ENGINE_LOAD 0.019 +/- 0.003


In [157]:
# adaboost
r = permutation_importance(ada, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    4.883 +/- 0.655
FUEL_RATE 0.196 +/- 0.018
MAF      0.188 +/- 0.018
THROTTLE_POS 0.003 +/- 0.001
ENGINE_LOAD 0.000 +/- 0.000


In [158]:
# SGD
r = permutation_importance(sgd, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    0.726 +/- 0.021
MAF      0.064 +/- 0.006
FUEL_RATE 0.064 +/- 0.006
ENGINE_LOAD 0.031 +/- 0.004
ENGINE_RPM 0.026 +/- 0.003
TIMING_ADVANCE 0.010 +/- 0.002
FUEL_LEVEL 0.009 +/- 0.003
ENGINE_RUNTIME 0.006 +/- 0.002
THROTTLE_POS 0.004 +/- 0.001
AIR_INTAKE_TEMP 0.004 +/- 0.002
AMBIENT_AIR_TEMP 0.002 +/- 0.001


In [159]:
# KNN
r = permutation_importance(knn, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    0.640 +/- 0.055
ENGINE_LOAD 0.279 +/- 0.056
ENGINE_RPM 0.268 +/- 0.061
FUEL_LEVEL 0.266 +/- 0.041
ENGINE_RUNTIME 0.185 +/- 0.063
INTAKE_MANIFOLD_PRESSURE 0.135 +/- 0.041
MAF      0.121 +/- 0.053
FUEL_RATE 0.121 +/- 0.053
AMBIENT_AIR_TEMP 0.097 +/- 0.046
AIR_INTAKE_TEMP 0.090 +/- 0.033


In [160]:
# SVR
r = permutation_importance(svr, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    0.540 +/- 0.012
FUEL_RATE 0.057 +/- 0.012
MAF      0.057 +/- 0.012
TIMING_ADVANCE 0.011 +/- 0.001
FUEL_LEVEL 0.009 +/- 0.002
AIR_INTAKE_TEMP 0.003 +/- 0.001
ENGINE_RPM 0.002 +/- 0.000
INTAKE_MANIFOLD_PRESSURE 0.001 +/- 0.000
BAROMETRIC_PRESSURE(KPA) 0.001 +/- 0.000


In [161]:
# Linear SVR
r = permutation_importance(linear_svr, X_valid, y_valid, n_repeats=30, random_state=42)

# Report features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in np.argsort(r.importances_mean)[::-1]:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{train.columns[i]:<8} {mean_i:.3f} +/- {std_i:.3f}")

SPEED    0.739 +/- 0.021
MAF      0.072 +/- 0.007
FUEL_RATE 0.072 +/- 0.007
ENGINE_LOAD 0.028 +/- 0.004
ENGINE_RPM 0.024 +/- 0.003
TIMING_ADVANCE 0.011 +/- 0.002
FUEL_LEVEL 0.006 +/- 0.002
THROTTLE_POS 0.006 +/- 0.002
ENGINE_RUNTIME 0.006 +/- 0.002
