## Import libraries

In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


## Loading and normalizing data

In [2]:
# Load the pre-shuffled training and test sets from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_shuffled_7features.csv', './test_shuffled_7features.csv')
)

In [3]:
# Separate the feature columns from the regression target in both data sets.
# NOTE(review): features are taken as "every column but the last" while the
# target is picked by name, so this presumes FUEL_CONSUMPTION is the last
# column of both CSV files — confirm against the data.
X, y = train.iloc[:, :-1], train['FUEL_CONSUMPTION']
X_test, y_test = test.iloc[:, :-1], test['FUEL_CONSUMPTION']


In [4]:
# normalize train data
# Fit a min-max scaler on the training features and scale them in one step.
# The input is read from `train` instead of the previously assigned `X`, so
# re-running this cell never rescales values that were already scaled.
# NOTE(review): the scaler is fitted before the train/validation split further
# down, so validation rows contribute to the fitted min/max.
norm = MinMaxScaler()
X = norm.fit_transform(train.iloc[:, :-1])

In [5]:
# normalize test data
# Apply the scaler fitted on the training features to the test features.
# Reading from `test` (not from `X_test`) keeps this cell idempotent: running it
# twice no longer scales an already-scaled array.
X_test = norm.transform(test.iloc[:, :-1])

In [6]:
# Hold out 30% of the scaled training rows as a validation set; the fixed
# random_state makes the partition repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Linear regression as baseline

In [7]:
# Baseline: plain linear regression fitted on the training split and scored
# (RMSE) on the validation split. LinearRegression is imported in the import
# cell at the top of the notebook together with every other dependency.
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred = linear.predict(X_valid)
print("Linear Regression RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred=y_pred)))


Linear Regression RMSE 11.186266994830012


In [8]:
# Score the already-fitted baseline on the held-out test set (RMSE).
y_pred = linear.predict(X_test)
print(
    "Linear Regression RMSE",
    np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
)

Linear Regression RMSE 11.186438017490415


## Model & pipeline initialization

In [9]:
# Initialize the regressors that train_models / test_data_result operate on.
# The hyperparameter values are kept exactly as found; they were presumably
# produced by an automated search — TODO confirm provenance.

# With a single neighbour, weights='distance' has no effect on the prediction
# (the weighted mean of one value is that value).
knn = KNeighborsRegressor(n_neighbors=1, weights='distance')

# warm_start is intentionally left at its default (False): with warm_start=True
# a second fit() would start from the previous coefficients, so re-running the
# training cell would not reproduce the first run.
sgd = SGDRegressor(alpha=5.1145874299878906e-05, average=True,
               eta0=1.3342174605223877e-05, l1_ratio=0.3883764520954751,
               max_iter=512, penalty='elasticnet', power_t=0.06294035436457378,
               random_state=42, tol=4.672384942590959e-05)

svr = SVR(C=3133.304218632794, cache_size=2008.7369791666667,
      coef0=0.4057352025775811, epsilon=0.005909999318357989,
      gamma=0.0009196972638709646, kernel='poly', tol=0.002555869012110542,
      verbose=0)

linear_svr = LinearSVR(C=28731.940263285174, dual=False, epsilon=0.7872520084668472,
            loss='squared_epsilon_insensitive', random_state=42,
            tol=2.2381728154443955e-05)

dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4655,
                        min_samples_leaf=5, min_samples_split=12,
                        random_state=42)

# warm_start is intentionally left at its default (False): with warm_start=True
# and an unchanged n_estimators, a second fit() keeps the existing trees instead
# of fitting a new forest, so re-training on different data would be a no-op.
et = ExtraTreesRegressor(criterion='friedman_mse', max_features=0.857137620215694,
                      n_estimators=512, n_jobs=1, random_state=42)

rf = RandomForestRegressor(bootstrap=False, max_features=0.22177348265923788,
                        min_samples_split=10, n_estimators=512, n_jobs=1,
                        random_state=42)

# NOTE(review): per the scikit-learn docs, early_stopping, n_iter_no_change and
# learning_rate only take effect with the 'sgd'/'adam' solvers, so they are
# inert with solver='lbfgs'. They are kept to leave the configuration unchanged.
nn = MLPRegressor(alpha=0.1111, early_stopping=True, hidden_layer_sizes=(10, 70, 25),
             learning_rate='adaptive', max_iter=15000, n_iter_no_change=32,
             solver='lbfgs', random_state=42)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional parameter is the base learner in both old and new versions.
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=9),
                    learning_rate=0.39825318885251826, loss='square',
                    n_estimators=317, random_state=42)


In [10]:
def train_models(X_train, X_valid, y_train, y_valid):
    """Fit every module-level regressor and print its validation RMSE.

    Each model is fitted on ``(X_train, y_train)`` and then scored on
    ``(X_valid, y_valid)``. One line of the form ``"<label> RMSE <value>"`` is
    printed per model, in the order listed below. The models are fitted in
    place, so the module-level variables hold the fitted estimators afterwards.

    Parameters
    ----------
    X_train, y_train
        Features and targets used for fitting.
    X_valid, y_valid
        Features and targets used for scoring.

    Returns
    -------
    None
    """
    # (printed label, estimator) pairs; the order fixes both the fitting order
    # and the order of the printed lines.
    labelled_models = (
        ("KNN", knn),
        ("SGD", sgd),
        ("SVR", svr),
        ("Linear SVR", linear_svr),
        ("Extra Tree", et),
        ("Decision Tree", dt),
        ("Random Forest", rf),
        ("Neural Network", nn),
        ("Adaboost", ada),
    )

    for label, model in labelled_models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred=y_pred))
        print(label + " RMSE", rmse)


In [11]:
def test_data_result(X_test, y_test):
    """Print the RMSE of every module-level regressor on the given data.

    No fitting happens here: each model's ``predict`` is called directly, so the
    models must already have been fitted (e.g. by ``train_models``). One line of
    the form ``"<label> RMSE <value>"`` is printed per model, in the order
    listed below.

    Parameters
    ----------
    X_test
        Features to predict on.
    y_test
        True targets the predictions are compared with.

    Returns
    -------
    None
    """
    # (printed label, estimator) pairs; the order fixes the order of the output.
    labelled_models = (
        ("KNN", knn),
        ("SGD", sgd),
        ("SVR", svr),
        ("Linear SVR", linear_svr),
        ("Extra Tree", et),
        ("Decision Tree", dt),
        ("Random Forest", rf),
        ("Neural Network", nn),
        ("Adaboost", ada),
    )

    for label, model in labelled_models:
        y_pred = model.predict(X_test)
        rmse = np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))
        print(label + " RMSE", rmse)

## Cross validation

In [12]:
# n_fold = 5
# folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

In [13]:
# def cv_classical_models(X, y, model):
    
#     RMSE_scores = []
#     MAE_scores = []
#     R2_scores = []
#     AR2_scores = []
    
#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
#         print('Fold', fold_n, 'started at', time.ctime())
#         regr = 0
#         X_train, X_valid = X[train_index], X[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]
        
#         if model == 'ADA':
#             regr = ada
#             regr.fit(X_train, y_train)

#         if model == 'SVR':
#             regr = svr
#             regr.fit(X_train, y_train)
        
#         if model == 'RF':
#             regr = rf
#             regr.fit(X_train, y_train)

#         if model == 'DT':
#             regr = dt
#             regr.fit(X_train, y_train)

#         if model == 'NN':
#             regr = nn
#             regr.fit(X_train, y_train)

#         if model == 'KNN':
#             regr = knn
#             regr.fit(X_train, y_train)

#         if model == 'SGD':
#             regr = sgd
#             regr.fit(X_train, y_train)
        
#         if model == 'Linear SVR':
#             regr = linear_svr
#             regr.fit(X_train, y_train)

#         if model == 'ET':
#             regr = et
#             regr.fit(X_train, y_train)      
        
#         y_pred_valid = regr.predict(X_valid).reshape(-1,)
#         RMSE_score = np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred_valid))
#         MAE_score = mean_absolute_error(y_valid,y_pred_valid)
#         R2_score = r2_score(y_valid, y_pred_valid)
#         RMSE_scores.append(RMSE_score)
#         MAE_scores.append(MAE_score)
#         R2_scores.append(R2_score)

#         n = X.shape[0]
#         k = X.shape[1]
#         AR2_score = 1-((1-R2_score)*(n-1)/(n-k-1))
#         AR2_scores.append(AR2_score)

#     print(model,' -- CV RMSE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(RMSE_scores), np.std(RMSE_scores)))
#     print(model,' -- CV MAE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(MAE_scores), np.std(MAE_scores)))
#     print(model,' -- CV R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(R2_scores), np.std(R2_scores)))
#     print(model,' -- CV Adjusted R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(AR2_scores), np.std(AR2_scores)))

In [14]:
# cv_classical_models(X, y, 'KNN')
# cv_classical_models(X, y, 'Linear SVR')
# cv_classical_models(X, y, 'SVR')
# cv_classical_models(X, y, 'SGD')
# cv_classical_models(X, y, 'DT')
# cv_classical_models(X, y, 'ET')
# cv_classical_models(X, y, 'RF')
# cv_classical_models(X, y, 'NN')
# cv_classical_models(X, y, 'ADA')

## Results

In [15]:
# Fit all models on the training split and report their validation RMSE.
train_models(
    X_train=X_train,
    X_valid=X_valid,
    y_train=y_train,
    y_valid=y_valid,
)

KNN RMSE 8.815757323015635




SGD RMSE 12.218796920565973
SVR RMSE 11.747059841253327
Linear SVR RMSE 11.211898925931491
Extra Tree RMSE 1.5157794179298565
Decision Tree RMSE 2.0627827543621287
Random Forest RMSE 4.8068407916663825
Neural Network RMSE 6.04454723803436
Adaboost RMSE 2.295935828260476


In [16]:
# Report the RMSE of the fitted models on the held-out test set.
test_data_result(X_test=X_test, y_test=y_test)

KNN RMSE 8.046013731558524
SGD RMSE 12.326752958523212
SVR RMSE 11.880209855201374
Linear SVR RMSE 11.19949840416876
Extra Tree RMSE 1.251918556603486
Decision Tree RMSE 2.5990525831430706
Random Forest RMSE 3.7343668777795744
Neural Network RMSE 5.746746270521942
Adaboost RMSE 1.2561150367808531
