## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import time


## Loading and normalizing data

In [2]:
train = pd.read_csv('./train_shuffled_v2.csv')
test = pd.read_csv('./test_shuffled_v2.csv')

In [3]:
# split test/train for train data
X = train.iloc[:, :-1]
y = train['FUEL_CONSUMPTION']

X_test = test.iloc[:, :-1]
y_test = test['FUEL_CONSUMPTION']


In [4]:
# normalize train data
from sklearn.preprocessing import MinMaxScaler
norm = MinMaxScaler().fit(train.iloc[:, :-1])
X = norm.transform(X)

In [5]:
# normalize test data
X_test = norm.transform(X_test)

In [6]:
# split train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

## Linear regression as base line

In [7]:
# linear regression as base line
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred = linear.predict(X_valid)
print("Linear Regression RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))


Linear Regression RMSE 5.217594647364855


In [8]:
# base line test set result
y_pred = linear.predict(X_test)
print("Linear Regression RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))

Linear Regression RMSE 7.860535820020106


## Model & pipeline initialization

In [9]:
# initialize ML models

knn = KNeighborsRegressor(n_neighbors=2, weights='distance')

sgd = SGDRegressor(alpha=0.01, early_stopping=True, epsilon=0.19, l1_ratio=0.98,
             learning_rate='adaptive', loss='squared_epsilon_insensitive',
             penalty='elasticnet', random_state=42, warm_start=True)

svr = SVR(C=4.5, cache_size=1600, coef0=0.6, epsilon=0.9, gamma=0.99, kernel='poly')

linear_svr = LinearSVR(C=292, dual=False, epsilon=0.06, loss='squared_epsilon_insensitive',
          random_state=42)
            
dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=30,
                      min_samples_split=12, random_state=42, splitter='random')

et = ExtraTreesRegressor(criterion='friedman_mse', max_depth=67, max_features=0.73,
                    min_samples_split=4, n_estimators=600, random_state=42)
                      
rf = RandomForestRegressor(bootstrap=False, criterion='poisson', max_depth=74,
                      max_features=0.3, min_samples_split=3, n_estimators=500,
                      random_state=42, warm_start=True)

nn = MLPRegressor(alpha=0.1111, early_stopping=True,
             hidden_layer_sizes=(10, 70, 25), max_iter=15000,
             n_iter_no_change=32, random_state=42, solver='lbfgs')

ada = AdaBoostRegressor(base_estimator=dt,
                  learning_rate=0.36, loss='exponential', n_estimators=400,
                  random_state=42)


In [10]:
def train_models(X_train, X_valid, y_train, y_valid):

    # # KNN
    # knn.fit(X_train, y_train)
    # y_pred = knn.predict(X_valid)
    # print("KNN RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("KNN MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))
    
    # # SGD
    # sgd.fit(X_train, y_train)
    # y_pred = sgd.predict(X_valid)
    # print("SGD RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("SGD MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # # SVR
    # svr.fit(X_train, y_train)
    # y_pred = svr.predict(X_valid)
    # print("SVR RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("SVR MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # # Linear SVR
    # linear_svr.fit(X_train, y_train)
    # y_pred = linear_svr.predict(X_valid)
    # print("Linear SVR RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("Linear SVR MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # # et
    # et.fit(X_train, y_train)
    # y_pred = et.predict(X_valid)
    # print("Extra Tree RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("KExtra TreeNN MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # # Decision Tree
    # dt.fit(X_train, y_train)
    # y_pred = dt.predict(X_valid)
    # print("Decision Tree RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("Decision Tree MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # # Random Forest
    # rf.fit(X_train, y_train)
    # y_pred = rf.predict(X_valid)
    # print("Random Forest RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("Random Forest MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))

    # NN
    nn.fit(X_train, y_train)
    y_pred = nn.predict(X_valid)
    print("Neural Network RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    print("Neural Network MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))
    
    # # Adaboost
    # ada.fit(X_train, y_train)
    # y_pred = ada.predict(X_valid)
    # print("Adaboost RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred)))
    # print("Adaboost MAPE", metrics.mean_absolute_percentage_error(y_true=y_valid, y_pred= y_pred))


In [11]:
def test_data_result(X_test, y_test):

    # # KNN
    # y_pred = knn.predict(X_test)
    # print("KNN RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("KNN MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # SGD
    # y_pred = sgd.predict(X_test)
    # print("SGD RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("SGD MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # SVR
    # y_pred = svr.predict(X_test)
    # print("SVR RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("SVR MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # Linear SVR
    # y_pred = linear_svr.predict(X_test)
    # print("Linear SVR RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("Linear SVR MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # Extra Tree
    # y_pred = et.predict(X_test)
    # print("Extra Tree RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("Extra Tree MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # Decision Tree
    # y_pred = dt.predict(X_test)
    # print("Decision Tree RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("Decision Tree MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # Random Forest
    # y_pred = rf.predict(X_test)
    # print("Random Forest RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("Random Forest MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # NN
    y_pred = nn.predict(X_test)
    print("Neural Network RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    print("Neural Network MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))

    # # Adaboost
    # y_pred = ada.predict(X_test)
    # print("Adaboost RMSE", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred= y_pred)))
    # print("Adaboost MAPE", metrics.mean_absolute_percentage_error(y_true=y_test, y_pred= y_pred))


## Cross validation

In [12]:
# n_fold = 5
# folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

In [13]:
# def cv_classical_models(X, y, model):
    
#     RMSE_scores = []
#     MAE_scores = []
#     R2_scores = []
#     AR2_scores = []
    
#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
#         print('Fold', fold_n, 'started at', time.ctime())
#         regr = 0
#         X_train, X_valid = X[train_index], X[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]
        
#         if model == 'ADA':
#             regr = ada
#             regr.fit(X_train, y_train)

#         if model == 'SVR':
#             regr = svr
#             regr.fit(X_train, y_train)
        
#         if model == 'RF':
#             regr = rf
#             regr.fit(X_train, y_train)

#         if model == 'DT':
#             regr = dt
#             regr.fit(X_train, y_train)

#         if model == 'NN':
#             regr = nn
#             regr.fit(X_train, y_train)

#         if model == 'KNN':
#             regr = knn
#             regr.fit(X_train, y_train)

#         if model == 'SGD':
#             regr = sgd
#             regr.fit(X_train, y_train)
        
#         if model == 'Linear SVR':
#             regr = linear_svr
#             regr.fit(X_train, y_train)

#         if model == 'ET':
#             regr = et
#             regr.fit(X_train, y_train)      
        
#         y_pred_valid = regr.predict(X_valid).reshape(-1,)
#         RMSE_score = np.sqrt(metrics.mean_squared_error(y_true=y_valid, y_pred= y_pred_valid))
#         MAE_score = mean_absolute_error(y_valid,y_pred_valid)
#         R2_score = r2_score(y_valid, y_pred_valid)
#         RMSE_scores.append(RMSE_score)
#         MAE_scores.append(MAE_score)
#         R2_scores.append(R2_score)

#         n = X.shape[0]
#         k = X.shape[1]
#         AR2_score = 1-((1-R2_score)*(n-1)/(n-k-1))
#         AR2_scores.append(AR2_score)

#     print(model,' -- CV RMSE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(RMSE_scores), np.std(RMSE_scores)))
#     print(model,' -- CV MAE mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(MAE_scores), np.std(MAE_scores)))
#     print(model,' -- CV R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(R2_scores), np.std(R2_scores)))
#     print(model,' -- CV Adjusted R2 mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(AR2_scores), np.std(AR2_scores)))

In [14]:
# cv_classical_models(X, y, 'KNN')
# cv_classical_models(X, y, 'Linear SVR')
# cv_classical_models(X, y, 'SVR')
# cv_classical_models(X, y, 'SGD')
# cv_classical_models(X, y, 'DT')
# cv_classical_models(X, y, 'ET')
# cv_classical_models(X, y, 'RF')
# cv_classical_models(X, y, 'NN')
# cv_classical_models(X, y, 'ADA')

## Results

In [15]:
train_models(X_train, X_valid, y_train, y_valid)

Neural Network RMSE 0.9675666332012647
Neural Network MAPE 0.0024846723022076997


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 44 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SPEED                                 10000 non-null  float64
 1   MAF                                   10000 non-null  float64
 2   ENGINE_RPM                            10000 non-null  float64
 3   THROTTLE_POS                          10000 non-null  float64
 4   INTAKE_MANIFOLD_PRESSURE              10000 non-null  float64
 5   ENGINE_LOAD                           10000 non-null  float64
 6   FUEL_RATE                             10000 non-null  float64
 7   FUEL_RATE_SQRT                        10000 non-null  float64
 8   FUEL_RATE_PWR_2                       10000 non-null  float64
 9   FUEL_RATE_PWR_3                       10000 non-null  float64
 10  INTAKE_MANIFOLD_PRESSURE_SQRT         10000 non-null  float64
 11  INTAKE_MANIFOLD_

In [17]:
test_data_result(X_test, y_test)

'''KNN RMSE 4.969371400946747
KNN MAPE 0.07629307893785157
SGD RMSE 10.641319502256032
SGD MAPE 0.5189861427956324
SVR RMSE 9.131021569061796
SVR MAPE 0.10737832794044584
Linear SVR RMSE 6.613063495214109
Linear SVR MAPE 0.43949283361762104
Extra Tree RMSE 2.387114947752746
Extra Tree MAPE 0.01057194277783781
Decision Tree RMSE 3.2101628270970233
Decision Tree MAPE 0.05234468768972168
Random Forest RMSE 3.416570641423174
Random Forest MAPE 0.2625709069635105
Neural Network RMSE 0.143510272403907
Neural Network MAPE 0.0068828416011658435
Adaboost RMSE 1.6083412524203518
Adaboost MAPE 0.01636706686532803
Gradient boosting RMSE 2.0069944092547836
Gradient boosting MAPE 0.0370013317164879'''

Neural Network RMSE 0.2991187746972061
Neural Network MAPE 0.002367546588646389


'KNN RMSE 4.969371400946747\nKNN MAPE 0.07629307893785157\nSGD RMSE 10.641319502256032\nSGD MAPE 0.5189861427956324\nSVR RMSE 9.131021569061796\nSVR MAPE 0.10737832794044584\nLinear SVR RMSE 6.613063495214109\nLinear SVR MAPE 0.43949283361762104\nExtra Tree RMSE 2.387114947752746\nExtra Tree MAPE 0.01057194277783781\nDecision Tree RMSE 3.2101628270970233\nDecision Tree MAPE 0.05234468768972168\nRandom Forest RMSE 3.416570641423174\nRandom Forest MAPE 0.2625709069635105\nNeural Network RMSE 0.143510272403907\nNeural Network MAPE 0.0068828416011658435\nAdaboost RMSE 1.6083412524203518\nAdaboost MAPE 0.01636706686532803\nGradient boosting RMSE 2.0069944092547836\nGradient boosting MAPE 0.0370013317164879'

## permutation importance

In [30]:
from sklearn.inspection import permutation_importance

r = permutation_importance(nn, X_valid, y_valid,
                            n_repeats=30,
                            random_state=42)

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{train.columns[i]:<8} "
            f"{r.importances_mean[i]:.3f}"
            f" +/- {r.importances_std[i]:.3f}")

'''FUEL_RATE_SQRT 1.462 +/- 0.233
MAF_SQRT 1.341 +/- 0.202
SPEED_SQRT 1.001 +/- 0.032
SPEED    0.553 +/- 0.033
SPEED_X_THROTTLE_POS 0.162 +/- 0.023
SPEED_X_ENGINE_RPM 0.150 +/- 0.026
SPEED_X_ENGINE_LOAD 0.141 +/- 0.026
SPEED_X_INTAKE_MANIFOLD_PRESSURE 0.092 +/- 0.011
SPEED_PWR_2 0.068 +/- 0.015
SPEED_X_MAF 0.030 +/- 0.013
SPEED_X_FUEL_RATE 0.023 +/- 0.011
FUEL_RATE 0.017 +/- 0.002
MAF      0.016 +/- 0.002
ENGINE_RPM 0.009 +/- 0.002
FUEL_RATE_PWR_3 0.001 +/- 0.000
MAF_X_ENGINE_RPM 0.000 +/- 0.000
ENGINE_LOAD_PWR_3 0.000 +/- 0.000'''


FUEL_RATE_SQRT 1.479 +/- 0.220
MAF_SQRT 1.356 +/- 0.192
SPEED_SQRT 1.002 +/- 0.028
SPEED    0.563 +/- 0.039
SPEED_X_THROTTLE_POS 0.167 +/- 0.023
SPEED_X_ENGINE_RPM 0.156 +/- 0.027
SPEED_X_ENGINE_LOAD 0.148 +/- 0.028
SPEED_X_INTAKE_MANIFOLD_PRESSURE 0.094 +/- 0.011
SPEED_PWR_2 0.070 +/- 0.016
SPEED_X_MAF 0.030 +/- 0.012
SPEED_X_FUEL_RATE 0.022 +/- 0.010
FUEL_RATE 0.017 +/- 0.002
MAF      0.016 +/- 0.002
ENGINE_RPM 0.009 +/- 0.002
FUEL_RATE_PWR_3 0.001 +/- 0.000
MAF_X_ENGINE_RPM 0.000 +/- 0.000
ENGINE_LOAD_PWR_3 0.000 +/- 0.000


'FUEL_RATE_SQRT 1.462 +/- 0.233\nMAF_SQRT 1.341 +/- 0.202\nSPEED_SQRT 1.001 +/- 0.032\nSPEED    0.553 +/- 0.033\nSPEED_X_THROTTLE_POS 0.162 +/- 0.023\nSPEED_X_ENGINE_RPM 0.150 +/- 0.026\nSPEED_X_ENGINE_LOAD 0.141 +/- 0.026\nSPEED_X_INTAKE_MANIFOLD_PRESSURE 0.092 +/- 0.011\nSPEED_PWR_2 0.068 +/- 0.015\nSPEED_X_MAF 0.030 +/- 0.013\nSPEED_X_FUEL_RATE 0.023 +/- 0.011\nFUEL_RATE 0.017 +/- 0.002\nMAF      0.016 +/- 0.002\nENGINE_RPM 0.009 +/- 0.002\nFUEL_RATE_PWR_3 0.001 +/- 0.000\nMAF_X_ENGINE_RPM 0.000 +/- 0.000\nENGINE_LOAD_PWR_3 0.000 +/- 0.000'

## Some tuning efforts

### KNN

In [19]:
# n_neighbors = [x for x in range(1, 100)]
# weights = ['uniform', 'distance']
# algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# parameter_space = {
#     'n_neighbors' : n_neighbors,
#     'weights' : weights,
#     'algorithm' : algorithm
# }

# clf = RandomizedSearchCV(KNeighborsRegressor(), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''
Best estimator found:
 KNeighborsRegressor(n_neighbors=2, weights='distance')'''

"\nBest estimator found:\n KNeighborsRegressor(n_neighbors=2, weights='distance')"

### SVR

In [20]:
# kernel = ['linear', 'poly', 'rbf', 'sigmoid']
# gamma = [x * 0.01 for x in range(1, 100)]
# coef0 = [x * 0.01 for x in range(1, 100)]
# epsilon = [x * 0.01 for x in range(1, 100)]
# C = [x * 0.1 for x in range(1, 50)]
# cache_size = [x for x in range(200, 2000, 200)]
# parameter_space = {
#     'kernel' : kernel,
#     'gamma' : gamma,
#     'coef0' : coef0,
#     'epsilon' : epsilon,
#     'C' : C,
#     'cache_size' : cache_size,
# }

# clf = RandomizedSearchCV(SVR(), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''
Best estimator found:
 SVR(C=4.5, cache_size=1600, coef0=0.6, epsilon=0.9, gamma=0.99, kernel='poly')'''

"\nBest estimator found:\n SVR(C=4.5, cache_size=1600, coef0=0.6, epsilon=0.9, gamma=0.99, kernel='poly')"

### Linear SVR

In [21]:

# epsilon = [x * 0.01 for x in range(0, 100)]
# C = [x for x in range(100, 1000)]
# loss = ['epsilon_insensitive', 'squared_epsilon_insensitive']
# parameter_space = {
#     'epsilon' : epsilon,
#     'C' : C,
#     'loss' : loss
# }

# clf = RandomizedSearchCV(LinearSVR(random_state=42, dual=False), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=5000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

# '''
# Best estimator found:
#  LinearSVR(C=292, dual=False, epsilon=0.06, loss='squared_epsilon_insensitive',
#           random_state=42)'''

### Decision tree

In [22]:
# max_depth = [x for x in range(1, 100)]
# min_samples_split = [x for x in range(1, 100)]
# min_samples_leaf = [x for x in range(1, 100)]

# parameter_space = {
#     'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
#     'splitter': ['best', 'random'],
#     'max_depth': max_depth,
#     'min_samples_split': min_samples_split,
#     'min_samples_leaf': min_samples_leaf,
# }

# clf = RandomizedSearchCV(DecisionTreeRegressor(random_state=42), parameter_space, n_jobs=4, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''
Best estimator found:
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=30,
                      min_samples_split=12, random_state=42, splitter='random')
'''

"\nBest estimator found:\n DecisionTreeRegressor(criterion='friedman_mse', max_depth=30,\n                      min_samples_split=12, random_state=42, splitter='random')\n"

### Random forest

In [23]:
# n_estimators = [x for x in range(100, 1000, 100)]
# max_depth = [x for x in range(1, 100)]
# min_samples_split = [x for x in range(1, 20)]
# min_samples_leaf = [x for x in range(1, 20)]
# max_features = [x * 0.01 for x in range(1, 100)]
# bootstrap = [True, False]
# warm_start = [True, False]

# parameter_space = {
#     'n_estimators' : n_estimators,
#     'criterion': ['squared_error', 'absolute_error', 'poisson'],
#     'max_features': max_features,
#     'max_depth': max_depth,
#     'min_samples_split': min_samples_split,
#     'min_samples_leaf': min_samples_leaf,
#     'bootstrap' : bootstrap,
#     'warm_start' : warm_start
# }

# clf = RandomizedSearchCV(RandomForestRegressor(random_state=42), parameter_space, n_jobs=8, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''Best estimator found:
 RandomForestRegressor(bootstrap=False, criterion='poisson', max_depth=74,
                      max_features=0.3, min_samples_split=3, n_estimators=500,
                      random_state=42, warm_start=True)'''

"Best estimator found:\n RandomForestRegressor(bootstrap=False, criterion='poisson', max_depth=74,\n                      max_features=0.3, min_samples_split=3, n_estimators=500,\n                      random_state=42, warm_start=True)"

### Extra trees

In [24]:
# n_estimators = [x for x in range(100, 1000, 100)]
# max_depth = [x for x in range(1, 100)]
# min_samples_split = [x for x in range(1, 20)]
# min_samples_leaf = [x for x in range(1, 20)]
# max_features = [x * 0.01 for x in range(1, 100)]
# bootstrap = [True, False]
# warm_start = [True, False]

# parameter_space = {
#     'n_estimators' : n_estimators,
#     'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
#     'max_features': max_features,
#     'max_depth': max_depth,
#     'min_samples_split': min_samples_split,
#     'min_samples_leaf': min_samples_leaf,
#     'bootstrap' : bootstrap,
#     'warm_start' : warm_start
# }

# clf = RandomizedSearchCV(ExtraTreesRegressor(random_state=42), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''Best estimator found:
 ExtraTreesRegressor(criterion='friedman_mse', max_depth=67, max_features=0.73,
                    min_samples_split=4, n_estimators=600, random_state=42)'''


"Best estimator found:\n ExtraTreesRegressor(criterion='friedman_mse', max_depth=67, max_features=0.73,\n                    min_samples_split=4, n_estimators=600, random_state=42)"

### Adaboost

In [25]:
# n_estimators = [x for x in range(50, 500, 50)]
# learning_rate = [x * 0.01 for x in range(1, 100)]
# loss = ['linear', 'square', 'exponential']

# parameter_space = {
#     'n_estimators' : n_estimators,
#     'learning_rate' : learning_rate,
#     'loss' : loss
# }

# clf = RandomizedSearchCV(AdaBoostRegressor(dt, random_state=42), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''Best estimator found:
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='friedman_mse',
                                                       max_depth=30,
                                                       min_samples_split=12,
                                                       random_state=42,
                                                       splitter='random'),
                  learning_rate=0.36, loss='exponential', n_estimators=400,
                  random_state=42)'''



"Best estimator found:\n AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='friedman_mse',\n                                                       max_depth=30,\n                                                       min_samples_split=12,\n                                                       random_state=42,\n                                                       splitter='random'),\n                  learning_rate=0.36, loss='exponential', n_estimators=400,\n                  random_state=42)"

### SGD

In [26]:

# penalty = ['l1', 'l2', 'elasticnet']
# l1_ratio = [x * 0.01 for x in range(1, 100)]
# alpha = [x * 0.01 for x in range(1, 100)]
# epsilon = [x * 0.01 for x in range(1, 100)]
# loss = ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
# learning_rate = ['constant','adaptive', 'invscaling', 'optimal']

# parameter_space = {
#     'penalty' : penalty,
#     'alpha' : alpha,
#     'loss' : loss, 
#     'l1_ratio' : l1_ratio,
#     'epsilon' : epsilon,
#     'learning_rate' : learning_rate,

# }

# clf = RandomizedSearchCV(SGDRegressor(random_state=42, early_stopping=True), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''Best estimator found:
 SGDRegressor(alpha=0.01, early_stopping=True, epsilon=0.19, l1_ratio=0.98,
             learning_rate='adaptive', loss='squared_epsilon_insensitive',
             penalty='elasticnet', random_state=42)'''

"Best estimator found:\n SGDRegressor(alpha=0.01, early_stopping=True, epsilon=0.19, l1_ratio=0.98,\n             learning_rate='adaptive', loss='squared_epsilon_insensitive',\n             penalty='elasticnet', random_state=42)"

### Neural network

In [27]:

# layer = [(x, y, z) for x in range(10,50,5) for y in range(10,50,5) for z in range(10,50,5)]
# alpha = [x * 0.00001 for x in range(10, 100000, 10)]
# max_iter = [x for x in range(1000, 15000, 2000)]
# early_stopping = [True]
# n_iter_no_change = [32]

# parameter_space = {
#     'hidden_layer_sizes': layer,
#     'activation': ['tanh', 'relu', 'logistic', 'identity'],
#     'solver': ['sgd', 'adam', 'lbfgs'],
#     'alpha': alpha,
#     'learning_rate': ['constant','adaptive', 'invascaling'],
#     'max_iter' : max_iter,
#     'early_stopping' : early_stopping,
#     'n_iter_no_change' : n_iter_no_change,
# }

# clf = RandomizedSearchCV(MLPRegressor(random_state=42), parameter_space, n_jobs=12, cv = 3, verbose=2, random_state=42, n_iter=1000)
# clf.fit(X_train, y_train)

# print('Best estimator found:\n', clf.best_estimator_)

'''
Best estimator found:
MLPRegressor(alpha=0.1111, early_stopping=True, hidden_layer_sizes=(10, 70, 25),
             learning_rate='adaptive', max_iter=15000, n_iter_no_change=32,
             solver='lbfgs', random_state=42)
'''

"\nBest estimator found:\nMLPRegressor(alpha=0.1111, early_stopping=True, hidden_layer_sizes=(10, 70, 25),\n             learning_rate='adaptive', max_iter=15000, n_iter_no_change=32,\n             solver='lbfgs', random_state=42)\n"

In [28]:
# take away -> 21th APRIL
# tsfresh
# autoML
# try larger dataset -> eVED
# discover & derive new feature -> gear
# model comparison -> good to go
# naive base line for regression problem -> linear regression 
# future email also cc -> l.pantiskas@vu.nl
# next meeting MAY 2, 10 am
