In [1]:
import pandas as pd
import helpers.processing_helpers as ph

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

from sklearn.neural_network import  MLPRegressor

from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


In [2]:
df_dev = pd.read_csv("./dataset/development.csv")

In [3]:
noise_indexes = [0,7,12,15,16,17]
acc_idxs = [1,2,3,4,5,6,8,9,10,11,13,14]
features = ["pmax", "negpmax", 'area', 'tmax', 'rms']

sensors_removed = df_dev.drop(columns=ph.get_column_names(features, noise_indexes))
df = sensors_removed.drop(columns=ph.get_column_names(['tmax', 'rms', 'area'], acc_idxs))

In [4]:
df = df.sample(frac=1)

In [5]:
score = make_scorer(ph.mean_euclid_dist, greater_is_better=False)

In [6]:
y_train_valid = df[['x', 'y']].copy()

X_train_valid = df.drop(columns=['x', 'y'])

In [7]:
ml_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', MLPRegressor(random_state=42, max_iter=200, n_iter_no_change=50, learning_rate_init=0.01))
])

In [12]:
ml_param_grid = {'clf__hidden_layer_sizes' : [(50,),
                                           (25, 25)
                                            ], 
              'clf__learning_rate_init' : [0.01, 0.001],
              'clf__activation' : ['logistic', 'tanh', 'relu']
              }

In [13]:
gridsearch = GridSearchCV(ml_pipe, ml_param_grid, scoring=score, cv=3, verbose=2, n_jobs=-1)
gridsearch.fit(X_train_valid, y_train_valid)

Fitting 3 folds for each of 12 candidates, totalling 36 fits




In [14]:
results = pd.concat([pd.DataFrame(gridsearch.cv_results_["params"]),
                     -pd.DataFrame(gridsearch.cv_results_["mean_test_score"], columns=["MED"]),
                     pd.DataFrame(gridsearch.cv_results_["mean_fit_time"], columns=["Time"])],
                     axis=1)
df = results.sort_values('MED')

# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
 
# All dataframes hereafter reflect these changes.
display(df)

pd.reset_option('all')

Unnamed: 0,clf__activation,clf__hidden_layer_sizes,clf__learning_rate_init,MED,Time
7,tanh,"(25, 25)",0.001,4.85869,347.915418
3,logistic,"(25, 25)",0.001,4.880375,321.567408
2,logistic,"(25, 25)",0.01,4.892509,324.665934
0,logistic,"(50,)",0.01,4.993008,299.876442
1,logistic,"(50,)",0.001,5.140426,301.244184
6,tanh,"(25, 25)",0.01,5.30712,312.378373
5,tanh,"(50,)",0.001,5.350966,292.975851
4,tanh,"(50,)",0.01,5.405702,310.411038
10,relu,"(25, 25)",0.01,5.718892,249.900243
8,relu,"(50,)",0.01,5.904697,220.89907


  pd.reset_option('all')


In [23]:
rf_param_grid = {'n_estimators' : [50, 100],
                 'max_features' : ['sqrt', 'log2'],
                 }

In [25]:
reg_rf = RandomForestRegressor(random_state=42)
gridsearch_rf = GridSearchCV(reg_rf, rf_param_grid, scoring=score, cv=3, verbose=2)
gridsearch_rf.fit(X_train_valid.sample(100000, random_state=42), y_train_valid.sample(100000, random_state=42))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [26]:
results = pd.concat([pd.DataFrame(gridsearch_rf.cv_results_["params"]),
                     -pd.DataFrame(gridsearch_rf.cv_results_["mean_test_score"], columns=["MED"]),
                     pd.DataFrame(gridsearch_rf.cv_results_["mean_fit_time"], columns=["Time"])],
                     axis=1)
df = results.sort_values('MED')

# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
 
# All dataframes hereafter reflect these changes.
display(df)

pd.reset_option('all')

Unnamed: 0,max_features,n_estimators,MED,Time
1,sqrt,100,4.94811,64.820339
3,log2,100,4.94811,53.999708
0,sqrt,50,5.129652,31.570638
2,log2,50,5.129652,31.71014


  pd.reset_option('all')


In [8]:
svr_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', MultiOutputRegressor(SVR()))
])

In [9]:
svr_param_grid = {'clf__estimator__kernel' :['poly', 'rbf', 'sigmoid'], 
              'clf__estimator__degree' : [2,3]
              }

In [11]:
gridsearch_svr = GridSearchCV(svr_pipe, svr_param_grid, scoring=score, cv=2, verbose=2)
gridsearch_svr.fit(X_train_valid.sample(50000, random_state=42), y_train_valid.sample(50000, random_state=42))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__estimator__degree=2, clf__estimator__kernel=poly; total time= 1.5min
[CV] END clf__estimator__degree=2, clf__estimator__kernel=poly; total time= 1.4min
[CV] END clf__estimator__degree=2, clf__estimator__kernel=rbf; total time= 2.9min
[CV] END clf__estimator__degree=2, clf__estimator__kernel=rbf; total time= 2.7min
[CV] END clf__estimator__degree=2, clf__estimator__kernel=sigmoid; total time= 2.3min
[CV] END clf__estimator__degree=2, clf__estimator__kernel=sigmoid; total time= 2.3min
[CV] END clf__estimator__degree=3, clf__estimator__kernel=poly; total time= 1.5min
[CV] END clf__estimator__degree=3, clf__estimator__kernel=poly; total time= 1.5min
[CV] END clf__estimator__degree=3, clf__estimator__kernel=rbf; total time= 3.4min
[CV] END clf__estimator__degree=3, clf__estimator__kernel=rbf; total time= 3.5min
[CV] END clf__estimator__degree=3, clf__estimator__kernel=sigmoid; total time= 3.0min
[CV] END clf__estimato

In [12]:
results = pd.concat([pd.DataFrame(gridsearch_svr.cv_results_["params"]),
                     -pd.DataFrame(gridsearch_svr.cv_results_["mean_test_score"], columns=["MED"]),
                     pd.DataFrame(gridsearch_svr.cv_results_["mean_fit_time"], columns=["Time"])],
                     axis=1)
df = results.sort_values('MED')

# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
 
# All dataframes hereafter reflect these changes.
display(df)

pd.reset_option('all')

Unnamed: 0,clf__estimator__degree,clf__estimator__kernel,MED,Time
1,2,rbf,9.174546,57.148367
4,3,rbf,9.174546,70.619858
0,2,poly,41.529701,55.989959
2,2,sigmoid,116.000019,90.136749
5,3,sigmoid,116.000019,114.14031
3,3,poly,1707.250126,54.426767


  pd.reset_option('all')


In [8]:
lr_pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('scale', StandardScaler()),
    ('clf', MultiOutputRegressor(LinearRegression())),
])

In [13]:
lr_param_grid={'poly__degree': [1,2]}

In [14]:
gridsearch_lr = GridSearchCV(lr_pipe, lr_param_grid, scoring=score, cv=3, verbose=3, n_jobs=-1)
gridsearch_lr.fit(X_train_valid, y_train_valid)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [15]:
results = pd.concat([pd.DataFrame(gridsearch_lr.cv_results_["params"]),
                     -pd.DataFrame(gridsearch_lr.cv_results_["mean_test_score"], columns=["MED"]),
                     pd.DataFrame(gridsearch_lr.cv_results_["mean_fit_time"], columns=["Time"])],
                     axis=1)
df = results.sort_values('MED')

# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
 
# All dataframes hereafter reflect these changes.
display(df)

pd.reset_option('all')

Unnamed: 0,poly__degree,MED,Time
1,2,15.510059,30.041193
0,1,17.926855,1.563505


  pd.reset_option('all')
