In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import helpers.processing_helpers as ph

from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer


In [2]:
df_dev = pd.read_csv("./dataset/development.csv")

In [83]:
noise_indexes = [0,7,12,15,16,17]
acc_idxs = [1,2,3,4,5,6,8,9,10,11,13,14]
features = ["pmax", "negpmax", 'area', 'tmax', 'rms']

sensors_removed = df_dev.drop(columns=ph.get_column_names(features, noise_indexes))
sensors_removed = sensors_removed.drop(columns=ph.get_column_names(['tmax', 'rms'], acc_idxs))

In [84]:
negpmax_clms = sensors_removed[[f'negpmax[{i}]' for i in acc_idxs]]
df_above_zero = negpmax_clms[(negpmax_clms > 0).any(axis=1)]
df_zero = sensors_removed.drop(df_above_zero.index.values)

In [85]:
joined_negpmax = pd.concat([df_zero[clmn] for clmn in negpmax_clms])
df_zero = df_zero.drop(joined_negpmax[joined_negpmax < -100].index)

In [88]:
score = make_scorer(ph.mean_euclid_dist, greater_is_better=False)

In [109]:
subset = df_zero
X_train, y_train = ph.insert_zeros(subset, acc_idxs, 17)

In [110]:
param_grid = {'n_estimators' : [375],
              'max_depth': [40],
              }

# criterion: 'absolute error' was very slow and didn't improve the model
# max_features: 'sqrt' seemed to consistently give the best results
# The more estimators the better, but slower training

In [111]:
reg = RandomForestRegressor(max_features='sqrt')
gridsearch = RandomizedSearchCV(reg, param_grid, scoring=score)
gridsearch.fit(X_train, y_train)



Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .....................max_depth=40, n_estimators=375; total time=15.4min
[CV] END .....................max_depth=40, n_estimators=375; total time=14.6min
[CV] END .....................max_depth=40, n_estimators=375; total time=13.5min


KeyboardInterrupt: 

In [108]:
results = pd.concat([pd.DataFrame(gridsearch.cv_results_["params"]),
                     pd.DataFrame(gridsearch.cv_results_["mean_test_score"], columns=["Accuracy"]),
                     pd.DataFrame(gridsearch.cv_results_["mean_fit_time"], columns=["Time"])],
                     axis=1)
df = results.sort_values('Accuracy', ascending=False)

# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
 
# All dataframes hereafter reflect these changes.
display(df)

Unnamed: 0,n_estimators,max_depth,Accuracy,Time
3,400,40,-4.97354,73.575768
2,375,40,-4.97485,68.734185
0,375,20,-4.976083,69.082594
1,400,20,-4.976605,73.171837
5,400,60,-4.977488,73.457774
4,375,60,-4.981404,68.787329
