In [11]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import helpers.processing_helpers as ph
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

In [12]:
def round_to_nearest_5(n):
    return np.round(n / 5) * 5


rounding_vectorized = np.vectorize(round_to_nearest_5)

In [10]:
df = pd.read_csv("./development.csv")

noise_indexes = [0,7,12,15,16,17]



features = ['pmax', 'negpmax', 'area', 'tmax', 'rms']

drop_features = ['area', 'tmax', 'rms']

df = df.drop(columns=ph.get_column_names(features, noise_indexes)) 

In [13]:
def find_best_feature_reduction(df: pd.DataFrame):
    
    feature_to_loss = {}
    drop_features_list = [('area',), ('tmax',), ('rms',), ('area', 'tmax'), ('area', 'rms'), ('rms', 'tmax'), ('area', 'tmax', 'rms')]
    acc_idxs = [1,2,3,4,5,6,8,9,10,11,13,14]

    for drop_features in drop_features_list:
        
        df_dev = df.drop(columns=ph.get_column_names(drop_features, acc_idxs))
        print(df_dev.columns)
        y_train_valid = df_dev[['x', 'y']].copy()

        X_train_valid = df_dev.drop(columns=['x', 'y'])

        X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True)

        means = X_train.mean()
        stds = X_train.std()

        X_train_normalized = (X_train - means) / stds

        X_valid_normalized = (X_valid - means) / stds

        mlp = MLPRegressor(random_state=42, verbose=1, n_iter_no_change=500, max_iter=100, learning_rate_init=0.01, activation="logistic", learning_rate="adaptive")
        mlp.fit(X_train_normalized, y_train)

        y_pred = mlp.predict(X_valid_normalized)

        y_pred_rounded = round_to_nearest_5(y_pred)
        result = (ph.mean_euclid_dist(y_valid, y_pred), ph.mean_euclid_dist(y_valid, y_pred_rounded))
        feature_to_loss[drop_features] = result
    return feature_to_loss

In [14]:
results = find_best_feature_reduction(df)

Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'tmax[1]', 'rms[1]', 'pmax[2]',
       'negpmax[2]', 'tmax[2]', 'rms[2]', 'pmax[3]', 'negpmax[3]', 'tmax[3]',
       'rms[3]', 'pmax[4]', 'negpmax[4]', 'tmax[4]', 'rms[4]', 'pmax[5]',
       'negpmax[5]', 'tmax[5]', 'rms[5]', 'pmax[6]', 'negpmax[6]', 'tmax[6]',
       'rms[6]', 'pmax[8]', 'negpmax[8]', 'tmax[8]', 'rms[8]', 'pmax[9]',
       'negpmax[9]', 'tmax[9]', 'rms[9]', 'pmax[10]', 'negpmax[10]',
       'tmax[10]', 'rms[10]', 'pmax[11]', 'negpmax[11]', 'tmax[11]', 'rms[11]',
       'pmax[13]', 'negpmax[13]', 'tmax[13]', 'rms[13]', 'pmax[14]',
       'negpmax[14]', 'tmax[14]', 'rms[14]'],
      dtype='object')
Iteration 1, loss = 13507.62061816
Iteration 2, loss = 885.12672794
Iteration 3, loss = 119.08186807
Iteration 4, loss = 37.31128624
Iteration 5, loss = 23.06834553
Iteration 6, loss = 18.56599539
Iteration 7, loss = 16.16446853
Iteration 8, loss = 14.81654097
Iteration 9, loss = 13.81522094
Iteration 10, loss = 13.10786424
Iteration 



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'area[1]', 'rms[1]', 'pmax[2]',
       'negpmax[2]', 'area[2]', 'rms[2]', 'pmax[3]', 'negpmax[3]', 'area[3]',
       'rms[3]', 'pmax[4]', 'negpmax[4]', 'area[4]', 'rms[4]', 'pmax[5]',
       'negpmax[5]', 'area[5]', 'rms[5]', 'pmax[6]', 'negpmax[6]', 'area[6]',
       'rms[6]', 'pmax[8]', 'negpmax[8]', 'area[8]', 'rms[8]', 'pmax[9]',
       'negpmax[9]', 'area[9]', 'rms[9]', 'pmax[10]', 'negpmax[10]',
       'area[10]', 'rms[10]', 'pmax[11]', 'negpmax[11]', 'area[11]', 'rms[11]',
       'pmax[13]', 'negpmax[13]', 'area[13]', 'rms[13]', 'pmax[14]',
       'negpmax[14]', 'area[14]', 'rms[14]'],
      dtype='object')
Iteration 1, loss = 13411.32971624
Iteration 2, loss = 839.57247158
Iteration 3, loss = 117.52817128
Iteration 4, loss = 37.41240647
Iteration 5, loss = 22.74099172
Iteration 6, loss = 17.62524301
Iteration 7, loss = 15.13833726
Iteration 8, loss = 13.74925110
Iteration 9, loss = 12.84028109
Iteration 10, loss = 12.23641489
Iteration 



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'area[1]', 'tmax[1]', 'pmax[2]',
       'negpmax[2]', 'area[2]', 'tmax[2]', 'pmax[3]', 'negpmax[3]', 'area[3]',
       'tmax[3]', 'pmax[4]', 'negpmax[4]', 'area[4]', 'tmax[4]', 'pmax[5]',
       'negpmax[5]', 'area[5]', 'tmax[5]', 'pmax[6]', 'negpmax[6]', 'area[6]',
       'tmax[6]', 'pmax[8]', 'negpmax[8]', 'area[8]', 'tmax[8]', 'pmax[9]',
       'negpmax[9]', 'area[9]', 'tmax[9]', 'pmax[10]', 'negpmax[10]',
       'area[10]', 'tmax[10]', 'pmax[11]', 'negpmax[11]', 'area[11]',
       'tmax[11]', 'pmax[13]', 'negpmax[13]', 'area[13]', 'tmax[13]',
       'pmax[14]', 'negpmax[14]', 'area[14]', 'tmax[14]'],
      dtype='object')
Iteration 1, loss = 13372.78021842
Iteration 2, loss = 828.68824817
Iteration 3, loss = 113.70713589
Iteration 4, loss = 36.18780409
Iteration 5, loss = 21.80480868
Iteration 6, loss = 17.03874323
Iteration 7, loss = 14.77282706
Iteration 8, loss = 13.42454443
Iteration 9, loss = 12.57347053
Iteration 10, loss = 12.0565912



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'rms[1]', 'pmax[2]', 'negpmax[2]',
       'rms[2]', 'pmax[3]', 'negpmax[3]', 'rms[3]', 'pmax[4]', 'negpmax[4]',
       'rms[4]', 'pmax[5]', 'negpmax[5]', 'rms[5]', 'pmax[6]', 'negpmax[6]',
       'rms[6]', 'pmax[8]', 'negpmax[8]', 'rms[8]', 'pmax[9]', 'negpmax[9]',
       'rms[9]', 'pmax[10]', 'negpmax[10]', 'rms[10]', 'pmax[11]',
       'negpmax[11]', 'rms[11]', 'pmax[13]', 'negpmax[13]', 'rms[13]',
       'pmax[14]', 'negpmax[14]', 'rms[14]'],
      dtype='object')
Iteration 1, loss = 13352.69549254
Iteration 2, loss = 888.49911928
Iteration 3, loss = 120.06250395
Iteration 4, loss = 37.18674663
Iteration 5, loss = 22.66463063
Iteration 6, loss = 17.82851789
Iteration 7, loss = 15.40173028
Iteration 8, loss = 13.99850841
Iteration 9, loss = 12.89236855
Iteration 10, loss = 12.18897762
Iteration 11, loss = 11.63695960
Iteration 12, loss = 11.31123756
Iteration 13, loss = 11.07388327
Iteration 14, loss = 10.83587408
Iteration 15, loss = 10.6383



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'tmax[1]', 'pmax[2]', 'negpmax[2]',
       'tmax[2]', 'pmax[3]', 'negpmax[3]', 'tmax[3]', 'pmax[4]', 'negpmax[4]',
       'tmax[4]', 'pmax[5]', 'negpmax[5]', 'tmax[5]', 'pmax[6]', 'negpmax[6]',
       'tmax[6]', 'pmax[8]', 'negpmax[8]', 'tmax[8]', 'pmax[9]', 'negpmax[9]',
       'tmax[9]', 'pmax[10]', 'negpmax[10]', 'tmax[10]', 'pmax[11]',
       'negpmax[11]', 'tmax[11]', 'pmax[13]', 'negpmax[13]', 'tmax[13]',
       'pmax[14]', 'negpmax[14]', 'tmax[14]'],
      dtype='object')
Iteration 1, loss = 13561.01783641
Iteration 2, loss = 895.94042744
Iteration 3, loss = 119.40604295
Iteration 4, loss = 36.64331148
Iteration 5, loss = 22.26149038
Iteration 6, loss = 17.43506881
Iteration 7, loss = 15.20112032
Iteration 8, loss = 13.67042344
Iteration 9, loss = 12.76109699
Iteration 10, loss = 12.11696768
Iteration 11, loss = 11.69149686
Iteration 12, loss = 11.34395734
Iteration 13, loss = 11.20549008
Iteration 14, loss = 10.95486566
Iteration 15, lo



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'area[1]', 'pmax[2]', 'negpmax[2]',
       'area[2]', 'pmax[3]', 'negpmax[3]', 'area[3]', 'pmax[4]', 'negpmax[4]',
       'area[4]', 'pmax[5]', 'negpmax[5]', 'area[5]', 'pmax[6]', 'negpmax[6]',
       'area[6]', 'pmax[8]', 'negpmax[8]', 'area[8]', 'pmax[9]', 'negpmax[9]',
       'area[9]', 'pmax[10]', 'negpmax[10]', 'area[10]', 'pmax[11]',
       'negpmax[11]', 'area[11]', 'pmax[13]', 'negpmax[13]', 'area[13]',
       'pmax[14]', 'negpmax[14]', 'area[14]'],
      dtype='object')
Iteration 1, loss = 13349.24901947
Iteration 2, loss = 841.76775636
Iteration 3, loss = 117.18889060
Iteration 4, loss = 37.09317777
Iteration 5, loss = 22.47312942
Iteration 6, loss = 17.53559989
Iteration 7, loss = 15.05256805
Iteration 8, loss = 13.69614286
Iteration 9, loss = 12.73823331
Iteration 10, loss = 12.05641993
Iteration 11, loss = 11.68913826
Iteration 12, loss = 11.21905943
Iteration 13, loss = 10.96438600
Iteration 14, loss = 10.64894344
Iteration 15, lo



Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'pmax[2]', 'negpmax[2]', 'pmax[3]',
       'negpmax[3]', 'pmax[4]', 'negpmax[4]', 'pmax[5]', 'negpmax[5]',
       'pmax[6]', 'negpmax[6]', 'pmax[8]', 'negpmax[8]', 'pmax[9]',
       'negpmax[9]', 'pmax[10]', 'negpmax[10]', 'pmax[11]', 'negpmax[11]',
       'pmax[13]', 'negpmax[13]', 'pmax[14]', 'negpmax[14]'],
      dtype='object')
Iteration 1, loss = 13360.53858218
Iteration 2, loss = 889.79243839
Iteration 3, loss = 118.72699082
Iteration 4, loss = 36.08456234
Iteration 5, loss = 22.00425010
Iteration 6, loss = 17.57088248
Iteration 7, loss = 15.15619741
Iteration 8, loss = 13.58161321
Iteration 9, loss = 12.47983900
Iteration 10, loss = 11.78310370
Iteration 11, loss = 11.25664175
Iteration 12, loss = 10.85100426
Iteration 13, loss = 10.60091156
Iteration 14, loss = 10.38468552
Iteration 15, loss = 10.19388020
Iteration 16, loss = 10.09303296
Iteration 17, loss = 9.94501888
Iteration 18, loss = 9.87897838
Iteration 19, loss = 9.85612909
Iter



In [16]:
print(results)

{('area',): (5.002756464700468, 5.005793604991764), ('tmax',): (4.870996941658324, 4.863394208045973), ('rms',): (4.864068984153766, 4.859901141493429), ('area', 'tmax'): (4.846800950401072, 4.842250908994254), ('area', 'rms'): (4.854868918870851, 4.842508324509863), ('rms', 'tmax'): (4.739657690458377, 4.71447859779251), ('area', 'tmax', 'rms'): (4.667231364614155, 4.649811609963127)}
