In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import helpers.processing_helpers as ph
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

In [2]:
df_dev = pd.read_csv("./dataset/development.csv")

In [68]:
subset = df_dev.sample(200000)

In [None]:
y_train_valid = subset[['x', 'y']].copy()

X_train_valid = subset.drop(columns=['x', 'y'])

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

In [None]:
reg = RandomForestRegressor(100, random_state=42)
reg.fit(X_train , y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.9974940429058765

In [None]:
med_with_noise = ph.mean_euclid_dist(y_valid, reg.predict(X_valid))
print(med_with_noise)

6.034339648110773


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [None]:
sorted(zip(X_train_valid.columns, reg.feature_importances_), key=lambda x: x[1],reverse=True)

[('pmax[8]', 0.36531827093782604),
 ('pmax[11]', 0.24200094225924315),
 ('pmax[10]', 0.1550234916755122),
 ('pmax[5]', 0.08386749649625616),
 ('pmax[13]', 0.035135405634305866),
 ('negpmax[13]', 0.026835622720675297),
 ('pmax[9]', 0.015287941891783835),
 ('negpmax[10]', 0.011671145716784813),
 ('negpmax[3]', 0.010573816627380437),
 ('negpmax[11]', 0.01037597951401781),
 ('pmax[4]', 0.006175032739158957),
 ('negpmax[5]', 0.005640025638681187),
 ('negpmax[8]', 0.004437650119396651),
 ('negpmax[1]', 0.004073220149525142),
 ('pmax[3]', 0.0037901723981325474),
 ('negpmax[6]', 0.0027384499593256367),
 ('negpmax[4]', 0.002490787437037326),
 ('negpmax[14]', 0.0023181677883974657),
 ('pmax[1]', 0.00199689404596475),
 ('pmax[2]', 0.0016538179841708868),
 ('area[5]', 0.0013903451323228093),
 ('negpmax[9]', 0.0012310377425118356),
 ('pmax[6]', 0.0010321829803238993),
 ('pmax[14]', 0.0008872746110166745),
 ('negpmax[2]', 0.00037728039616458374),
 ('pmax[15]', 0.00036939524370810465),
 ('area[3]', 0

In [69]:
noise_indexes = [0,7,12,15,16,17]
features = ["pmax", "negpmax", 'area', 'tmax', 'rms']

noise_removed = subset.drop(columns=ph.get_column_names(features, noise_indexes))

In [12]:
y_train_valid = noise_removed[['x', 'y']].copy()

X_train_valid = noise_removed.drop(columns=['x', 'y'])

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

X_train["RANDOM"] = np.random.RandomState(42).randn(X_train.shape[0])

X_valid["RANDOM"] = np.random.RandomState(42).randn(X_valid.shape[0])

reg = RandomForestRegressor(100, random_state=42, verbose=1)
reg.fit(X_train , y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.5min


In [13]:
med_without_noise = ph.mean_euclid_dist(y_valid, reg.predict(X_valid))
print(med_without_noise)

5.9749290657933525


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [14]:
sorted(zip(X_train.columns, reg.feature_importances_), key=lambda x: x[1],reverse=True)

[('pmax[8]', 0.35919939467781353),
 ('pmax[11]', 0.2465764896563688),
 ('pmax[10]', 0.12072256899748196),
 ('pmax[5]', 0.08552980035222958),
 ('negpmax[10]', 0.04908274539424266),
 ('pmax[13]', 0.04076497404389449),
 ('negpmax[13]', 0.020043190275787878),
 ('negpmax[3]', 0.013678232117085316),
 ('pmax[9]', 0.01064152860119482),
 ('negpmax[11]', 0.008678075494486867),
 ('pmax[4]', 0.0076229278714844625),
 ('negpmax[5]', 0.005996493890753078),
 ('negpmax[8]', 0.004331507994431463),
 ('pmax[3]', 0.0038964548839531352),
 ('negpmax[1]', 0.0037577635264512484),
 ('negpmax[6]', 0.003162798664943817),
 ('negpmax[4]', 0.0024324342968564065),
 ('negpmax[14]', 0.0022357302202393524),
 ('pmax[1]', 0.002172688689590872),
 ('pmax[2]', 0.002101688085456933),
 ('pmax[6]', 0.0012446883190621145),
 ('negpmax[9]', 0.0011706262952725414),
 ('pmax[14]', 0.0009010138504360996),
 ('area[5]', 0.0008957620065835679),
 ('area[13]', 0.0003110824522480881),
 ('area[3]', 0.0003110000301892652),
 ('area[10]', 0.000

In [15]:
acc_idxs = [1,2,3,4,5,6,8,9,10,11,13,14]

tmax_removed = noise_removed.drop(columns=ph.get_column_names(['tmax'], acc_idxs))

Unnamed: 0,x,y,pmax[1],negpmax[1],area[1],rms[1],pmax[2],negpmax[2],area[2],rms[2],...,area[11],rms[11],pmax[13],negpmax[13],area[13],rms[13],pmax[14],negpmax[14],area[14],rms[14]
34625,230.0,310.0,5.348051,-4.648804,12.152222,2.013025,6.714259,-3.104941,8.109871,0.923330,...,4.502872,1.402630,14.019574,-3.699420,15.275155,1.663140,6.616529,-5.207074,4.660941,1.627708
321674,545.0,500.0,50.839157,-28.475906,28.840155,1.136354,21.097842,-7.294614,9.704996,1.512880,...,6.469504,1.009969,54.198132,-29.839709,33.210144,1.342560,19.785873,-8.721756,12.237433,1.335298
347083,570.0,280.0,17.823843,-6.615488,10.438300,1.457986,4.029828,-5.459930,1.498529,1.096616,...,29.890898,1.346239,56.622235,-30.477496,28.527908,1.604346,74.486804,-44.708966,39.585242,1.529592
112039,305.0,350.0,5.413358,-4.514621,3.345485,1.523706,3.895712,-4.210794,4.186127,1.444183,...,11.588037,1.051934,27.694763,-10.296997,14.616467,1.158872,6.843442,-4.792911,8.186821,0.962266
60567,250.0,465.0,4.356668,-4.101782,6.719417,1.388255,3.972430,-4.348142,2.427769,0.980960,...,5.703888,1.003586,12.922034,-5.928003,8.763921,0.928184,4.585940,-4.216483,4.747282,1.458534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79863,270.0,345.0,4.948703,-4.210634,3.852511,1.563000,4.095816,-4.465941,1.866143,1.489051,...,12.100421,0.903577,19.283597,-9.546970,13.182732,1.136538,4.831732,-5.587214,2.540865,1.748688
45439,240.0,205.0,4.565583,-3.698386,3.553262,1.036240,5.314240,-5.128571,2.938818,1.342098,...,9.226533,1.567863,12.395941,-4.786432,7.786508,0.633836,4.667148,-6.742276,2.657169,1.046773
118071,325.0,365.0,5.845828,-3.874579,4.629249,0.945597,4.993250,-4.955664,2.854639,1.312238,...,10.113978,1.149586,21.735974,-7.542163,12.076418,1.245374,5.781720,-5.381366,2.966754,1.060008
167056,375.0,380.0,7.665552,-3.926001,4.105273,0.654876,6.754630,-4.100168,4.815539,1.410131,...,14.144662,0.993153,60.257803,-29.172372,30.621620,1.469711,12.769928,-5.586145,9.742466,1.663797


In [16]:
y_train_valid = tmax_removed[['x', 'y']].copy()

X_train_valid = tmax_removed.drop(columns=['x', 'y'])

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

X_train["RANDOM"] = np.random.RandomState(42).randn(X_train.shape[0])
X_valid["RANDOM"] = np.random.RandomState(42).randn(X_valid.shape[0])

reg = RandomForestRegressor(100, random_state=42)
reg.fit(X_train, y_train)

In [17]:
med_without_noise = ph.mean_euclid_dist(y_valid, reg.predict(X_valid))
print(med_without_noise)

5.921845989666506


In [18]:
sorted(zip(X_train.columns, reg.feature_importances_), key=lambda x: x[1],reverse=True)

[('pmax[8]', 0.35921605177697646),
 ('pmax[11]', 0.2465594760062793),
 ('pmax[10]', 0.12070284894155157),
 ('pmax[5]', 0.08556699116605541),
 ('negpmax[10]', 0.049072909340919395),
 ('pmax[13]', 0.04079390388619112),
 ('negpmax[13]', 0.020030842884711464),
 ('negpmax[3]', 0.013751275266783701),
 ('pmax[9]', 0.010646517970578815),
 ('negpmax[11]', 0.008679803448721986),
 ('pmax[4]', 0.007638124731100946),
 ('negpmax[5]', 0.006061859676149818),
 ('negpmax[8]', 0.004330903043366197),
 ('pmax[3]', 0.0038317817584008467),
 ('negpmax[1]', 0.0037639003177120044),
 ('negpmax[6]', 0.0031685990366911183),
 ('negpmax[4]', 0.002438972398862019),
 ('negpmax[14]', 0.0022674953926519912),
 ('pmax[1]', 0.002169667924622957),
 ('pmax[2]', 0.002129230669104925),
 ('pmax[6]', 0.0012433755277164382),
 ('negpmax[9]', 0.0011588195417854884),
 ('pmax[14]', 0.000942644465507889),
 ('area[5]', 0.0008582015360344895),
 ('area[10]', 0.0003274131887989458),
 ('area[3]', 0.0003251908146561986),
 ('area[13]', 0.000

In [70]:
pmax_negpmax_area = noise_removed.drop(columns=ph.get_column_names(['tmax', 'rms'], acc_idxs))

In [71]:
y_train_valid = pmax_negpmax_area[['x', 'y']].copy()

X_train_valid = pmax_negpmax_area.drop(columns=['x', 'y'])

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

In [None]:
reg = RandomForestRegressor(100, random_state=42)
reg.fit(X_train , y_train)

In [21]:
med_without_noise = ph.mean_euclid_dist(y_valid, reg.predict(X_valid))
print(med_without_noise)

5.823436933201352


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [22]:
sorted(zip(X_train_valid.columns, reg.feature_importances_), key=lambda x: x[1],reverse=True)

[('pmax[8]', 0.35924600915026605),
 ('pmax[11]', 0.2466002702488734),
 ('pmax[10]', 0.12072175390935759),
 ('pmax[5]', 0.08556862340889844),
 ('negpmax[10]', 0.04911374560148825),
 ('pmax[13]', 0.04080715119863556),
 ('negpmax[13]', 0.020070860441112347),
 ('negpmax[3]', 0.01371097235492843),
 ('pmax[9]', 0.010664669212522508),
 ('negpmax[11]', 0.008715563185851824),
 ('pmax[4]', 0.007646562065743783),
 ('negpmax[5]', 0.006019270837061783),
 ('negpmax[8]', 0.004347457752937383),
 ('pmax[3]', 0.003976152875367849),
 ('negpmax[1]', 0.003778936444156386),
 ('negpmax[6]', 0.003148153649022397),
 ('negpmax[4]', 0.002455396812606812),
 ('negpmax[14]', 0.0022806004902664856),
 ('pmax[1]', 0.002186465119543101),
 ('pmax[2]', 0.0021411770849413436),
 ('pmax[6]', 0.0012307715268400914),
 ('negpmax[9]', 0.001178430798458846),
 ('pmax[14]', 0.0009179707869943025),
 ('area[5]', 0.000911085357653264),
 ('area[13]', 0.00034643630528788974),
 ('area[10]', 0.0003070754312450793),
 ('area[3]', 0.0003040