In [263]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import LocalOutlierFactor
import helpers.processing_helpers as ph
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

In [289]:
# Load the development table (contains the 'x'/'y' targets) and the evaluation table.
df_dev = pd.read_csv("./dataset/development.csv")
df_ev = pd.read_csv("./dataset/evaluation.csv")

# Bracketed indices that appear in the column names (e.g. "pmax[1]").
# `acc_idxs` are the indices that are kept; `noise_indexes` are removed entirely
# (treated as noise, going by the variable name).
acc_idxs = [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14]
noise_indexes = [0, 7, 12, 15, 16, 17]

# Base feature names, and the subset that is discarded even for the kept indices.
features = ['pmax', 'negpmax', 'area', 'tmax', 'rms']
drop_features = ['tmax', 'rms', 'area']

# Remove every feature of the noise indices, then the unused features of the kept
# indices. `ph.get_column_names` is a project helper; it presumably expands
# (feature names x indices) into column labels -- verify against helpers/processing_helpers.
df_dev = (
    df_dev
    .drop(columns=ph.get_column_names(features, noise_indexes))
    .drop(columns=ph.get_column_names(drop_features, acc_idxs))
)

# Same selection for the evaluation table, which additionally carries an "Id" column.
df_ev = (
    df_ev
    .drop(columns=ph.get_column_names(features, noise_indexes))
    .drop(columns=ph.get_column_names(drop_features, acc_idxs))
    .drop(columns="Id")
)

# Split the development table into targets and predictors.
Y_train = df_dev[['x', 'y']]
X_train = df_dev.drop(columns=['x', 'y'])

# From here on `features` holds the predictor column labels (it no longer holds
# the base feature names defined above).
features = X_train.columns

# Standardise with statistics learned on the development predictors only, and
# apply the same transformation to the evaluation predictors.
scaler = StandardScaler().fit(X_train)
X_train_normalized = pd.DataFrame(scaler.transform(X_train), columns=features)
eval_normalized = pd.DataFrame(scaler.transform(df_ev), columns=features)

# Local Outlier Factor: `fit_predict` labels inliers +1 and outliers -1;
# `contamination` sets the fraction of rows flagged as outliers.
outlier_clr = LocalOutlierFactor(contamination=0.025)
outlier_scores = outlier_clr.fit_predict(X_train_normalized)


In [290]:
# Attach the LocalOutlierFactor labels as a helper 'score' column and keep only
# the rows labelled as inliers (label >= 0, i.e. +1). The 'score' column is
# still present after this cell; a later cell removes it.
df_outlier_scores = pd.DataFrame({'score': outlier_scores})
X_train_normalized = pd.concat([X_train_normalized, df_outlier_scores], axis=1)
score_mask = X_train_normalized['score'].ge(0)
X_train_normalized = X_train_normalized.loc[score_mask]
X_train_normalized

Unnamed: 0,pmax[1],negpmax[1],pmax[2],negpmax[2],pmax[3],negpmax[3],pmax[4],negpmax[4],pmax[5],negpmax[5],...,negpmax[9],pmax[10],negpmax[10],pmax[11],negpmax[11],pmax[13],negpmax[13],pmax[14],negpmax[14],score
2,-0.711719,0.106909,-0.441961,0.009510,-0.848685,0.137472,-0.832614,0.031988,-1.141005,0.858277,...,-0.748229,0.083624,-0.019371,-0.595660,0.081664,-1.210932,0.464910,-0.602589,0.039315,1
3,-0.684792,0.165512,-0.600365,0.000120,-0.892459,0.131305,-0.753976,0.038710,-1.036258,0.882947,...,-0.746272,-0.001023,0.000834,-0.519378,0.068776,-1.171136,0.448051,-0.574763,0.028081,1
4,-0.790196,0.113647,-0.526214,0.002648,-0.840038,0.139111,-0.890003,0.020064,-0.959713,0.902269,...,-0.828382,0.145767,0.004424,-0.377900,0.067464,-1.161445,0.502746,-0.684512,0.029485,1
5,-0.761922,0.193033,-0.605966,0.002262,-0.817495,0.138789,-0.827618,0.022207,-1.088317,0.892385,...,-0.660310,0.180483,0.015728,-0.583469,0.062709,-1.200117,0.484278,-0.725118,0.027887,1
6,-0.714769,0.152889,-0.434056,0.003651,-0.865775,0.140308,-0.800334,0.029229,-0.965757,0.845611,...,-0.777175,0.129421,-0.028425,-0.537785,0.072425,-1.193584,0.503548,-0.686299,0.031184,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385494,2.320114,-0.724913,7.784918,-0.164526,1.729602,-0.363508,-0.188855,0.028398,-1.071899,0.876243,...,0.063022,-1.110332,0.126261,-0.665714,0.094045,-0.772206,0.462486,-0.352577,0.034247,1
385495,1.984914,-0.570141,6.092468,-0.140522,1.443518,-0.335376,-0.224333,0.045489,-0.948231,0.868763,...,0.043895,-1.093250,0.132743,-0.768189,0.055052,-0.778056,0.414618,-0.608070,0.037437,1
385496,2.056252,-0.565863,6.408689,-0.167727,1.319551,-0.347221,-0.253281,0.028416,-1.061089,0.844477,...,0.054937,-0.857272,0.132563,-0.319939,0.073038,-0.844124,0.471563,-0.272023,0.031033,1
385497,1.796743,-0.726257,6.896955,-0.173545,1.866020,-0.356328,0.144617,0.028420,-0.940133,0.899318,...,0.041449,-1.040976,0.140699,-0.776684,0.079367,-0.730563,0.448717,-0.367520,0.041807,1


In [291]:
# Keep the targets aligned with the predictors: attach the same 'score' column
# and apply the same inlier mask that was used on X_train_normalized.
Y_train = pd.concat([Y_train, df_outlier_scores], axis=1).loc[score_mask]

In [292]:
# Remove the helper 'score' column now that the outlier rows have been filtered out.
# Reassign instead of using inplace=True: both frames were produced by boolean
# filtering, and in-place edits of such derived frames can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
X_train_normalized = X_train_normalized.drop(columns=['score'])
Y_train = Y_train.drop(columns=['score'])

In [294]:
# Multi-output MLP regressor predicting the two target columns (x, y) jointly.
mlp = MLPRegressor(
    hidden_layer_sizes=(500,),  # one hidden layer of 500 units; the trailing comma makes this a real 1-tuple
    activation="logistic",
    learning_rate_init=0.01,
    max_iter=3000,              # upper bound on training iterations
    n_iter_no_change=200,       # iterations without sufficient improvement tolerated before stopping
    random_state=42,            # fixed seed for reproducible initialisation
    verbose=1,                  # print the loss at every iteration
)
mlp.fit(X_train_normalized, Y_train)

Iteration 1, loss = 2735.67655480
Iteration 2, loss = 19.94618917
Iteration 3, loss = 12.82177277
Iteration 4, loss = 10.22372587
Iteration 5, loss = 8.85961349
Iteration 6, loss = 8.10760726
Iteration 7, loss = 7.65889021
Iteration 8, loss = 7.38769725
Iteration 9, loss = 7.14485781
Iteration 10, loss = 6.99602409
Iteration 11, loss = 6.86520675
Iteration 12, loss = 6.77984790
Iteration 13, loss = 6.66800194
Iteration 14, loss = 6.60699318
Iteration 15, loss = 6.54218998
Iteration 16, loss = 6.48961924
Iteration 17, loss = 6.45062009
Iteration 18, loss = 6.42232811
Iteration 19, loss = 6.37072551
Iteration 20, loss = 6.32525930
Iteration 21, loss = 6.31652125
Iteration 22, loss = 6.29179722
Iteration 23, loss = 6.26817226
Iteration 24, loss = 6.25211587
Iteration 25, loss = 6.21585744
Iteration 26, loss = 6.19423263
Iteration 27, loss = 6.18594447
Iteration 28, loss = 6.16900677
Iteration 29, loss = 6.14359470
Iteration 30, loss = 6.13028422
Iteration 31, loss = 6.12543620
Iteration 3

In [295]:
# Predict the two targets for every row of the standardised evaluation predictors.
y_pred_eval = mlp.predict(eval_normalized)
# Leftover sanity check: prints the container type of the predictions.
print(type(y_pred_eval))
# NOTE(review): `round_to_nearest_5` is neither defined nor imported in the visible
# cells, so this line relies on hidden kernel state or a removed cell -- confirm the
# notebook survives Restart Kernel -> Run All.
# NOTE(review): `y_pred_rounded` is not read by any visible later cell; the submission
# is built from the unrounded `y_pred_eval` -- confirm that this is intended.
y_pred_rounded = round_to_nearest_5(y_pred_eval)

<class 'numpy.ndarray'>


In [296]:
# Turn the raw prediction array into the two-column submission table:
#   Id        -- the positional row number of the prediction
#   Predicted -- the string "<x>|<y>"
# Note that `y_pred_eval` is rebound from the prediction array to this DataFrame.
y_pred_eval = (
    pd.DataFrame(y_pred_eval, columns=['x', 'y'])
    .assign(Predicted=lambda coords: coords['x'].astype(str) + "|" + coords['y'].astype(str))
    .drop(columns=['x', 'y'])
    .reset_index()
    .rename(columns={'index': 'Id'})
    .loc[:, ['Id', 'Predicted']]
)

In [298]:
# Write the submission file: only the Id and Predicted columns, without the DataFrame index.
y_pred_eval.to_csv(
    'output.csv',
    columns=["Id", "Predicted"],
    index=False,
)