In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import itertools
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import make_scorer
import concurrent.futures
from threading import Lock

from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
import keras.backend as K
import tensorflow as tf
from scikeras.wrappers import KerasRegressor

## Preprocessing

Load the CUP dataset

In [None]:
# Column names for the blind test file: the row ID followed by the ten input features.
header = ['ID'] + list('abcdefghij')

# The file carries no header row of its own; its first 7 lines are skipped.
blind = pd.read_csv(
    "./CUP/ML-CUP21-TS.csv",
    sep=',',
    header=None,
    names=header,
    skiprows=7,
)
blind

In [None]:
# Promote the ID column to row labels, then remove it from the feature columns.
blind_ids = blind['ID'].values
blind = blind.drop(columns='ID')
blind.index = blind_ids
blind

In [None]:
# From here on `blind` is the bare NumPy feature matrix (row labels are dropped).
blind = blind.to_numpy()

In [None]:
# Training file layout: row ID, ten input features, two regression targets.
header = ['ID'] + list('abcdefghij') + ['Class_x', 'Class_y']

# The file carries no header row of its own; its first 7 lines are skipped.
df = pd.read_csv(
    "./CUP/ML-CUP21-TR.csv",
    sep=',',
    header=None,
    names=header,
    skiprows=7,
)

# Promote the ID column to row labels, then remove it from the data columns.
train_ids = df['ID'].values
df = df.drop(columns='ID')
df.index = train_ids

In [None]:
# Display the training DataFrame (IDs as row labels, features a-j plus Class_x / Class_y).
df

In [None]:
# Per-column summary statistics of the training DataFrame.
df.describe()

In [None]:
# Every column except the two targets is an input feature.
target_cols = ['Class_x', 'Class_y']
col = [name for name in df.columns if name not in target_cols]

# NumPy views of the inputs (x) and of the 2-D targets (y).
x = df[col].values
y = df[target_cols].values

Training/test split with the hold-out approach (90%-10%)

In [None]:
# Hold out 10% of the samples as an internal test set; the seed is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)
print(y_train.shape, y_test.shape)

MEE (Mean Euclidean Error) definition

In [None]:
def mean_euclidean_error_tf(y_true, y_pred):
    """Mean Euclidean Error written with Keras backend ops (usable as loss/metric).

    The squared differences are summed over the last axis, square-rooted to get
    one Euclidean distance per sample, and those distances are averaged.
    """
    squared_diff = K.square(y_pred - y_true)
    per_sample_distance = K.sqrt(K.sum(squared_diff, axis=-1))
    return K.mean(per_sample_distance)

In [None]:
def mean_euclidean_error(y_true, y_pred):
    """Mean Euclidean Error between two NumPy arrays.

    The squared differences are summed over the last axis, square-rooted to get
    one Euclidean distance per sample, and those distances are averaged.
    """
    squared_diff = np.square(y_pred - y_true)
    per_sample_distance = np.sqrt(np.sum(squared_diff, axis=-1))
    return np.mean(per_sample_distance)

In [None]:
# scikit-learn scorer built on the MEE; it is an error, so lower is better.
score = make_scorer(
    mean_euclidean_error,
    greater_is_better=False,
)

In [None]:
def build_model(weight_init=0.2, weight_distr=0, activ='relu',layer=1, unit=4, eta=0.2, alpha=0.5, lambd=0):
  """Build and compile a fully connected regressor trained with SGD + momentum.

  The network takes 10 input features, stacks `layer` hidden Dense layers of
  `unit` neurons each and ends with a 2-unit linear output layer. It is
  compiled with the Mean Euclidean Error as both loss and metric.

  Parameters
  ----------
  weight_init : float
      Half-width of the uniform range when `weight_distr == 0`, or standard
      deviation of the normal distribution when `weight_distr == 1`.
      Ignored for any other `weight_distr`.
  weight_distr : int
      0 -> RandomUniform, 1 -> RandomNormal, anything else -> GlorotNormal.
      The chosen initializer is used for both kernels and biases.
  activ : str
      Activation of the hidden layers. NOTE: this argument used to be ignored
      (the hidden layers were always 'tanh'); it is now honoured.
  layer : int
      Number of hidden layers.
  unit : int
      Neurons per hidden layer.
  eta : float
      SGD learning rate.
  alpha : float
      SGD momentum coefficient (Nesterov disabled).
  lambd : float
      L2 penalty applied to the kernels of every Dense layer.

  Returns
  -------
  tf.keras.Model
      The compiled Sequential model.
  """
  # Fix TensorFlow's global seed before any initializer is created.
  tf.random.set_seed(0)

  if weight_distr==0:
    init= tf.keras.initializers.RandomUniform(minval=-weight_init, maxval=weight_init)
  elif weight_distr==1:
    init= tf.keras.initializers.RandomNormal(mean=0., stddev=weight_init)
  else:
    init= tf.keras.initializers.GlorotNormal()

  reg= tf.keras.regularizers.l2(l2=lambd)

  model= tf.keras.models.Sequential()
  # The shape must be a tuple: `Input(10,)` passed the bare integer 10.
  model.add(tf.keras.layers.Input(shape=(10,)))
  for _ in range(layer):
    model.add(tf.keras.layers.Dense(unit, activation=activ, kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))
  model.add(tf.keras.layers.Dense(2, activation='linear', kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))

  opt= tf.keras.optimizers.SGD(learning_rate=eta, momentum=alpha, nesterov=False)
  model.compile(loss=mean_euclidean_error_tf,
                optimizer=opt,
                metrics=[mean_euclidean_error_tf])

  return model

In [None]:
def build_model_Adam(weight_init=0.2, weight_distr=0, activ='relu',layer=1, unit=4, eta=0.2, alpha=0.5, lambd=0, beta_1=0.9, beta_2=0.999, epsilon=0.0000001):
  """Build and compile a fully connected regressor trained with Adam.

  The network takes 10 input features, stacks `layer` hidden Dense layers of
  `unit` neurons each and ends with a 2-unit linear output layer. It is
  compiled with the Mean Euclidean Error as both loss and metric.

  Parameters
  ----------
  weight_init : float
      Half-width of the uniform range when `weight_distr == 0`, or standard
      deviation of the normal distribution when `weight_distr == 1`.
      Ignored for any other `weight_distr`.
  weight_distr : int
      0 -> RandomUniform, 1 -> RandomNormal, anything else -> GlorotNormal.
      The chosen initializer is used for both kernels and biases.
  activ : str
      Activation of the hidden layers. NOTE: this argument used to be ignored
      (the hidden layers were always 'tanh'); it is now honoured.
  layer : int
      Number of hidden layers.
  unit : int
      Neurons per hidden layer.
  eta : float
      Adam learning rate.
  alpha : float
      Not used by this builder; it is kept in the signature but never read.
  lambd : float
      L2 penalty applied to the kernels of every Dense layer.
  beta_1, beta_2, epsilon : float
      Forwarded unchanged to `tf.keras.optimizers.Adam`.

  Returns
  -------
  tf.keras.Model
      The compiled Sequential model.
  """
  # Fix TensorFlow's global seed before any initializer is created.
  tf.random.set_seed(0)

  if weight_distr==0:
    init= tf.keras.initializers.RandomUniform(minval=-weight_init, maxval=weight_init)
  elif weight_distr==1:
    init= tf.keras.initializers.RandomNormal(mean=0., stddev=weight_init)
  else:
    init= tf.keras.initializers.GlorotNormal()

  reg= tf.keras.regularizers.l2(l2=lambd)

  model= tf.keras.models.Sequential()
  # The shape must be a tuple: `Input(10,)` passed the bare integer 10.
  model.add(tf.keras.layers.Input(shape=(10,)))
  for _ in range(layer):
    model.add(tf.keras.layers.Dense(unit, activation=activ, kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))
  model.add(tf.keras.layers.Dense(2, activation='linear', kernel_initializer=init, bias_initializer=init, kernel_regularizer=reg))

  opt= tf.keras.optimizers.Adam(learning_rate=eta, beta_1=beta_1, beta_2=beta_2,epsilon=epsilon)
  model.compile(loss=mean_euclidean_error_tf,
                optimizer=opt,
                metrics=[mean_euclidean_error_tf])

  return model

In [None]:
class haltCallback(tf.keras.callbacks.Callback):
    """Keras callback that halts training once the training MEE reaches a target.

    At the end of every epoch the value logged under 'mean_euclidean_error_tf'
    is compared with `err`; training is stopped as soon as it is less than or
    equal to that threshold.

    Parameters
    ----------
    err : float
        Threshold on the logged 'mean_euclidean_error_tf' value.
    """

    def __init__(self, err):
        # Zero-argument super() so that Callback.__init__ really runs; the former
        # `super(tf.keras.callbacks.Callback, self)` started the lookup *after*
        # Callback and therefore skipped its initialisation.
        super().__init__()
        self.err=err

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None or may lack the metric; never compare None with a number.
        current = (logs or {}).get('mean_euclidean_error_tf')
        if current is not None and current <= self.err:
            self.model.stop_training = True

# Voting

In [None]:
class Voting():
    """Averaging ensemble of four regressors with pre-selected hyper-parameters.

    The members are a k-NN regressor, two Keras networks (SGD with momentum and
    Adam) and a random forest. `predict` refits every member on the training
    data it receives and returns the unweighted mean of their 2-D predictions.
    """

    # Constructor: stores the best hyper-parameters found by the grid searches
    # for the estimators to ensemble.
    def __init__(self) -> None:
        nn_param_batch={
            'weight_init': 0.4,
            'weight_distr': 1,
            'unit': 40,
            'layer': 3,
            'eta': 0.03,
            'alpha': 0.95,
            'lambd': 0.001,
            'activ': 'tanh',
        }
        # Training of the SGD network stops once its training MEE reaches this value.
        mean_err_batch= 0.8342830985784531
        trainingStopCallback_batch = haltCallback(mean_err_batch)

        nn_param_adam={
            'weight_init': 0.4,
            'weight_distr': 1,
            'unit': 40,
            'layer': 3,
            'eta': 0.025,
            'beta_1': 0.9,
            'beta_2': 0.7,
            'lambd': 0.001,
            'activ': 'tanh'
        }
        # Training of the Adam network stops once its training MEE reaches this value.
        mean_err_adam= 0.8765210807323456
        trainingStopCallback_adam = haltCallback(mean_err_adam)

        # Estimators trained in full-batch mode. Their batch size is set in
        # `predict` from the training data actually supplied, instead of being
        # read here from the notebook-level `x_train` global.
        self._full_batch = frozenset({'batch', 'adam'})

        # Estimator definitions with the best hyper-parameters.
        self.estimators=[
            ('knn', KNeighborsRegressor(n_neighbors= 9, p=1, weights='distance')),
            ('batch', KerasRegressor(build_model, **nn_param_batch, epochs=800, shuffle=True, verbose=0, callbacks=[trainingStopCallback_batch])),
            ('adam', KerasRegressor(build_model_Adam, **nn_param_adam, epochs=800, shuffle=True, verbose=0, callbacks=[trainingStopCallback_adam])),
            ('forest', RandomForestRegressor(max_depth=20, random_state=0, n_estimators=100, criterion='squared_error', min_samples_split=2, min_samples_leaf=2,
                                             max_features = 3, bootstrap=False))
        ]

        # Estimator names, in ensemble order.
        self.col= [name for (name, estimator) in self.estimators]

    def predict(self, X_train, y_train, X_test):
        """Refit every base estimator on (X_train, y_train) and average their predictions on X_test.

        Parameters
        ----------
        X_train, y_train : array-like
            Training inputs and 2-column targets used to refit each estimator.
        X_test : array-like
            Inputs to predict.

        Returns
        -------
        numpy.ndarray of shape (len(X_test), 2)
            Mean over the estimators of the first two prediction columns.
        """
        full_batch = getattr(self, '_full_batch', ())
        per_estimator = []

        for (name, estimator) in self.estimators:
            if name in full_batch:
                # Full-batch training: one gradient step per epoch over all samples.
                estimator.set_params(batch_size=len(X_train))
            estimator.fit(X_train, y_train)
            pred_test = np.asarray(estimator.predict(X_test))
            # Keep exactly the two target columns (raises if a model returns fewer).
            per_estimator.append(pred_test[:, [0, 1]])

        # Stack to (n_estimators, n_samples, 2) and average over the estimators.
        return np.mean(np.stack(per_estimator, axis=0), axis=0)

In [None]:
# Instantiate the averaging ensemble (despite the name `stk`, this is the Voting class).
stk= Voting()


In [None]:
# Fit the ensemble on the training split and predict that same split.
y_pred_train= stk.predict(x_train, y_train, x_train)

In [None]:
# MEE of the ensemble on the training split.
mean_euclidean_error(y_train, y_pred_train)

In [None]:
# Refit on the training split and predict the held-out internal test split.
y_pred_test= stk.predict(x_train, y_train, x_test)

In [None]:
# MEE of the ensemble on the held-out internal test split.
mean_euclidean_error(y_test, y_pred_test)

In [None]:
# Refit on the training split and predict the blind test inputs.
y_pred_blind= stk.predict(x_train, y_train, blind)

In [None]:
# Display the blind-set predictions returned by the ensemble.
y_pred_blind

In [None]:
# Wrap the blind predictions in a DataFrame (default 0-based row labels and column names).
df_blind=pd.DataFrame(y_pred_blind)
df_blind

In [None]:
# Make the row labels start at 1 instead of 0. Assigning an explicit RangeIndex
# (rather than `index += 1`) keeps this cell idempotent: re-running it no longer
# shifts the labels one step further each time.
df_blind.index = pd.RangeIndex(start=1, stop=len(df_blind) + 1)
df_blind

In [None]:
# Write the blind predictions to CSV (row labels and column names are included by default).
df_blind.to_csv("./CUP/blind_prediction.csv")    
