In [None]:
import numpy as np
from mlinsights.mlmodel import PiecewiseRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import random

from collections import defaultdict

import os 
import pickle

from helpers.quadratic import generate_Xy

In [None]:
# Experiment configuration: problem sizes, result containers and random seeds.

n_max = 5      # largest dimension of the input vector
n_sample = 10  # number of quadratic functions sampled for each dimension
# n_points[n-1] points are sampled for a quadratic function whose input has dimension n
n_points = [100, 500, 2000, 5000, 10000]

# Containers for the MSEs (PWL: model tree, PWC: CART, RF: random forest,
# RELU: ReLU network).
mses_n_PWL, mses_n_PWC = [], []
mses_n_RF, mses_n_RELU = [], []
mses_RF, mses_RELU = defaultdict(dict), defaultdict(dict)

# Containers for the best model found by each grid search, keyed by dimension.
best_models_PWL, best_models_PWC = defaultdict(list), defaultdict(list)
best_models_RF, best_models_RELU = {}, {}

# Seed both global generators before drawing the per-function seeds.
global_seed = 777
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(global_seed)

# One seed per sampled function; seeds[i] is reused for the i-th function of
# every dimension.
seeds = np.random.randint(1, 1000, size=(1, n_sample)).ravel()
print(seeds)

In [None]:
# Train and evaluate the four model families (model tree, CART, random forest,
# ReLU network) on every sampled quadratic function of every dimension.

def _reset_global_seeds():
    """Re-seed Python's and NumPy's global RNGs with ``global_seed``."""
    random.seed(global_seed)
    np.random.seed(global_seed)


def _save_split_csv(X_part, y_part, columns, csv_name):
    """Write one data split (feature columns followed by the target) to ``csv_name``."""
    table = np.concatenate((X_part, y_part.reshape(-1, 1)), axis=1)
    pd.DataFrame(table, columns=columns).to_csv(csv_name, index=False)


def _fit_and_score(search, X_fit, y_fit, X_eval, y_eval):
    """Fit ``search`` and return ``(MSE on the evaluation split, best estimator)``."""
    search.fit(X_fit, y_fit)
    predictions = search.predict(X_eval)
    return mean_squared_error(y_eval, predictions), search.best_estimator_


# Empty the per-dimension lists in place so that re-running this cell does not
# append a second copy of the results.
for _mses in (mses_n_PWL, mses_n_PWC, mses_n_RF, mses_n_RELU):
    _mses.clear()

for n in tqdm(range(1, n_max + 1)):

    mses_PWL = []
    mses_PWC = []
    mses_RF[n] = []
    mses_RELU[n] = []

    # Reset every best-model list for this dimension (the tree-based ones used
    # to keep growing when the cell was re-run).
    best_models_PWL[n] = []
    best_models_PWC[n] = []
    best_models_RF[n] = []
    best_models_RELU[n] = []

    # column names used when saving csv files for the training & testing data
    columns_name = ['x' + str(j + 1) for j in range(n)] + ['y']

    # candidate values of min_samples_leaf shared by all tree-based models
    leaf_grid = range(5, 20 * n + 1, 5)

    for i in tqdm(range(n_sample)):
        # set random seed; random.seed() rejects NumPy integers on
        # Python >= 3.11, so convert to a built-in int first
        seed_i = int(seeds[i])
        random.seed(seed_i)
        np.random.seed(seed_i)

        # generate data points
        X, y = generate_Xy(n, n_points[n - 1])

        # data splitting and saving
        X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
            X, y, random_state=seed_i, test_size=0.25)
        _save_split_csv(X_train_raw, y_train_raw, columns_name,
                        './' + str(n) + '_' + str(i) + '_train.csv')
        _save_split_csv(X_test_raw, y_test_raw, columns_name,
                        './' + str(n) + '_' + str(i) + '_test.csv')

        # scaling: both scalers are fitted on the training split only
        sc_X = MinMaxScaler()
        sc_y = MinMaxScaler()

        X_train = sc_X.fit_transform(X_train_raw)
        y_train = sc_y.fit_transform(y_train_raw.reshape(-1, 1))
        y_train = np.squeeze(y_train)

        X_test = sc_X.transform(X_test_raw)
        # y_test stays in the scaled space, so every MSE below is measured on
        # the min-max-scaled target
        y_test = sc_y.transform(y_test_raw.reshape(-1, 1))

        # Model trees
        _reset_global_seeds()
        gs0 = GridSearchCV(
            PiecewiseRegressor(verbose=False, estimator=LinearRegression()),
            param_grid={'binner': [DecisionTreeRegressor(min_samples_leaf=leaf,
                                                         random_state=0)
                                   for leaf in leaf_grid]},
            return_train_score=True, cv=5,
            scoring='neg_mean_squared_error')
        mse, best_model = _fit_and_score(gs0, X_train, y_train, X_test, y_test)
        mses_PWL.append(mse)
        best_models_PWL[n].append(best_model)

        # CARTs
        _reset_global_seeds()
        gs = GridSearchCV(
            DecisionTreeRegressor(),
            param_grid={'min_samples_leaf': leaf_grid},
            return_train_score=True, cv=5,
            scoring='neg_mean_squared_error')
        mse, best_model = _fit_and_score(gs, X_train, y_train, X_test, y_test)
        mses_PWC.append(mse)
        best_models_PWC[n].append(best_model)

        # Random forests
        _reset_global_seeds()
        gs_rf = GridSearchCV(
            RandomForestRegressor(),
            param_grid={'min_samples_leaf': leaf_grid,
                        'n_estimators': [5, 10, 20, 50, 100],
                        'max_features': range(1, n + 1)},
            return_train_score=True, cv=5,
            scoring='neg_mean_squared_error',
            verbose=0)
        mse, best_model = _fit_and_score(gs_rf, X_train, y_train, X_test, y_test)
        mses_RF[n].append(mse)
        best_models_RF[n].append(best_model)

        # RELU
        _reset_global_seeds()
        gs_relu = GridSearchCV(
            MLPRegressor(max_iter=10000),
            param_grid={'learning_rate_init': [1e-2, 2e-2, 5e-2, 1e-3, 2e-3, 5e-3, 1e-4],
                        'hidden_layer_sizes': [(10, 5), (20, 10), (40, 20), (50, 20), (50, 30, 30)],
                        'learning_rate': ['constant']},
            return_train_score=True, cv=5,
            scoring='neg_mean_squared_error',
            verbose=0)
        mse, best_model = _fit_and_score(gs_relu, X_train, y_train, X_test, y_test)
        mses_RELU[n].append(mse)
        best_models_RELU[n].append(best_model)

    # Store this dimension's list of MSEs for each model family. For RF/RELU
    # the per-dimension list is appended, not the whole dict, which would put
    # the same object in the list once per dimension.
    mses_n_PWL.append(mses_PWL)
    mses_n_PWC.append(mses_PWC)
    mses_n_RF.append(mses_RF[n])
    mses_n_RELU.append(mses_RELU[n])

In [None]:
# Print the collected MSEs in the order: model tree, CART, random forest, ReLU network.
for _collected in (mses_n_PWL, mses_n_PWC, mses_n_RF, mses_n_RELU):
    print(_collected)

In [None]:
def pickle_save(path, file, filename):
    """Pickle ``file`` into ``<path>/<filename>.pickle``.

    Parameters
    ----------
    path : str
        Directory that receives the pickle; it must already exist.
    file : object
        The Python object to serialize (not a file handle).
    filename : str
        Base name of the output file, without the ``.pickle`` extension.
    """
    target = '/'.join((path, filename + '.pickle'))
    with open(target, 'wb') as out:
        # Use the most recent pickle protocol this interpreter supports.
        pickle.dump(file, out, protocol=pickle.HIGHEST_PROTOCOL)

# Create the directory that will hold the pickled results.
path = './results'
if os.path.isdir(path):
    # Keep the notice shown when the notebook is re-run.
    print('Folder already exists')
# exist_ok=True makes a re-run a no-op. A regular file occupying the path
# still raises FileExistsError instead of being reported as an existing folder.
os.makedirs(path, exist_ok=True)

In [None]:
# Save results: one pickle per (model family, artefact) pair, written to `path`.
_results_to_save = [
    # model trees
    ('pwl_mses', mses_n_PWL),
    ('pwl_best_models', best_models_PWL),
    # CARTs
    ('pwc_mses', mses_n_PWC),
    ('pwc_best_models', best_models_PWC),
    # random forests
    ('rf_mses', mses_RF),
    ('rf_best_models', best_models_RF),
    # ReLU networks
    ('relu_mses', mses_RELU),
    ('relu_best_models', best_models_RELU),
]

for _pickle_name, _payload in _results_to_save:
    pickle_save(path, _payload, _pickle_name)