In [1]:
import numpy as np
import pickle
import torch
import time

from sklearn.metrics import r2_score, mean_absolute_error, explained_variance_score, mean_squared_error
from sklearn.model_selection import train_test_split
import autosklearn.regression
from sklearn.ensemble import RandomForestRegressor

In [2]:
X = pickle.load(open('inputs_and_outputs.pkl', 'rb'))['X']
print("shape of X: ", np.shape(X))
y = pickle.load(open('inputs_and_outputs.pkl', 'rb'))['y']
nb_data = np.size(y)
nb_data

shape of X:  (69839, 12)


69839

In [3]:
ids_train = [2, 5]
np.linalg.norm(X[1, :] - X[ids_train, :], axis=1)

array([0.17621688, 0.2123145 ])

In [4]:
def diverse_train_test_split(X, train_size):
    ids_train = [np.random.randint(0, nb_data)] # initialize with one random point; pick others in a max diverse fashion
    # select remaining training points
    for j in range(train_size - 1):
        # for each point, compute its min distance to training set
        min_distances_to_train_set = np.zeros((nb_data, ))
        for i in range(nb_data):
            # compute its distance to all points in the training set
            distances_to_train_set = np.linalg.norm(X[i, :] - X[ids_train, :], axis=1)
            min_distances_to_train_set[i] = np.min(distances_to_train_set)
        # select point with max min distance to train set (Furthest from train set)
        ids_train.append(np.argmax(min_distances_to_train_set))
    assert np.size(np.unique(ids_train)) == train_size
    ids_test = [i for i in range(nb_data) if not i in ids_train]
    assert np.size(np.unique(ids_test)) == nb_data - train_size
    return np.array(ids_train), np.array(ids_test)

In [5]:
ids_train, ids_test = diverse_train_test_split(X, 25)

In [11]:
diversify_training = False

In [12]:
def rf_run(nb_training_data, nb_acquire):
    if diversify_training:
        print("diverse RF run")
    else:
        print("RF run")
    print("\teval budget", nb_training_data + nb_acquire, "=", nb_training_data, "training data and", nb_acquire, "acquired.")
    # test/train split
    if diversify_training:
        ids_train, ids_test = diverse_train_test_split(X, nb_training_data)
    else:
        ids_train, ids_test = train_test_split(np.arange(nb_data), train_size=nb_training_data)
    
    X_train = X[ids_train, :]
    X_test  = X[ids_test, :]
    
    y_train = y[ids_train]
    y_test  = y[ids_test]
    
    # train random forest on training data
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)

    # hv random forest make predictions on test data
    y_pred = rf.predict(X_test)

    # rank the test predictions
    ids_test_ranked = np.flip(np.argsort(y_pred))

    # acquire the COFs in the test set with highest predicted property
    ids_acquire = ids_test[ids_test_ranked[:nb_acquire]]

    # return the acquired COFs but also the trained COFs which count.
    ids_acquire_incld_training = np.concatenate((ids_acquire, ids_train))
    
    assert np.size(np.unique(ids_acquire_incld_training)) == nb_training_data + nb_acquire
    
    print("\tmax y acquired = ", np.max(y[ids_acquire_incld_training]))
    return ids_acquire_incld_training

In [16]:
rf_res = dict()
rf_res['nb_runs']          = 2
rf_res['nb_evals_budgets'] = [50 * i for i in range(1, 5)]
rf_res['ids_acquired']     = [[] for b in rf_res['nb_evals_budgets']]
for b in range(len(rf_res['nb_evals_budgets'])):
    print("budget for evals:", nb_evals_budget)
    nb_evals_budget = rf_res['nb_evals_budgets'][b]
    # decide how to spend the evals budget here. say 50/50
    nb_training_data = nb_evals_budget // 2
    nb_acquire = nb_evals_budget // 2
    assert nb_training_data + nb_acquire == nb_evals_budget
    for r in range(rf_res['nb_runs']):
        print("\trun", r)
        ids_acquired = rf_run(nb_training_data, nb_acquire)
        rf_res['ids_acquired'][b].append(ids_acquired)
#     torch.save({'ids_acquired': ids_acquired}, 'rf_run' + str(r) + '.pkl')

if diversify_training:
    with open('rf_results.pkl', 'wb') as file:
        pickle.dump(rf_res, file)
else:
    with open('rf_div_results.pkl', 'wb') as file:
        pickle.dump(rf_res, file)

budget for evals: 200
	run 0
RF run
	eval budget 50 = 25 training data and 25 acquired.
	max y acquired =  216.894110699
	run 1
RF run
	eval budget 50 = 25 training data and 25 acquired.
	max y acquired =  181.95215032099998
budget for evals: 50
	run 0
RF run
	eval budget 100 = 50 training data and 50 acquired.
	max y acquired =  193.360391942
	run 1
RF run
	eval budget 100 = 50 training data and 50 acquired.
	max y acquired =  194.938530808
budget for evals: 100
	run 0
RF run
	eval budget 150 = 75 training data and 75 acquired.
	max y acquired =  195.928348822
	run 1
RF run
	eval budget 150 = 75 training data and 75 acquired.
	max y acquired =  193.25083398700002
budget for evals: 150
	run 0
RF run
	eval budget 200 = 100 training data and 100 acquired.
	max y acquired =  193.620114578
	run 1
RF run
	eval budget 200 = 100 training data and 100 acquired.
	max y acquired =  208.120454446
