In [96]:
import numpy as np
import pickle
import torch
import time

from sklearn.metrics import r2_score, mean_absolute_error, explained_variance_score, mean_squared_error
from sklearn.model_selection import train_test_split
import autosklearn.regression
from sklearn.ensemble import RandomForestRegressor

In [97]:
# Read the pickle once (the file handle is closed by the `with` block) and
# pull both arrays from the same deserialized object.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# 'inputs_and_outputs.pkl' if it comes from a trusted source.
with open('inputs_and_outputs.pkl', 'rb') as f:
    inputs_and_outputs = pickle.load(f)
X = inputs_and_outputs['X']
y = inputs_and_outputs['y']
# number of elements in y; used below as the range of ids handed to the split
nb_data = np.size(y)
nb_data

69839

In [98]:
def rf_run(nb_training_data, nb_acquire, random_state=None):
    """Train a random forest on a random subset and acquire the top-predicted test points.

    Uses the module-level ``X``, ``y`` and ``nb_data``. The data are randomly
    split into ``nb_training_data`` training points and a test set holding the
    rest; a ``RandomForestRegressor`` is fit on the training points and the
    ``nb_acquire`` test points with the highest predicted value are acquired.

    Parameters
    ----------
    nb_training_data : passed as ``train_size`` to ``train_test_split``.
    nb_acquire : number of top-ranked test points to acquire.
    random_state : optional seed forwarded to both ``train_test_split`` and
        ``RandomForestRegressor``. The default ``None`` keeps the original
        unseeded behavior.

    Returns
    -------
    numpy array of ids (indices into ``np.arange(nb_data)``): the acquired
    ids first, followed by the training ids.

    Raises
    ------
    AssertionError
        If the number of unique returned ids is not
        ``nb_training_data + nb_acquire``.
    """
    print("RF run w ", nb_training_data, "training data and", nb_acquire, "acquired.")
    # test/train split; the id array is split alongside X and y so that every
    # row can be mapped back to its position in the full data set
    X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
        X, y, np.arange(nb_data),
        train_size=nb_training_data, random_state=random_state
    )

    # train random forest on training data
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, y_train)

    # have the random forest make predictions on test data
    y_pred = rf.predict(X_test)

    # rank the test predictions, highest predicted value first
    ids_test_ranked = np.flip(np.argsort(y_pred))

    # acquire the COFs in the test set with highest predicted property.
    # Index with the ranking computed just above: the original referenced an
    # undefined name `ids_ranked`, which either raised NameError or silently
    # picked up a stale global from the kernel.
    ids_acquire = ids_test[ids_test_ranked[:nb_acquire]]

    # return the acquired COFs but also the trained COFs which count.
    ids_acquire_incld_training = np.concatenate((ids_acquire, ids_train))

    # sanity check: acquired and training ids are disjoint and of expected count
    assert np.size(np.unique(ids_acquire_incld_training)) == nb_training_data + nb_acquire

    return ids_acquire_incld_training

In [99]:
# Repeat the random-forest acquisition experiment `nb_runs` times over a
# sweep of evaluation budgets (50, 100, 150, 200).
nb_runs = 2
nb_evals_budgets = [50 * i for i in range(1, 5)]
for r in range(nb_runs):
    print("\n\nRUN", r)
    for nb_evals_budget in nb_evals_budgets:
        # decide how to spend the evals budget here. say 50/50
        nb_training_data = nb_evals_budget // 2
        # give the remainder to acquisition so an odd budget is still fully
        # spent (two floor-halves would sum to budget - 1 and trip the assert)
        nb_acquire = nb_evals_budget - nb_training_data
        assert nb_training_data + nb_acquire == nb_evals_budget

        ids_acquired = rf_run(nb_training_data, nb_acquire)
        print(ids_acquired)
    # NOTE(review): this save runs once per run, after the budget sweep, so
    # only the result for the LAST budget is written to disk; results for the
    # smaller budgets are printed but not persisted. Confirm this is intended.
    torch.save({'ids_acquired': ids_acquired}, 'rf_run' + str(r) + '.pkl')



RUN 0
RF run w  25 training data and 25 acquired.
[58527 62321 45853 13274 42290 51234 35298 63189  1273 28980 11906 60007
 55780 45555  1756 25299 37383 58011 56546 64500 69710  2201 43411 24978
  1311  2669  7365 69716 45263  2068 57313 16539 34581 33007 39590 55529
 26496 36215 31376 53126 38189 39054 58505 15733 42146 45071 39338 48444
 40797  7433]
RF run w  50 training data and 50 acquired.
[23681 16553   431 43235 44212 69826 23494 61211 45706 36239 25495 60465
 66325 38389 46947 23161 28664 39126 41091 34210 65566 40535 59766 49436
 31750 63486 18088 21971  6551 45159 10703 57890 51750 64584 18927  7910
 51565 46526 51216 51402 69408 27043 61787 12539 69609 11483 37919 48775
 21305 36931 29135 47822 43360 18751 35274 32053 22513 44924 63898 61109
 61769 29207 11269 57926  8550 41769 23917  7962 52619 12844 55227 52926
  6731 40969  8781 46004 29359  8866 22996 66153  1869 64551 69355 55074
 19210 62748 15950 57961 28854 67643 12500 69147 62542 55934 51438 24453
 59008 39278 6