In [1]:
import warnings
import numpy as np

from importlib import reload

import wandb

from context import omphalos

# Import Omphalos modules.
from omphalos import generate_inputs as gi
from omphalos import file_methods as fm
from omphalos import my_metrics as mm
from omphalos import omphalos_plotter as op
from omphalos import attributes as attr
from omphalos import labels as lbls
from omphalos import spatial_constructor

from matplotlib import pyplot as plt

%matplotlib inline
%precision 4

'%.4f'

In [6]:
%%time

# Pin NumPy's global random state for the rest of the session.
np.random.seed(0)

# NOTE(review): presumably fm.unpickle wraps pickle, which can execute
# arbitrary code on load -- only point this at files you trust.
rifle_runs_path = 'data/old_rifle.pkl'
train_set = fm.unpickle(rifle_runs_path)

# Per-run model inputs (taken from the 'x_begin' boundary) and per-run targets
# (secondary precipitation), both derived from the same collection of runs.
attributes_df = attr.boundary_condition(train_set, boundary='x_begin')
labels_df = lbls.secondary_precip(train_set)

CPU times: user 6min 16s, sys: 1min 24s, total: 7min 40s
Wall time: 8min 46s


In [7]:
from sklearn.model_selection import train_test_split

# Model inputs: the five boundary-condition species the regressor is trained on.
feature_species = ['NH4+', 'SO4--', 'Ca++', 'Acetate', 'CO2(aq)']
x = attributes_df.loc[:, feature_species]

# Target: FeS(am) + FeS34(am), summed over the first index level of labels_df.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# `groupby(level=...).sum()` is the supported spelling. `sort=False` keeps the
# groups in first-appearance order so `y` stays positionally aligned with `x`
# (assumes attributes_df rows follow that same order -- TODO confirm).
# The aggregation is also done once instead of once per mineral.
label_totals = labels_df.groupby(level=0, sort=False).sum()
y = label_totals['FeS(am)'] + label_totals['FeS34(am)']

x = x.to_numpy()
y = y.to_numpy().reshape(-1, 1)
# Targets are scaled up by 1e4 for training; the plotting cells multiply the
# predictions by 1e-4 to undo this.
y = y * 1e4

# Fixed random_state so the 80/20 split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [None]:
%%time
import xgboost as xgb

# Log this training run to the 'xgboost_rifle' Weights & Biases project.
wandb.init(project='xgboost_rifle')

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# xgb.train applies early stopping to the LAST entry of `evals`. The held-out
# set therefore has to come last; with the training set last, early stopping
# would track training loss instead of held-out loss.
evallist = [(dtrain, 'train'), (dtest, 'eval')]

# eta: learning rate; alpha / lambda: L1 / L2 regularisation weights.
param = {'max_depth': 6, 'eta': 0.0001, 'objective': 'reg:pseudohubererror', 'alpha': 1, 'lambda': 1}

# Filled in by xgb.train with the per-round metric history of each eval set.
results = {}

# Upper bound on boosting rounds; early stopping may end the run sooner.
num_round = 1000000
bst = xgb.train(
    param,
    dtrain,
    num_round,
    evallist,
    evals_result=results,
    early_stopping_rounds=500,
    verbose_eval=False,
    callbacks=[wandb.xgboost.wandb_callback()],
)

In [8]:
import gpflow_plotting as gpfp
from importlib import reload
# Re-import the plotting helpers so edits made to the module are picked up
# without restarting the kernel.
reload(gpfp)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names. Prefer the new name when this Matplotlib knows
# it, otherwise fall back to the original one.
talk_style = 'seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk'
plt.style.use(talk_style)


# Make figure and layout.
fig = plt.figure(figsize=(25, 16))
fig.subplots_adjust(wspace=1, hspace=0.25)

# Generate uneven subplots on a 2x6 grid: three double-width panels on the
# top row and two double-width panels on the bottom row.
sub1 = fig.add_subplot(2, 6, (1, 2), xlim=(0, 30), ylim=(0, 0.004))
sub2 = fig.add_subplot(2, 6, (3, 4))
sub3 = fig.add_subplot(2, 6, (5, 6))
sub4 = fig.add_subplot(2, 6, (8, 9))
sub5 = fig.add_subplot(2, 6, (10, 11))

axis_list = [sub1, sub2, sub3, sub4, sub5]

# One panel per swept species; the three lists below are index-aligned.
species_list = ['NH4+', 'SO4--', 'Ca++', 'Acetate', 'CO2(aq)']
# The whole charge sits inside braces: '$^2+$' would superscript only the
# '2' and leave the '+' on the baseline.
axis_labels = ['[NH$_4^+$] (mM)', '[SO$_4^{2-}$] (mM)', '[Ca$^{2+}$] (mM)', '[Acetate] (mM)', '[CO$_{2(aq)}$] (mM)']

# Prediction grid: `samples` points per sweep over `dims` input columns.
samples = 50
dims = 5
plot_vars = np.arange(dims)

# Range swept along the x axis of every panel.
predict_range = (0, 30)

# Baseline input grid; columns follow species_list.
defaults = np.ones((samples, dims))
# Default (Druhan 2014)
defaults[:, 0] = 1.5
defaults[:, 1] = 8.8
defaults[:, 2] = 4.8
defaults[:, 3] = 9.7
defaults[:, 4] = 0.0325

# Draw the reference sweep for each species on its own panel.
for x_label, species, ax in zip(axis_labels, species_list, axis_list):
    path = 'data/rifle_sweeps/{}.pkl'.format(species)

    sweep_x, sweep_y = gpfp.get_sweep(path, species, 'amendment', ('FeS(am)', 'FeS34(am)'))
    ax.scatter(sweep_x, sweep_y, c="k", marker='+', s=100, label='RTM results')

    ax.set_xlabel(x_label, fontsize=20)
    ax.set_ylabel('Net pyrite precipitation (vol. frac.)', fontsize=20)
    ax.set_xlim(0, 30)
    # Thin the tick labels: keep only every fifth automatic tick.
    ax.set_xticks(ax.get_xticks()[::5])
    ax.set_ylim(0.00, 0.0040)

import xgboost as xgb

# Numbers of boosting rounds (trees) to compare in the convergence figure.
size_list = 10, 100, 1000, 5000, 10000

# Everything below is identical for every forest size, so build it once
# instead of on each loop iteration.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# NOTE(review): xgb.train applies early stopping to the last entry of `evals`
# (here the training set). The order is left unchanged on purpose: the number
# of trees is the variable being compared, so each run should normally use
# its full `forest_size` rounds.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
param = {'max_depth': 6, 'eta': 0.0001, 'objective': 'reg:pseudohubererror', 'alpha': 1, 'lambda': 1}

# Default (Druhan 2014) value for each input column, in species_list order.
baseline = np.array([1.5, 8.8, 4.8, 9.7, 0.0325])
# Values substituted into one column at a time while the others stay fixed.
sweep_values = np.linspace(predict_range[0], predict_range[1], samples)

for forest_size in size_list:
    results = {}

    num_round = forest_size
    bst = xgb.train(param, dtrain, num_round, evallist, evals_result=results, early_stopping_rounds=500, verbose_eval=False)

    for plot_var, ax in zip(plot_vars, axis_list):
        # Every row is the baseline except the swept column.
        defaults = np.tile(baseline, (samples, 1))
        defaults[:, plot_var] = sweep_values

        ypred = bst.predict(xgb.DMatrix(defaults))

        # Targets were multiplied by 1e4 before training; scale back for plotting.
        ax.plot(sweep_values, ypred * 1e-4, label='{} trees'.format(forest_size))

In [9]:
# Load a previously trained booster from disk.
# NOTE(review): 'rifle_xgb.json' must already exist; presumably it is the
# long training run saved in an earlier session -- confirm.
new_bst = xgb.Booster()
new_bst.load_model(fname='rifle_xgb.json')

for plot_var, x_label, species, ax in zip(plot_vars, axis_labels, species_list, axis_list):
    defaults = np.ones((samples, dims))
    # Default (Druhan 2014)
    defaults[:, 0] = 1.5
    defaults[:, 1] = 8.8
    defaults[:, 2] = 4.8
    defaults[:, 3] = 9.7
    defaults[:, 4] = 0.0325

    # Sweep one input across predict_range while the others stay at baseline.
    defaults[:, plot_var] = np.linspace(predict_range[0], predict_range[1], len(defaults[:, plot_var]))

    ypred = new_bst.predict(xgb.DMatrix(defaults))

    # Predictions are multiplied by 1e-4 to undo the 1e4 target scaling.
    ax.plot(defaults[:, plot_var], ypred * 1e-4)

# Pass the legend handles explicitly: model curves in the order they were
# drawn, then the scatter of reference results. The labels-only form of
# `legend` pairs labels with artists by implicit order, which is easy to get
# wrong and is not the same across Matplotlib versions.
legend_labels = ('10 trees', '100 trees', '1000 trees', '5000 trees', '10000 trees', '1000000 trees', 'RTM results')
legend_handles = [*sub5.lines, *sub5.collections]
sub5.legend(legend_handles, legend_labels, fontsize=20, bbox_to_anchor=(1, 0, 1, 1), loc="lower left", mode="expand", ncol=1)

# The figure was created in an earlier cell, and the inline backend closes
# figures once their cell finishes, so `plt.show()` has nothing to draw here.
# Ending the cell with the figure object renders the updated plot instead.
fig

In [13]:
# NOTE(review): absolute, machine-specific output location; this cell only
# works where that directory exists.
convergence_png = '/Users/angus/Dropbox/ompahlos_paper/figures/convergence.png'
fig.savefig(convergence_png, dpi=300)