In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import neuralEOS
import pandas as pd
from datetime import datetime

In [None]:
# Shared parameters object: it is configured step by step below and passed
# to each workflow stage (GenerateAA, ExtractAA, Preprocess, Training, Testing).
params = neuralEOS.Parameters()

First we set up the parameters that are needed for the generation of the AA files.

In [None]:
# --- job control ---------------------------------------------------------
# Run the atoMEC calculations (set to False to skip running them).
params.submit_aa_jobs = True

# This example notebook works on a sample of the dataset.
# Comment out the next line to run the full dataset instead.
params.n_aa_samples = 400

# --- file locations ------------------------------------------------------
# Destination of the processed FPEOS data file.
params.pressure_file = f"{params.pressure_path}pressure_elements.csv"

# Optional: directory for the AA input/output files.
params.aa_dir = f"{params.intermediate_data_path}atoMEC_inputs_v2/"

# Optional: location of the stored FPEOS datasets.
# params.fpeos_data_path = <path_to_fpeos_data>

# Optional: restrict the run when not all elements are desired.
# params.element_list = ["Al"]

aa_generator = neuralEOS.GenerateAA(params)

First, we convert the FPEOS data (stored in various .txt files) into a more usable format.

In [None]:
# Convert the raw FPEOS data into the processed format
# (presumably written to params.pressure_file — confirm in GenerateAA).
aa_generator.extract_raw_fpeos()

Next, we set up the atoMEC and slurm input files (and run them if desired).

Note that this generates input files from `templates/conv_template.py`. In this template, there is a parameter `"ngrid_min"`, which is the initial number of grid points when running convergence tests. To generate the atoMEC data in the paper, we first ran all calculations with this number set to `500`. However, some calculations need a higher initial value of `ngrid`. We therefore ran another pass-through for all the failed calculations with `"ngrid_min"=1000`. But for this sample script, we don't bother with this second pass-through.

In [None]:
# Set up the atoMEC and slurm input files; the jobs are also run if desired
# (presumably controlled by params.submit_aa_jobs — confirm in GenerateAA).
aa_generator.make_all_files()

Now we extract the data from the atoMEC generated output files. First, some more relevant parameters.

In [None]:
# Pressure file that will also contain the AA output data.
params.aa_pressure_file = f"{params.pressure_path}paper_sample/pressure_aa.csv"

aa_extractor = neuralEOS.ExtractAA(params)

In [None]:
# Extract the data from the atoMEC-generated output files.
aa_extractor.extract_aa_features()

Next, we pre-process the AA data. This involves two steps: 
1. The data is grouped into ten different bins based on the reference pressure for that data point.
2. The data is split randomly into 5 training and test sets. Although the splitting is random, it is done using a stratified approach, meaning that each pair of training and test sets contains the same proportion of data from each bin.

In [None]:
# AA pressure file (same value as assigned for the extraction step).
params.aa_pressure_file = f"{params.pressure_path}paper_sample/pressure_aa.csv"
# Pressure file with the bins included.
params.pressure_bins_file = f"{params.pressure_path}paper_sample/pressure_aa_bins.csv"

preprocess = neuralEOS.Preprocess(params)

In [None]:
# Run the pre-processing: create the cross-validation training/test splits
# (binning and stratification as described in the text preceding this cell).
preprocess.cv_split()

In the next stage, we train the AA networks. First, we define the parameters used for the training.

In [None]:
# --- data selection --------------------------------------------------------
params.elements_only = True  # don't use mixtures
params.sum_e_ion_pressures = False  # don't sum electron with ion pressures

# --- search budget ---------------------------------------------------------
params.n_hyperopt_trials = 5  # number of hyperparam trials
params.n_cv_repeats = 1  # number of repeats in inner CV loop
params.n_cv_inner = 3  # number of inner CV loops
params.n_feature_trials = 5

# --- candidate input features ---------------------------------------------
# Features whose names start with "P_aa_", followed by the remaining ones;
# the concatenation preserves the original ordering.
aa_pressure_features = [
    "P_aa_st_tr_i",
    "P_aa_st_rr_i",
    "P_aa_ideal_i",
    "P_aa_vir_corr_i",
    "P_aa_vir_nocorr_i",
    "P_aa_fd_i",
]
other_features = [
    "dn_dR",
    "dv_dR",
    "n_R",
    "temp",
    "rho_Z",
    "rho",
    "vol",
    "MIS",
]
params.feature_list_aa = aa_pressure_features + other_features
params.n_features_min = 3  # minimum number of features
params.n_features_max = 8  # maximum number of features

# --- files -------------------------------------------------------------------
params.slurm_train_file = f"{params.scripts_dir}/train_hyper_aa.slurm"
training_file_prefix = f"{params.pressure_path}paper_sample/pressure_aa_train_"

# Where to store the results from the hyperopt runs.
# The directory name carries a hard-coded date tag (14_11_23); change it
# here if you want the results of a new run stored separately.
savedir_prefix = f"{params.cv_inner_path}aa_14_11_23_cv_iter_"

In [None]:
# Set up the training object from the parameters configured above.
trainer = neuralEOS.Training(params)

In [None]:
# Submit the inner-loop training jobs.
# NOTE(review): the third positional argument is True here and False in the
# no-AA section further down — presumably it selects training with AA
# features; confirm against Training.submit_inner_training_jobs.
trainer.submit_inner_training_jobs(training_file_prefix, savedir_prefix, True)

Next, we extract the results of the inner-loop training.

In [None]:
# Collect the CV scores of every outer iteration (numbered 1 to 5) and write
# a summary file into that iteration's results directory.
for fold in range(1, 6):
    fold_dir = f"{savedir_prefix}{fold}/"
    summary_file = f"{fold_dir}cv_summary.txt"
    trainer.gather_cv_scores(fold_dir, filename_out=summary_file)

Now, we train the 3 best models on the full inner CV loops.

In [None]:
# Prefix of the trained model files; the CV iteration index, the model index
# and ".pkl" are appended to it when the models are loaded for testing.
model_file_prefix = params.model_path +  "aa_14_11_23_cv_iter_"

In [None]:
# Submit the outer-loop training jobs, using the inner-loop results stored
# under savedir_prefix and the model file prefix defined above.
# NOTE(review): the final True presumably selects the AA-feature network
# (the no-AA section passes False) — confirm against the Training class.
trainer.submit_outer_training_jobs(training_file_prefix, savedir_prefix, model_file_prefix, True)

Now, we are going to test our trained models on the outer CV loops.

In [None]:
# Common prefix of the outer-loop test files.
test_file_prefix = f"{params.pressure_path}paper_sample/pressure_aa_test_"

# Object used to make and evaluate predictions.
tester = neuralEOS.Testing(params)

In [None]:
# For each outer CV iteration (1 to 5), predict on its test file with the
# three models trained for that iteration, and remember the output files.
test_csv_list = []
for fold in range(1, 6):
    fold_models = [f"{model_file_prefix}{fold}_{idx}.pkl" for idx in range(3)]
    fold_input = f"{test_file_prefix}{fold}.csv"
    fold_output = f"{test_file_prefix}{fold}_pred.csv"
    tester.predict(fold_input, fold_models, output_file=fold_output)
    test_csv_list.append(fold_output)

In [None]:
# Summarize and show the results for the network prediction and for each
# AA pressure column.
pressure_types = (
    "P_pred",
    "P_aa_fd",
    "P_aa_st_tr",
    "P_aa_st_rr",
    "P_aa_vir_corr",
    "P_aa_vir_nocorr",
    "P_aa_ideal",
)
for ptype in pressure_types:
    # Every pressure type except the network prediction is evaluated with
    # the extra P_ion="P_ion" argument.
    extra_kwargs = {} if ptype == "P_pred" else {"P_ion": "P_ion"}
    results = tester.evaluate(test_csv_list, ptype, **extra_kwargs)
    print(f"Results for {ptype}")
    print(results)

The above workflow enables us to estimate the generalization error for our neural network trained with AA features. In the following steps, we train the final model. This uses all the available data for training, since there is no need to hold back any test data.

In [None]:
# The final model is trained on the file that contains the pressure bins.
training_file = params.pressure_bins_file
# Directory for the hyperopt results of the final model.
savedir_final = f"{params.cv_inner_path}aa_14_11_23_final_models/"

In [None]:
# Submit the hyperparameter-optimization jobs for the final model; results
# go to savedir_final.
# NOTE(review): the final True presumably selects the AA-feature network —
# confirm against Training.submit_final_training_hyperopt_jobs.
trainer.submit_final_training_hyperopt_jobs(training_file, savedir_final,True)

In [None]:
# Gather the scores of the final hyperopt runs into a summary file that is
# stored alongside them.
final_summary_file = f"{savedir_final}cv_summary.txt"
trainer.gather_cv_scores(savedir_final, filename_out=final_summary_file)

Now that we have the hyperparameters of the best-performing models, we train the final models with those hyperparameters.

In [None]:
# Prefix of the final model files; the model index and ".pkl" are appended
# when the models are loaded for prediction.
model_file_prefix = params.model_path +  "aa_14_11_23_final_model_"

In [None]:
# Submit the training jobs for the final models, using the hyperopt results
# in savedir_final.
# NOTE(review): the final True presumably selects the AA-feature network —
# confirm against Training.submit_final_outer_training_jobs.
trainer.submit_final_outer_training_jobs(training_file, savedir_final, model_file_prefix, True)

We have now trained the final models. We shall test these on the FP-Be dataset. First, we extract the data from the raw input files into a more usable format.

In [None]:
# Pressure file for the FP-Be dataset (presumably where the processed raw
# data is written by the next step — confirm in GenerateAA).
params.Be_pressure_file = params.pressure_path + "paper_sample/pressure_Be.csv"

In [None]:
# Extract the FP-Be data from the raw input files into a more usable format.
aa_generator.extract_raw_fp_Be()

Now we run the atoMEC calculations for the FP-Be dataset. We will use a sample of 100 points.

In [None]:
# Use a sample of 100 points, restricted to Beryllium.
params.n_aa_samples = 100
params.element_list = ["Be"]

# Generate (and, if enabled, run) the atoMEC calculations for the FP-Be
# dataset. This call was previously commented out, although the extraction
# step that follows reads the output of these calculations.
# NOTE(review): from_Hu=True is the same flag passed to extract_aa_features
# for this dataset — presumably it selects the FP-Be data; confirm in GenerateAA.
aa_generator.make_all_files(from_Hu=True)

In [None]:
# Extract the Be data.
# The output path is set *before* the extractor is constructed, as in every
# other stage of this notebook, so the setting is visible to ExtractAA
# whether or not it copies the parameters at construction time.
params.aa_Be_pressure_file = params.pressure_path + "paper_sample/pressure_aa_Be.csv"
aa_extractor = neuralEOS.ExtractAA(params)
aa_extractor.extract_aa_features(from_Hu=True)

Now, using the final trained models, we make predictions on the Beryllium data.

In [None]:
# The three final models trained with AA features.
model_file_prefix = params.model_path + "aa_14_11_23_final_model_"
model_file_list = [f"{model_file_prefix}{k}.pkl" for k in range(3)]

# Input: the Be file with AA features. This must be the same path as
# params.aa_Be_pressure_file set in the extraction step
# ("pressure_aa_Be.csv"); the previous value "aa_pressure_Be.csv" named a
# file that no step of this notebook writes.
test_csv_in = params.pressure_path + "paper_sample/pressure_aa_Be.csv"
test_csv_out = params.pressure_path + "paper_sample/aa_pressure_Be_pred.csv"
tester.predict(test_csv_in, model_file_list, output_file=test_csv_out)

In [None]:
# Summarize the results on the Be prediction file for the network prediction
# and for each AA pressure column.
pressure_types = (
    "P_pred",
    "P_aa_fd",
    "P_aa_st_tr",
    "P_aa_st_rr",
    "P_aa_vir_corr",
    "P_aa_vir_nocorr",
    "P_aa_ideal",
)
for ptype in pressure_types:
    # Every pressure type except the network prediction is evaluated with
    # the extra P_ion="P_ion" argument.
    extra_kwargs = {} if ptype == "P_pred" else {"P_ion": "P_ion"}
    results = tester.evaluate(test_csv_out, ptype, **extra_kwargs)
    print(f"Results for {ptype}")
    print(results)


This completes the training and evaluation of the neural network trained *with* AA features as input parameters. In the following sections, we follow the same steps, but this time for the network trained *without* AA features.

In [None]:
# --- data selection --------------------------------------------------------
params.elements_only = True  # don't use mixtures
params.sum_e_ion_pressures = False  # don't sum electron with ion pressures

# --- search budget ---------------------------------------------------------
params.n_hyperopt_trials = 5  # number of hyperparam trials
params.n_cv_repeats = 1  # number of repeats in inner CV loop
params.n_cv_inner = 3  # number of inner CV loops
params.n_feature_trials = 5

# --- candidate input features (no AA outputs) -----------------------------
params.feature_list_no_aa = ["temp", "rho_Z", "rho", "vol", "P_ion"]
params.n_features_min = 3  # minimum number of features
params.n_features_max = 5  # maximum number of features

# --- files -------------------------------------------------------------------
# NOTE(review): params.slurm_train_file is not reassigned here, so it keeps
# the value set for the AA network (".../train_hyper_aa.slurm") — confirm
# that this is intended for the no-AA run.
training_file_prefix = f"{params.pressure_path}paper_sample/pressure_aa_train_"

# Where to store the results from the hyperopt runs.
# The directory name carries a hard-coded date tag (10_11_23); change it
# here if you want the results of a new run stored separately.
savedir_prefix = f"{params.cv_inner_path}10_11_23_cv_iter_"

In [None]:
# Re-create the training object so that it is built from the parameters
# configured for the network without AA features.
trainer = neuralEOS.Training(params)

In [None]:
# Submit the inner-loop training jobs.
# NOTE(review): the third positional argument is False here and True in the
# AA section — presumably it disables the AA features; confirm against
# Training.submit_inner_training_jobs.
trainer.submit_inner_training_jobs(training_file_prefix, savedir_prefix, False)

Extract the results from the inner loop training.

In [None]:
# Collect the CV scores of every outer iteration (numbered 1 to 5) and write
# a summary file into that iteration's results directory.
for fold in range(1, 6):
    fold_dir = f"{savedir_prefix}{fold}/"
    summary_file = f"{fold_dir}cv_summary.txt"
    trainer.gather_cv_scores(fold_dir, filename_out=summary_file)

In [None]:
# Prefix of the trained model files for the network without AA features.
model_file_prefix = params.model_path +  "10_11_23_cv_iter_"
# NOTE(review): hard-coded, user-specific absolute path (it also contains a
# doubled "/"). It will not exist on other machines — adapt it to your own
# checkout, or drop the override if the default params.scripts_dir is the
# intended location (the AA section does not override it).
params.scripts_dir = "/home/callow46/neuralEOS//scripts/aa_workflow_example/"

In [None]:
# Submit the outer-loop training jobs, using the inner-loop results stored
# under savedir_prefix and the model file prefix defined above.
# NOTE(review): the final False presumably selects the network without AA
# features (the AA section passes True) — confirm against the Training class.
trainer.submit_outer_training_jobs(training_file_prefix, savedir_prefix, model_file_prefix, False)

In [None]:
# Common prefix of the outer-loop test files (the same files as used for
# the AA network).
test_file_prefix = f"{params.pressure_path}paper_sample/pressure_aa_test_"

# Object used to make and evaluate predictions.
tester = neuralEOS.Testing(params)

In [None]:
# For each outer CV iteration (1 to 5), predict on its test file with the
# three models trained for that iteration. Predictions are written to
# separate "pressure_no_aa_test_<i>_pred.csv" files.
test_csv_list = []
for fold in range(1, 6):
    fold_models = [f"{model_file_prefix}{fold}_{idx}.pkl" for idx in range(3)]
    fold_input = f"{test_file_prefix}{fold}.csv"
    fold_output = f"{params.pressure_path}paper_sample/pressure_no_aa_test_{fold}_pred.csv"
    tester.predict(fold_input, fold_models, output_file=fold_output)
    test_csv_list.append(fold_output)

In [None]:
# Summarize and show the results for the network prediction and for each
# AA pressure column.
pressure_types = (
    "P_pred",
    "P_aa_fd",
    "P_aa_st_tr",
    "P_aa_st_rr",
    "P_aa_vir_corr",
    "P_aa_vir_nocorr",
    "P_aa_ideal",
)
for ptype in pressure_types:
    # Every pressure type except the network prediction is evaluated with
    # the extra P_ion="P_ion" argument.
    extra_kwargs = {} if ptype == "P_pred" else {"P_ion": "P_ion"}
    results = tester.evaluate(test_csv_list, ptype, **extra_kwargs)
    print(f"Results for {ptype}")
    print(results)

In [None]:
# Directory for the hyperopt results of the final model without AA features.
savedir_final = params.cv_inner_path + "10_11_23_final_models/"
# The final model is trained on the file that contains the pressure bins.
training_file = params.pressure_bins_file

# Submit the hyperparameter-optimization jobs for the final model. This call
# was previously commented out, although the score-gathering step that
# follows reads the results it writes to savedir_final; the AA section runs
# the same step with True as the last argument.
trainer.submit_final_training_hyperopt_jobs(training_file, savedir_final, False)

In [None]:
# Gather the scores of the final hyperopt runs into a summary file that is
# stored alongside them.
final_summary_file = f"{savedir_final}cv_summary.txt"
trainer.gather_cv_scores(savedir_final, filename_out=final_summary_file)

In [None]:
# Prefix of the final model files for the network without AA features.
model_file_prefix = f"{params.model_path}10_11_23_final_model_"

# Submit the training jobs for the final models, using the hyperopt results
# stored in savedir_final.
trainer.submit_final_outer_training_jobs(
    training_file,
    savedir_final,
    model_file_prefix,
    False,
)

In [None]:
# The three final models trained without AA features.
model_file_prefix = params.model_path + "10_11_23_final_model_"
model_file_list = [f"{model_file_prefix}{k}.pkl" for k in range(3)]

# Input: the Be file produced by the AA extraction step. This must be the
# same path as params.aa_Be_pressure_file ("pressure_aa_Be.csv"); the
# previous value "aa_pressure_Be.csv" named a file that no step of this
# notebook writes.
test_csv_in = params.pressure_path + "paper_sample/pressure_aa_Be.csv"
test_csv_out = params.pressure_path + "paper_sample/no_aa_pressure_Be_pred.csv"
tester.predict(test_csv_in, model_file_list, output_file=test_csv_out)

In [None]:
# Summarize the results on the Be prediction file for the network prediction
# and for each AA pressure column.
pressure_types = (
    "P_pred",
    "P_aa_fd",
    "P_aa_st_tr",
    "P_aa_st_rr",
    "P_aa_vir_corr",
    "P_aa_vir_nocorr",
    "P_aa_ideal",
)
for ptype in pressure_types:
    # Every pressure type except the network prediction is evaluated with
    # the extra P_ion="P_ion" argument.
    extra_kwargs = {} if ptype == "P_pred" else {"P_ion": "P_ion"}
    results = tester.evaluate(test_csv_out, ptype, **extra_kwargs)
    print(f"Results for {ptype}")
    print(results)
