ENH: fit single phase data (pycalphad#52)

Fit single phase data. Data is weighted based on what type of quantity it is and directly added to the phase equilibria error. * WIP: initial sketch of single phase error * WIP: Use constant weight factors in single phase error * ENH: Integrate single phase error into lnprob * Explictly use delayed database and phase models for multi phase mcmc * Handle null case in finding optimal parameters. This is a stopgap for now, it should be investigated why this is happening. * Integrate calculate_single_phase_error into existing tests * MAINT: maintain tests for delayed_dbf * TST: Write test for single phase lnprob error * ENH: use ravel_conditions method in single phase error calculation * Remove cache * MAINT: Remove 'delayed' code * MAINT: Fix typo * TST: Remove delayed from tests
bocklund · Mar 1, 2018 · e5f0e03 · e5f0e03
1 parent ecf88cc
commit e5f0e03
Show file tree

Hide file tree

Showing 4 changed files with 268 additions and 14 deletions.
diff --git a/espei/mcmc.py b/espei/mcmc.py
@@ -1,6 +1,6 @@
 """Module for running MCMC in ESPEI"""
 
-import textwrap, time, sys, operator, logging
+import textwrap, time, sys, operator, logging, itertools
 from collections import OrderedDict, defaultdict
 
 import dask
@@ -12,6 +12,7 @@
 import emcee
 
 from espei.utils import database_symbols_to_fit, optimal_parameters
+from espei.core_utils import ravel_conditions
 
 
 def estimate_hyperplane(dbf, comps, phases, current_statevars, comp_dicts, phase_models, parameters):
@@ -180,20 +181,237 @@ def safe_get(itms, idxx):
     return errors
 
 
+def calculate_points_array(phase_constituents, configuration, occupancies=None):
+    """
+    Calculate the points array to use in pycalphad calculate calls.
+
+    Converts the configuration data (and occupancies for mixing data) into the
+    points array by looking up the indices in the active phase constituents.
+
+    Parameters
+    ----------
+    phase_constituents : list
+        List of active constituents in a phase
+    configuration : list
+        List of the sublattice configuration
+    occupancies : list
+        List of sublattice occupancies. Required for mixing sublattices, otherwise takes no effect.
+
+    Returns
+    -------
+    np.ndarray
+
+    Notes
+    -----
+    Errors will be raised if components in the configuration are not in the
+    corresponding phase constituents sublattice.
+    """
+    # pad the occupancies for zipping if none were passed (the case for non-mixing)
+    if occupancies is None:
+        occupancies = [0] * len(configuration)
+
+    # construct the points array from zeros
+    points = np.zeros(sum(len(subl) for subl in phase_constituents))
+    current_subl_idx = 0  # index that marks the beginning of the sublattice
+    for phase_subl, config_subl, subl_occupancies in zip(phase_constituents,
+                                                         configuration,
+                                                         occupancies):
+        phase_subl = list(phase_subl)
+        if isinstance(config_subl, (tuple, list)):
+            # we have mixing on the sublattice
+            for comp, comp_occupancy in zip(config_subl, subl_occupancies):
+                points[
+                    current_subl_idx + phase_subl.index(comp)] = comp_occupancy
+        else:
+            points[current_subl_idx + phase_subl.index(config_subl)] = 1
+        current_subl_idx += len(phase_subl)
+    return points
+
+
+def get_prop_data(comps, phase_name, prop, datasets):
+    """
+    Return datasets that match the components, phase and property
+
+    Parameters
+    ----------
+    comps : list
+        List of components to get data for
+    phase_name : str
+        Name of the phase to get data for
+    prop : str
+        Property to get data for
+    datasets : espei.utils.PickleableTinyDB
+        Datasets to search for data
+
+    Returns
+    -------
+    list
+        List of dictionary datasets that match the criteria
+
+    """
+    # TODO: we should only search and get phases that have the same sublattice_site_ratios as the phase in the database
+    desired_data = datasets.search(
+        (tinydb.where('output').test(lambda x: x in prop)) &
+        (tinydb.where('components').test(lambda x: set(x).issubset(comps))) &
+        (tinydb.where('phases') == [phase_name])
+    )
+    return desired_data
+
+
+def get_prop_samples(dbf, comps, phase_name, desired_data):
+    """
+    Return data values and the conditions to calculate them by pycalphad
+    calculate from the datasets
+
+    Parameters
+    ----------
+    dbf : pycalphad.Database
+        Database to consider
+    comps : list
+        List of active component names
+    phase_name : str
+        Name of the phase to consider from the Database
+    desired_data : list
+        List of dictionary datasets that contain the values to sample
+
+    Returns
+    -------
+    dict
+        Dictionary of condition kwargs for pycalphad's calculate and the expected values
+
+    """
+    # TODO: assumes T, P as conditions
+    phase_constituents = dbf.phases[phase_name].constituents
+    # phase constituents must be filtered to only active:
+    phase_constituents = [sorted(subl_constituents.intersection(set(comps))) for subl_constituents in phase_constituents]
+
+    # calculate needs points, state variable lists, and values to compare to
+    calculate_dict = {
+        'P': np.array([]),
+        'T': np.array([]),
+        'points': np.atleast_2d([[]]).reshape(-1, sum([len(subl) for subl in phase_constituents])),
+        'values': np.array([]),
+    }
+
+    for datum in desired_data:
+        # extract the data we care about
+        datum_T = datum['conditions']['T']
+        datum_P = datum['conditions']['P']
+        configurations = datum['solver']['sublattice_configurations']
+        occupancies = datum['solver'].get('sublattice_occupancies')
+        values = np.array(datum['values'])
+
+        # broadcast and flatten the conditions arrays
+        P, T = ravel_conditions(values, datum_P, datum_T)
+        if occupancies is None:
+            occupancies = [None] * len(configurations)
+
+        # calculate the points arrays, should be 2d array of points arrays
+        points = np.array([calculate_points_array(phase_constituents, config, occup) for config, occup in zip(configurations, occupancies)])
+
+        # add everything to the calculate_dict
+        calculate_dict['P'] = np.concatenate([calculate_dict['P'], P])
+        calculate_dict['T'] = np.concatenate([calculate_dict['T'], T])
+        calculate_dict['points'] = np.concatenate([calculate_dict['points'], points], axis=0)
+        calculate_dict['values'] = np.concatenate([calculate_dict['values'], values.flatten()])
+
+    return calculate_dict
+
+
+def calculate_single_phase_error(dbf, comps, phases, datasets, parameters=None, phase_models=None):
+    """
+    Calculate the weighted single phase error in the Database
+
+    Parameters
+    ----------
+    dbf : pycalphad.Database
+        Database to consider
+    comps : list
+        List of active component names
+    phases : list
+        List of phases to consider
+    datasets : espei.utils.PickleableTinyDB
+        Datasets that contain single phase data
+    parameters : dict
+        Dictionary of symbols that will be overridden in pycalphad.calculate
+    phase_models : dict
+        Phase models to pass to pycalphad calculations
+
+    Returns
+    -------
+    float
+        A single float of the residual sum of square errors
+
+    Notes
+    -----
+    There are different single phase values, HM_MIX, SM_FORM, CP_FORM, etc.
+    Each of these have different units and the error cannot be compared directly.
+    To normalize all of the errors, a normalization factor must be used.
+    Equation 2.59 and 2.60 in Lukas, Fries, and Sundman "Computational Thermodynamics" shows how this can be considered.
+    Each type of error will be weighted by the reciprocal of the estimated uncertainty in the measured value and conditions.
+    The weighting factor is calculated by
+    $p_i = (\Delta L_i)^{-1}$
+    where $\Delta L_i$ is the uncertainty in the measurement.
+    We will neglect the uncertainty for quantities such as temperature, assuming they are small.
+
+    """
+    if parameters is None:
+        parameters = {}
+
+    # property weights factors as fractions of the parameters
+    # for now they are all set to 5%
+    property_prefix_weight_factor = {
+        'HM': 0.05,
+        'SM': 0.05,
+        'CPM': 0.05,
+    }
+    propery_suffixes = ('_FORM', '_MIX')
+    # the kinds of properties, e.g. 'HM'+suffix =>, 'HM_FORM', 'HM_MIX'
+    # we could also include the bare property ('' => 'HM'), but these are rarely used in ESPEI
+    properties = [''.join(prop) for prop in itertools.product(property_prefix_weight_factor.keys(), propery_suffixes)]
+
+    sum_square_error = 0
+    for phase_name in phases:
+        for prop in properties:
+            desired_data = get_prop_data(comps, phase_name, prop, datasets)
+            if len(desired_data) == 0:
+                #logging.debug('Skipping {} in phase {} because no data was found.'.format(prop, phase_name))
+                continue
+            calculate_dict = get_prop_samples(dbf, comps, phase_name, desired_data)
+            if prop.endswith('_FORM'):
+                calculate_dict['output'] = ''.join(prop.split('_')[:-1])
+                params = parameters.copy()
+                params.update({'GHSER' + (c.upper() * 2)[:2]: 0 for c in comps})
+            else:
+                calculate_dict['output'] = prop
+                params = parameters
+            sample_values = calculate_dict.pop('values')
+            results = calculate(dbf, comps, phase_name, broadcast=False, parameters=params, model=phase_models, **calculate_dict)[calculate_dict['output']].values
+            weight = (property_prefix_weight_factor[prop.split('_')[0]]*np.abs(np.mean(sample_values)))**(-1.0)
+            error = np.sum((results-sample_values)**2) * weight
+            #logging.debug('Weighted sum of square error for property {} of phase {}: {}'.format(prop, phase_name, error))
+            sum_square_error += error
+    return -sum_square_error
+
+
 def lnprob(params, comps=None, dbf=None, phases=None, datasets=None,
-           symbols_to_fit=None, phase_models=None, scheduler=None):
+           symbols_to_fit=None, phase_models=None, scheduler=None,
+           ):
     """
     Returns the error from multiphase fitting as a log probability.
     """
     parameters = {param_name: param for param_name, param in zip(symbols_to_fit, params)}
     try:
-        iter_error = multi_phase_fit(dbf, comps, phases, datasets, phase_models,
-                                     parameters=parameters, scheduler=scheduler)
+        multi_phase_error = multi_phase_fit(dbf, comps, phases, datasets, phase_models, parameters=parameters, scheduler=scheduler)
     except (ValueError, LinAlgError) as e:
-        iter_error = [np.inf]
-    iter_error = [np.inf if np.isnan(x) else x ** 2 for x in iter_error]
-    iter_error = -np.sum(iter_error)
-    return np.array(iter_error, dtype=np.float64)
+        multi_phase_error = [np.inf]
+    multi_phase_error = [np.inf if np.isnan(x) else x ** 2 for x in multi_phase_error]
+    multi_phase_error = -np.sum(multi_phase_error)
+
+    single_phase_error = calculate_single_phase_error(dbf, comps, phases, datasets, parameters, phase_models=phase_models)
+    total_error = multi_phase_error + single_phase_error
+    logging.debug('Single phase error: {:0.2f}. Multi phase error: {:0.2f}. Total error: {:0.2f}'.format(single_phase_error, multi_phase_error, total_error))
+    return np.array(total_error, dtype=np.float64)
 
 
 def generate_parameter_distribution(parameters, num_samples, std_deviation, deterministic=True):
@@ -297,12 +515,11 @@ def mcmc_fit(dbf, datasets, mcmc_steps=1000, save_interval=100, chains_per_param
         mod = CompiledModel(dbf, comps, phase_name, parameters=OrderedDict([(sympy.Symbol(s), 0) for s in symbols_to_fit]))
         phase_models[phase_name] = mod
     logging.debug('Finished building phase models')
-    dbf = dask.delayed(dbf, pure=True)
-    phase_models = dask.delayed(phase_models, pure=True)
 
     # contect for the log probability function
     error_context = {'comps': comps, 'dbf': dbf,
-                     'phases': phases, 'phase_models': phase_models,
+                     'phases': phases,
+                     'phase_models': phase_models,
                      'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
                      }
 
@@ -358,7 +575,6 @@ def save_sampler_state(sampler):
     logging.debug('Intial parameters: {}'.format(initial_parameters))
     logging.debug('Optimal parameters: {}'.format(optimal_params))
     logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
-    dbf = dbf.compute()
     for param_name, value in zip(symbols_to_fit, optimal_params):
         dbf.symbols[param_name] = value
     logging.info('MCMC complete.')

diff --git a/espei/tests/test_mcmc.py b/espei/tests/test_mcmc.py
@@ -145,16 +145,49 @@
 """
 zpf_json = json.loads(zpf_data)
 
-def test_lnprob_calculates_probability_for_success(datasets_db):
+single_phase_data = """
+{
+    "components": ["CU", "MG"],
+    "phases": ["FCC_A1"],
+    "solver": {
+        "sublattice_site_ratios": [1],
+        "sublattice_occupancies": [[[0.5, 0.5], 1]],
+        "sublattice_configurations": [[["CU", "MG"], "VA"]],
+        "mode": "manual"
+    },
+    "conditions": {
+        "P": 101325,
+        "T": 298.15
+    },
+    "output": "HM_MIX",
+    "values": [[[-1000]]]
+}
+"""
+single_phase_json = json.loads(single_phase_data)
+
+def test_lnprob_calculates_multi_phase_probability_for_success(datasets_db):
     """lnprob() successfully calculates the probability for equilibrium """
     datasets_db.insert(zpf_json)
     res = lnprob([10], comps=['CU','MG', 'VA'], dbf=dbf,
                  phases=['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2'],
-                 datasets=datasets_db, symbols_to_fit=['VV0001'], phase_models=None, scheduler=None)
+                 datasets=datasets_db, symbols_to_fit=['VV0001'],
+                 phase_models=None,
+                 scheduler=None,)
     assert np.isreal(res)
     assert np.isclose(res, -5740.542839073727)
 
 
+def test_lnprob_calculates_single_phase_probability_for_success(datasets_db):
+    """lnprob() succesfully calculates the probability from single phase data"""
+    datasets_db.insert(single_phase_json)
+    res = lnprob([10], comps=['CU','MG', 'VA'], dbf=dbf,
+                 phases=['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2'],
+                 datasets=datasets_db, symbols_to_fit=['VV0001'],
+                 phase_models=None,
+                 scheduler=None,)
+    assert np.isreal(res)
+    assert np.isclose(res, -19859.38)
+
 def _eq_LinAlgError(*args, **kwargs):
     raise LinAlgError()
 

diff --git a/espei/tests/test_parameter_generation.py b/espei/tests/test_parameter_generation.py
@@ -51,6 +51,10 @@ def test_mixing_energies_are_fit(datasets_db):
     assert set(read_dbf.phases.keys()) == {'LIQUID', 'FCC_A1'}
     assert len(read_dbf._parameters.search(where('parameter_type') == 'L')) == 1
 
+    # the error should be exactly 0 because we are only fitting to one point
+    from espei.mcmc import calculate_single_phase_error
+    assert calculate_single_phase_error(read_dbf, sorted(dbf.elements), sorted(dbf.phases.keys()), datasets_db) == 0
+
 def test_sgte_reference_state_naming_is_correct_for_character_element(datasets_db):
     """Elements with single character names should get the correct GHSER reference state name (V => GHSERVV)"""
     phase_models = {

diff --git a/espei/utils.py b/espei/utils.py
@@ -94,6 +94,7 @@ def optimal_parameters(trace_array, lnprob_array, kth=0):
         # if we have found the kth set of unique parameters, stop
         if unique_params_found == kth:
             return unique_params
+    return np.zeros(trace_array.shape[-1])
 
 
 def database_symbols_to_fit(dbf, symbol_regex="^V[V]?([0-9]+)$"):