Skip to content

Commit

Permalink
ENH: fit single phase data (pycalphad#52)
Browse files Browse the repository at this point in the history
Fit single phase data. Data is weighted based on what type of quantity it is and directly added to the phase equilibria error.

* WIP: initial sketch of single phase error
* WIP: Use constant weight factors in single phase error
* ENH: Integrate single phase error into lnprob
* Explictly use delayed database and phase models for multi phase mcmc
* Handle null case in finding optimal parameters.
This is a stopgap for now, it should be investigated why this is happening.
* Integrate calculate_single_phase_error into existing tests
* MAINT: maintain tests for delayed_dbf
* TST: Write test for single phase lnprob error
* ENH: use ravel_conditions method in single phase error calculation
* Remove cache
* MAINT: Remove 'delayed' code
* MAINT: Fix typo
* TST: Remove delayed from tests
  • Loading branch information
bocklund committed Mar 1, 2018
1 parent ecf88cc commit e5f0e03
Show file tree
Hide file tree
Showing 4 changed files with 268 additions and 14 deletions.
240 changes: 228 additions & 12 deletions espei/mcmc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Module for running MCMC in ESPEI"""

import textwrap, time, sys, operator, logging
import textwrap, time, sys, operator, logging, itertools
from collections import OrderedDict, defaultdict

import dask
Expand All @@ -12,6 +12,7 @@
import emcee

from espei.utils import database_symbols_to_fit, optimal_parameters
from espei.core_utils import ravel_conditions


def estimate_hyperplane(dbf, comps, phases, current_statevars, comp_dicts, phase_models, parameters):
Expand Down Expand Up @@ -180,20 +181,237 @@ def safe_get(itms, idxx):
return errors


def calculate_points_array(phase_constituents, configuration, occupancies=None):
"""
Calculate the points array to use in pycalphad calculate calls.
Converts the configuration data (and occupancies for mixing data) into the
points array by looking up the indices in the active phase constituents.
Parameters
----------
phase_constituents : list
List of active constituents in a phase
configuration : list
List of the sublattice configuration
occupancies : list
List of sublattice occupancies. Required for mixing sublattices, otherwise takes no effect.
Returns
-------
np.ndarray
Notes
-----
Errors will be raised if components in the configuration are not in the
corresponding phase constituents sublattice.
"""
# pad the occupancies for zipping if none were passed (the case for non-mixing)
if occupancies is None:
occupancies = [0] * len(configuration)

# construct the points array from zeros
points = np.zeros(sum(len(subl) for subl in phase_constituents))
current_subl_idx = 0 # index that marks the beginning of the sublattice
for phase_subl, config_subl, subl_occupancies in zip(phase_constituents,
configuration,
occupancies):
phase_subl = list(phase_subl)
if isinstance(config_subl, (tuple, list)):
# we have mixing on the sublattice
for comp, comp_occupancy in zip(config_subl, subl_occupancies):
points[
current_subl_idx + phase_subl.index(comp)] = comp_occupancy
else:
points[current_subl_idx + phase_subl.index(config_subl)] = 1
current_subl_idx += len(phase_subl)
return points


def get_prop_data(comps, phase_name, prop, datasets):
"""
Return datasets that match the components, phase and property
Parameters
----------
comps : list
List of components to get data for
phase_name : str
Name of the phase to get data for
prop : str
Property to get data for
datasets : espei.utils.PickleableTinyDB
Datasets to search for data
Returns
-------
list
List of dictionary datasets that match the criteria
"""
# TODO: we should only search and get phases that have the same sublattice_site_ratios as the phase in the database
desired_data = datasets.search(
(tinydb.where('output').test(lambda x: x in prop)) &
(tinydb.where('components').test(lambda x: set(x).issubset(comps))) &
(tinydb.where('phases') == [phase_name])
)
return desired_data


def get_prop_samples(dbf, comps, phase_name, desired_data):
"""
Return data values and the conditions to calculate them by pycalphad
calculate from the datasets
Parameters
----------
dbf : pycalphad.Database
Database to consider
comps : list
List of active component names
phase_name : str
Name of the phase to consider from the Database
desired_data : list
List of dictionary datasets that contain the values to sample
Returns
-------
dict
Dictionary of condition kwargs for pycalphad's calculate and the expected values
"""
# TODO: assumes T, P as conditions
phase_constituents = dbf.phases[phase_name].constituents
# phase constituents must be filtered to only active:
phase_constituents = [sorted(subl_constituents.intersection(set(comps))) for subl_constituents in phase_constituents]

# calculate needs points, state variable lists, and values to compare to
calculate_dict = {
'P': np.array([]),
'T': np.array([]),
'points': np.atleast_2d([[]]).reshape(-1, sum([len(subl) for subl in phase_constituents])),
'values': np.array([]),
}

for datum in desired_data:
# extract the data we care about
datum_T = datum['conditions']['T']
datum_P = datum['conditions']['P']
configurations = datum['solver']['sublattice_configurations']
occupancies = datum['solver'].get('sublattice_occupancies')
values = np.array(datum['values'])

# broadcast and flatten the conditions arrays
P, T = ravel_conditions(values, datum_P, datum_T)
if occupancies is None:
occupancies = [None] * len(configurations)

# calculate the points arrays, should be 2d array of points arrays
points = np.array([calculate_points_array(phase_constituents, config, occup) for config, occup in zip(configurations, occupancies)])

# add everything to the calculate_dict
calculate_dict['P'] = np.concatenate([calculate_dict['P'], P])
calculate_dict['T'] = np.concatenate([calculate_dict['T'], T])
calculate_dict['points'] = np.concatenate([calculate_dict['points'], points], axis=0)
calculate_dict['values'] = np.concatenate([calculate_dict['values'], values.flatten()])

return calculate_dict


def calculate_single_phase_error(dbf, comps, phases, datasets, parameters=None, phase_models=None):
"""
Calculate the weighted single phase error in the Database
Parameters
----------
dbf : pycalphad.Database
Database to consider
comps : list
List of active component names
phases : list
List of phases to consider
datasets : espei.utils.PickleableTinyDB
Datasets that contain single phase data
parameters : dict
Dictionary of symbols that will be overridden in pycalphad.calculate
phase_models : dict
Phase models to pass to pycalphad calculations
Returns
-------
float
A single float of the residual sum of square errors
Notes
-----
There are different single phase values, HM_MIX, SM_FORM, CP_FORM, etc.
Each of these have different units and the error cannot be compared directly.
To normalize all of the errors, a normalization factor must be used.
Equation 2.59 and 2.60 in Lukas, Fries, and Sundman "Computational Thermodynamics" shows how this can be considered.
Each type of error will be weighted by the reciprocal of the estimated uncertainty in the measured value and conditions.
The weighting factor is calculated by
$p_i = (\Delta L_i)^{-1}$
where $\Delta L_i$ is the uncertainty in the measurement.
We will neglect the uncertainty for quantities such as temperature, assuming they are small.
"""
if parameters is None:
parameters = {}

# property weights factors as fractions of the parameters
# for now they are all set to 5%
property_prefix_weight_factor = {
'HM': 0.05,
'SM': 0.05,
'CPM': 0.05,
}
propery_suffixes = ('_FORM', '_MIX')
# the kinds of properties, e.g. 'HM'+suffix =>, 'HM_FORM', 'HM_MIX'
# we could also include the bare property ('' => 'HM'), but these are rarely used in ESPEI
properties = [''.join(prop) for prop in itertools.product(property_prefix_weight_factor.keys(), propery_suffixes)]

sum_square_error = 0
for phase_name in phases:
for prop in properties:
desired_data = get_prop_data(comps, phase_name, prop, datasets)
if len(desired_data) == 0:
#logging.debug('Skipping {} in phase {} because no data was found.'.format(prop, phase_name))
continue
calculate_dict = get_prop_samples(dbf, comps, phase_name, desired_data)
if prop.endswith('_FORM'):
calculate_dict['output'] = ''.join(prop.split('_')[:-1])
params = parameters.copy()
params.update({'GHSER' + (c.upper() * 2)[:2]: 0 for c in comps})
else:
calculate_dict['output'] = prop
params = parameters
sample_values = calculate_dict.pop('values')
results = calculate(dbf, comps, phase_name, broadcast=False, parameters=params, model=phase_models, **calculate_dict)[calculate_dict['output']].values
weight = (property_prefix_weight_factor[prop.split('_')[0]]*np.abs(np.mean(sample_values)))**(-1.0)
error = np.sum((results-sample_values)**2) * weight
#logging.debug('Weighted sum of square error for property {} of phase {}: {}'.format(prop, phase_name, error))
sum_square_error += error
return -sum_square_error


def lnprob(params, comps=None, dbf=None, phases=None, datasets=None,
symbols_to_fit=None, phase_models=None, scheduler=None):
symbols_to_fit=None, phase_models=None, scheduler=None,
):
"""
Returns the error from multiphase fitting as a log probability.
"""
parameters = {param_name: param for param_name, param in zip(symbols_to_fit, params)}
try:
iter_error = multi_phase_fit(dbf, comps, phases, datasets, phase_models,
parameters=parameters, scheduler=scheduler)
multi_phase_error = multi_phase_fit(dbf, comps, phases, datasets, phase_models, parameters=parameters, scheduler=scheduler)
except (ValueError, LinAlgError) as e:
iter_error = [np.inf]
iter_error = [np.inf if np.isnan(x) else x ** 2 for x in iter_error]
iter_error = -np.sum(iter_error)
return np.array(iter_error, dtype=np.float64)
multi_phase_error = [np.inf]
multi_phase_error = [np.inf if np.isnan(x) else x ** 2 for x in multi_phase_error]
multi_phase_error = -np.sum(multi_phase_error)

single_phase_error = calculate_single_phase_error(dbf, comps, phases, datasets, parameters, phase_models=phase_models)
total_error = multi_phase_error + single_phase_error
logging.debug('Single phase error: {:0.2f}. Multi phase error: {:0.2f}. Total error: {:0.2f}'.format(single_phase_error, multi_phase_error, total_error))
return np.array(total_error, dtype=np.float64)


def generate_parameter_distribution(parameters, num_samples, std_deviation, deterministic=True):
Expand Down Expand Up @@ -297,12 +515,11 @@ def mcmc_fit(dbf, datasets, mcmc_steps=1000, save_interval=100, chains_per_param
mod = CompiledModel(dbf, comps, phase_name, parameters=OrderedDict([(sympy.Symbol(s), 0) for s in symbols_to_fit]))
phase_models[phase_name] = mod
logging.debug('Finished building phase models')
dbf = dask.delayed(dbf, pure=True)
phase_models = dask.delayed(phase_models, pure=True)

# contect for the log probability function
error_context = {'comps': comps, 'dbf': dbf,
'phases': phases, 'phase_models': phase_models,
'phases': phases,
'phase_models': phase_models,
'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
}

Expand Down Expand Up @@ -358,7 +575,6 @@ def save_sampler_state(sampler):
logging.debug('Intial parameters: {}'.format(initial_parameters))
logging.debug('Optimal parameters: {}'.format(optimal_params))
logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
dbf = dbf.compute()
for param_name, value in zip(symbols_to_fit, optimal_params):
dbf.symbols[param_name] = value
logging.info('MCMC complete.')
Expand Down
37 changes: 35 additions & 2 deletions espei/tests/test_mcmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,16 +145,49 @@
"""
zpf_json = json.loads(zpf_data)

def test_lnprob_calculates_probability_for_success(datasets_db):
single_phase_data = """
{
"components": ["CU", "MG"],
"phases": ["FCC_A1"],
"solver": {
"sublattice_site_ratios": [1],
"sublattice_occupancies": [[[0.5, 0.5], 1]],
"sublattice_configurations": [[["CU", "MG"], "VA"]],
"mode": "manual"
},
"conditions": {
"P": 101325,
"T": 298.15
},
"output": "HM_MIX",
"values": [[[-1000]]]
}
"""
single_phase_json = json.loads(single_phase_data)

def test_lnprob_calculates_multi_phase_probability_for_success(datasets_db):
"""lnprob() successfully calculates the probability for equilibrium """
datasets_db.insert(zpf_json)
res = lnprob([10], comps=['CU','MG', 'VA'], dbf=dbf,
phases=['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2'],
datasets=datasets_db, symbols_to_fit=['VV0001'], phase_models=None, scheduler=None)
datasets=datasets_db, symbols_to_fit=['VV0001'],
phase_models=None,
scheduler=None,)
assert np.isreal(res)
assert np.isclose(res, -5740.542839073727)


def test_lnprob_calculates_single_phase_probability_for_success(datasets_db):
"""lnprob() succesfully calculates the probability from single phase data"""
datasets_db.insert(single_phase_json)
res = lnprob([10], comps=['CU','MG', 'VA'], dbf=dbf,
phases=['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2'],
datasets=datasets_db, symbols_to_fit=['VV0001'],
phase_models=None,
scheduler=None,)
assert np.isreal(res)
assert np.isclose(res, -19859.38)

def _eq_LinAlgError(*args, **kwargs):
raise LinAlgError()

Expand Down
4 changes: 4 additions & 0 deletions espei/tests/test_parameter_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def test_mixing_energies_are_fit(datasets_db):
assert set(read_dbf.phases.keys()) == {'LIQUID', 'FCC_A1'}
assert len(read_dbf._parameters.search(where('parameter_type') == 'L')) == 1

# the error should be exactly 0 because we are only fitting to one point
from espei.mcmc import calculate_single_phase_error
assert calculate_single_phase_error(read_dbf, sorted(dbf.elements), sorted(dbf.phases.keys()), datasets_db) == 0

def test_sgte_reference_state_naming_is_correct_for_character_element(datasets_db):
"""Elements with single character names should get the correct GHSER reference state name (V => GHSERVV)"""
phase_models = {
Expand Down
1 change: 1 addition & 0 deletions espei/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def optimal_parameters(trace_array, lnprob_array, kth=0):
# if we have found the kth set of unique parameters, stop
if unique_params_found == kth:
return unique_params
return np.zeros(trace_array.shape[-1])


def database_symbols_to_fit(dbf, symbol_regex="^V[V]?([0-9]+)$"):
Expand Down

0 comments on commit e5f0e03

Please sign in to comment.