In [1]:
import os  # needed for the environment lookup below

# Credentials must not live in the notebook: anything committed here is
# readable by everyone with access to the file (the key that used to be
# hardcoded on this line should be revoked). Export MPDS_KEY in the shell
# that starts Jupyter instead.
# NOTE(review): MPDSDataRetrieval is documented to fall back to the MPDS_KEY
# environment variable when api_key is None -- confirm for the installed
# matminer version.
API_KEY = os.environ.get("MPDS_KEY")
import pandas as pd
import numpy as np
import tqdm
from matminer.data_retrieval.retrieve_MPDS import MPDSDataRetrieval
from matminer.figrecipes.plot import PlotlyFig

In [2]:
def calculate_lengths(ase_obj, limit=4):
    """
    Collect the distances between atoms of different elements.

    Args:
        ase_obj: structure that can be iterated atom by atom (each atom
            exposing ``.symbol``) and that provides ``get_all_distances()``
            returning a pairwise matrix indexable as ``matrix[n][m]``
            (presumably an ``ase.Atoms`` instance -- confirm with caller).
        limit: exclusive upper bound; only distances strictly below it
            are kept.

    Returns:
        list: distances rounded to two decimals. Every ordered pair
        ``(n, m)`` of atoms with different symbols is visited, so one
        physical pair contributes once per direction.
    """
    # The former ``assert elA != elB`` referenced names that are neither
    # parameters nor globals and raised NameError on every call; unlike
    # elements are already enforced by the symbol comparison below.
    all_lengths = ase_obj.get_all_distances()
    symbols = [atom.symbol for atom in ase_obj]

    lengths = []
    for n, symbol in enumerate(symbols):
        for m, other in enumerate(symbols):
            if other == symbol:
                continue  # same element (this also skips n == m)
            dist = round(all_lengths[n][m], 2)  # NB occurrence <-> rounding
            if dist < limit:
                lengths.append(dist)
    return lengths

In [3]:
client = MPDSDataRetrieval(api_key=API_KEY)

# compile_crystal() interprets the LAST four entries of a row as
# cell_abc, sg_n, basis_noneq, els_noneq, so the structure ('S') field list
# has to end with exactly those, in that order. Stopping at 'cell_abc'
# leaves nothing to build a crystal from.
# NOTE(review): a "props" query presumably returns property (P) entries, to
# which the 'S' field list does not apply -- confirm that structure rows are
# actually returned for this search, or add matching 'P' fields.
answer = client.get_data(
    criteria={"props": "linear thermal expansion coefficient", "classes": "oxide"},
    fields={'S': ['phase_id', 'entry', 'chemical_formula',
                  'cell_abc', 'sg_n', 'basis_noneq', 'els_noneq']},
)


	50%
Got 1597 hits


In [6]:
# Turn every retrieved row into an ASE structure (with a progress bar) and
# keep only the rows for which compile_crystal returned something truthy.
compiled = (MPDSDataRetrieval.compile_crystal(row, 'ase') for row in tqdm.tqdm(answer))
crystals = [structure for structure in compiled if structure]

100%|██████████| 1597/1597 [00:00<?, ?it/s]


[]

In [None]:
# Distances between unlike elements, pooled over all compiled crystals.
lengths = []
for item in crystals:
    lengths.extend(calculate_lengths(item))

# TODO: `tec` is never filled in this cell; until the TEC values are
# collected, the frame and the histogram below stay empty.
tec = []

data = {"tec": tec, "lengths": lengths}

# Build the frame from the TEC values only. The previous columns=['length']
# matched neither key of `data`, which produced a frame without a 'tec'
# column and made every 'tec' lookup below raise KeyError. `tec` and
# `lengths` are independent lists with no guaranteed common length, so they
# cannot simply share one frame either.
dfrm = pd.DataFrame({"tec": data["tec"]})
dfrm['occurrence'] = dfrm.groupby('tec')['tec'].transform('count')
# Reassign instead of inplace=True so the statement reads as a plain
# transformation of the frame built just above.
dfrm = dfrm.drop_duplicates('tec')
pf = PlotlyFig(dfrm, mode='notebook', x_title="TEC (10e-6/K)")
pf.histogram(cols=['tec'], n_bins=30)

In [None]:
## AUTOMATMINER ##
from matminer.datasets import load_dataset

# Fetch the benchmark dataset by name through matminer's dataset loader.
df = load_dataset("matbench_expt_gap")

# Let's look at our dataset: summary statistics first, then the number of
# distinct compositions (only this last expression is displayed by the cell).
df.describe()
len(df["composition"].unique())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffled split identical from run to run.
split_options = {"test_size": 0.2, "shuffle": True, "random_state": 20191014}
train_df, test_df = train_test_split(df, **split_options)

In [None]:
# The target has to be a column of `df`. The matbench_expt_gap dataset keeps
# its label under "gap expt" (experimental band gap, in eV); the previous
# value "tec" is not a column of that dataset, so the drop below raised
# KeyError.
# NOTE(review): switch this back once a dataset with a TEC column is loaded.
target = "gap expt"
prediction_df = test_df.drop(columns=[target])
prediction_df.head()

In [None]:
# Summary statistics of the prediction frame (test rows without the target).
prediction_df.describe()

In [None]:
from automatminer import MatPipe

# Build the pipeline from automatminer's preset named "express".
pipe = MatPipe.from_preset("express")

In [None]:
# Fit the pipeline on the training split; `target` names the label column.
pipe.fit(train_df, target)

In [None]:
# Predict from a fresh target-free copy of the test split instead of from
# `prediction_df` itself: the old self-referential form fed the already
# predicted frame back into the pipeline whenever the cell was re-run.
prediction_df = pipe.predict(test_df.drop(columns=[target]))

In [None]:
# Peek at the first rows of the frame returned by pipe.predict.
prediction_df.head()

In [None]:
# SCORE PREDICTIONS
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

# Ground truth and MatPipe predictions for the held-out rows
true = test_df[target]
matpipe_test = prediction_df[target + " predicted"]

# Baseline: a default-configured DummyRegressor fitted on the training
# targets, then asked for predictions on the test compositions
dr = DummyRegressor()
dummy_test = dr.fit(train_df["composition"], train_df[target]).predict(test_df["composition"])

# Score dummy and MatPipe against the same ground truth
mae_dummy = mean_absolute_error(true, dummy_test)
mae_matpipe = mean_absolute_error(true, matpipe_test)

print("Dummy MAE: {} eV".format(mae_dummy))
print("MatPipe MAE: {} eV".format(mae_matpipe))

In [None]:
# Examine inside pipeline
import pprint

# Get a summary and save a copy to json
summary_path = "MatPipe_predict_experimental_TEC_composition_summary.json"
summary = pipe.summarize(filename=summary_path)

# Pretty-print the returned summary object
pprint.pprint(summary)

In [None]:
# Show the `best_pipeline` attribute of the pipeline's learner.
print(pipe.learner.best_pipeline)

In [None]:
# Show the featurizers registered under the "composition" key.
print(pipe.autofeaturizer.featurizers["composition"])

In [None]:
# Save the pipeline to a file in the current working directory
# NOTE(review): the ".p" suffix suggests a pickle file -- confirm the format.
filename = "MatPipe_predict_experimental_TEC_from_composition.p"
pipe.save(filename)

In [None]:
# Load the pipeline back from the file written by the save cell
# NOTE(review): if this is a pickle, only load files from a trusted source,
# since unpickling can execute arbitrary code -- confirm the format.
pipe_loaded = MatPipe.load(filename)