
Getting data from online databases with OPTIMADE
==========

- **Complexity level**: intermediate
- **Requirements**: crystallography, Python, understanding how the internet works



In [None]:
!pip install optimade[http_client]
from optimade.client import OptimadeClient

In [None]:
!pip install ase                                 
import re                                        
from ase import Atom, Atoms                      
                                                 
# Shared OPTIMADE client for the cells below; the keyword argument asks for
# at most 5 results from each provider.
client = OptimadeClient(max_results_per_provider=5)

In [None]:
def optimade_to_ase(structure, skip_disorder=False):
    """Convert an OPTIMADE structure entry into an ASE ``Atoms`` object.

    Parameters
    ----------
    structure : dict or str
        Either a single structure entry (a mapping with an ``attributes``
        key), a response wrapper of the form ``{'data': [entry, ...]}`` (only
        the first entry is converted), or a JSON string encoding one of these.
    skip_disorder : bool
        When false (default), any species listing more than one chemical
        symbol makes the conversion fail. When true, only the first chemical
        symbol of such a species is used.

    Returns
    -------
    tuple
        ``(atoms, None)`` on success, or ``(None, message)`` where *message*
        is a short human-readable description of why the entry was rejected.
    """
    # Imported here so the function does not depend on a notebook cell that
    # may not have imported ``json``.
    import json

    if isinstance(structure, str):
        structure = json.loads(structure)

    # Unwrap a whole-response wrapper *before* validating, otherwise the
    # ``attributes`` lookup below would be done on the wrapper itself.
    if (
        isinstance(structure, dict)
        and isinstance(structure.get('data'), list)
        and structure['data']
    ):
        structure = structure['data'][0]

    attrs = structure.get('attributes') if isinstance(structure, dict) else None
    if (
        not isinstance(attrs, dict)
        or 'cartesian_site_positions' not in attrs
        or 'lattice_vectors' not in attrs
    ):
        return None, 'Invalid structure'

    positions = attrs['cartesian_site_positions'] or []
    species = attrs.get('species') or []

    elems_src, atom_meta = [], {}
    for n, specie in enumerate(species):
        symbols = specie['chemical_symbols']
        symbol = symbols[0]

        # Account for isotopes: deuterium is stored as hydrogen and remembered
        # in ``atom_meta``. A local variable is used so the caller's data is
        # not modified.
        if symbol == 'D':
            symbol = 'H'
            atom_meta[n] = 'D'
        elems_src.append(symbol)

        if not skip_disorder and len(symbols) > 1:
            if 'concentration' not in specie:
                return None, 'Atomic disorder data incomplete'
            return None, 'Structural disorder is not supported'

    # When the species list cannot be matched one-to-one with the sites, fall
    # back to the per-site labels (or, failing that, the element list).
    # TODO link *species_at_sites* <-> *species*
    if len(species) != len(positions):
        elems_src = attrs.get('species_at_sites', attrs.get('elements', [])) or []

    # Report, rather than crash on, an entry with fewer symbols than sites.
    if len(elems_src) < len(positions):
        return None, 'Atom symbols missing for some sites'

    atom_data = []
    for label, pos in zip(elems_src, positions):
        try:
            # Strip every non-word character from the label before handing
            # it to ASE as a chemical symbol.
            atom_data.append(Atom(re.sub(r'\W', '', label), pos))
        except KeyError as exc:
            return None, 'Unrecognized atom symbol: %s' % exc

    if not atom_data:
        return None, 'Atoms missing'

    return Atoms(
        atom_data,
        cell=attrs['lattice_vectors'],
        # Missing/empty ``dimension_types`` falls back to ``True``.
        pbc=attrs.get('dimension_types') or True,
        info=dict(isotopes=atom_meta) if atom_meta else {}
    ), None

In [None]:
from pprint import pprint

# Put your own OPTIMADE filter string here, then ask the client for the
# number of matching entries and pretty-print what it returns.
query = '' # put your query
howmany = client.count(query)
pprint(howmany)


Let's try to prepare our results for a machine-learning task by calculating structure descriptors. Recall that the term *descriptor* stands for a compact, information-rich number that allows convenient mathematical treatment of the complex data it encodes (a crystalline structure, in our case). Descriptors are heavily used in machine learning to statistically predict new information from existing information. Let's have a look at the *atomic packing factor descriptor*, defined per unit cell, as given in the function `get_apf` below. (It is up to the reader to explore other possible descriptors for crystalline structures!)
                                                                                             
Then let's try to do the following:                                                           
                                                                                             
- retrieve one or many structures for certain queries                                        
- visualize any retrieved structure using an online visualizer                               
- calculate the atomic packing factor *descriptor* for the structures                        



In [None]:
import numpy as np                                                                    
from ase.data import covalent_radii, chemical_symbols                                 
                                                                                      
def get_apf(ase_obj):
    """Return the atomic packing factor of *ase_obj*.

    Each atom contributes the volume of a sphere whose radius is the entry of
    ``covalent_radii`` found at the position of the atom's symbol in
    ``chemical_symbols``. The summed sphere volume is divided by the absolute
    value of the determinant of ``ase_obj.cell``.
    """
    sphere_prefactor = 4/3 * np.pi
    occupied = sum(
        (
            sphere_prefactor * covalent_radii[chemical_symbols.index(atom.symbol)]**3
            for atom in ase_obj
        ),
        0.0,
    )
    cell_volume = abs(np.linalg.det(ase_obj.cell))
    return occupied / cell_volume

In [None]:
# Query every provider for structures with the given anonymous formula and
# try to convert each returned entry to an ASE object, printing either the
# converted object or the reason the conversion was rejected.
client = OptimadeClient(max_results_per_provider=5)
query = 'chemical_formula_anonymous="ABCD4"'
results = client.get(query)

for provider, answer in results['structures'][query].items():
    print('=' * 25, provider, '=' * 25)
    for optimade_item in answer['data']:
        ase_obj, error = optimade_to_ase(optimade_item)
        if not error:
            print(ase_obj)
        else:
            print('=' * 25, error)