# OQMD Structure Filter
# Goals:
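- Extract all OQMD structures which contain the relevant target elements from the database provided by Murat Aykol. In this case, that means at least one of the target transition metals (Co, Fe, V, Cu, Ni, Cr, Mn, Ti) together with oxygen; only entries whose stored energy is negative are kept.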
- Associate these OQMD structures to Bader charges provided by Brian Rohr.
- Associate these OQMD structures to ICSD labels provided by Brian Rohr.


## Uses:
- Murat-provided OQMD database mirror, oqmd1.2_structs
- Murat-provided OQMD energies mirror, oqmd1.2_energies
- Brian-provided list of OQMD-ICSD associations
- Brian-provided list of OQMD-Bader charge associations

## Outputs:

{}\_{}\_OQMD\_collations.json (one file per metal-oxygen pair, e.g. Co\_O\_OQMD\_collations.json): File containing the structures in XAS_Collation form, one JSON object per line, serialized as dictionaries using the as_dict() method built into MSONables. This is the preferred format for later steps in the pre-processing.

##### Saved as well, but optional:
{}\_{}\_OQMD\_structures.json (one file per metal-oxygen pair): File containing the structures, one JSON object per line, stored as dictionaries in the pymatgen Structure as_dict() format, with the added 'id', 'icsd' and 'baders' keys where available.


In [1]:
import os

# Root directory that holds every input and output used by this notebook.
data_prefix = '/Users/steventorrisi/Documents/TRIXS/data/'

# Filename stems of the OQMD structure / energy dumps. A numeric suffix and
# '.json' are appended to each stem when the individual files are read.
oqmd_struct_string = os.path.join(data_prefix, 'oqmd1.2_structures/oqmd1.2.structs_')
oqmd_nrg_string = os.path.join(data_prefix, 'oqmd1.2_structures/oqmd1.2.energies_')
# CSV file mapping OQMD entry ids to ICSD labels.
oqmd_icsd_string = os.path.join(data_prefix, 'oqmd1.2_structures/icsd_entry_id_mapping.csv')
# Directory of per-entry Bader charge arrays ('<id>_charges.npy').
oqmd_bader_dir = os.path.join(data_prefix, 'OQMD_bader')
# Directory the filtered output files are written to.
storage_directory = os.path.join(data_prefix, 'Pre-Processing')

# Will attempt to associate the ICSD labels / Bader charges with each
# matching structure when the corresponding flag is True.
associate_icsd = True
associate_bader = True

# (metal, 'O') chemistries to collect; one output bucket is kept per pair.
target_elements_groups = [('Co', 'O'), ('Fe', 'O'), ('V', 'O'), ('Cu', 'O'),
                          ('Ni', 'O'), ('Cr', 'O'), ('Mn', 'O'), ('Ti', 'O')]
# Derived from the pairs above so the two definitions cannot drift apart.
target_metals = {metal for metal, _ in target_elements_groups}
O = {'O'}
# If exclusive, will only include structures with all of the elements in each tuple
# NOTE(review): this flag is not read by any cell visible in this notebook.
exclusive = True

## Set up files

In [2]:
import json
import os
import numpy as np
from tqdm import tqdm_notebook
from pymatgen.core.structure import Structure
from trixs.spectra.core import XAS_Collation

# Load in and index ICSD ids.
# The mapping file is read as comma-separated text whose first row is a
# header. For every following row the first column is parsed as an integer
# OQMD entry id and the second column is stored, as a string, as its ICSD
# label.
with open(oqmd_icsd_string, 'r') as f:
    # Blank lines (e.g. an empty line at the end of the file) are skipped so
    # that int('') cannot abort the load.
    the_lines = [line.strip().split(',') for line in f if line.strip()]

# Skip the header row; a later row with the same OQMD id overwrites an
# earlier one.
oqmd_to_icsd = {int(row[0]): row[1] for row in the_lines[1:]}


### Loop through to find all structures which correspond to this chemistry

In [3]:
# One bucket of matching structure dictionaries per (metal, 'O') pair.
filtered_structures = {pair: [] for pair in target_elements_groups}

# Intervals define the individual file handles: every structure / energy
# file name ends in one of these numbers followed by '.json'.
intervals = list(range(50000, 850000, 50000))
intervals.append(824909)

# Loop through individual files
for interval in tqdm_notebook(intervals):
    suffix = str(interval) + '.json'
    struct_str = oqmd_struct_string + suffix
    energy_str = oqmd_nrg_string + suffix

    # Context managers close each handle as soon as its file is parsed
    # instead of leaving it open until garbage collection.
    with open(struct_str, 'r') as struct_file:
        strucs = json.load(struct_file)
    with open(energy_str, 'r') as energy_file:
        energies = json.load(energy_file)

    # Main loop
    for qmid, struc in strucs.items():
        present_species = {spec['label'] for spec in struc['sites']}
        matched_metals = target_metals & present_species
        is_oxide = O.issubset(present_species)
        # A negative stored energy is the stability criterion used here.
        # NOTE(review): presumably a formation energy -- confirm against the
        # contents of the energies files.
        is_stable = energies[qmid] < 0

        # Decide if structure is relevant
        if not (matched_metals and is_oxide and is_stable):
            continue

        # Annotate the structure once, no matter how many target metals it
        # contains, so the Bader file is read from disk at most once.
        # Generate OQMD ID string
        struc['id'] = 'oqmd-' + str(qmid)
        # Associate ICSD
        if associate_icsd:
            struc['icsd'] = oqmd_to_icsd.get(int(qmid), None)
        # Associate Bader
        if associate_bader:
            bader_target = os.path.join(oqmd_bader_dir, str(qmid) + '_charges.npy')
            if os.path.exists(bader_target):
                # tolist() converts the array to built-in Python numbers, which
                # json.dumps can serialize regardless of the array dtype.
                struc['baders'] = np.load(bader_target).tolist()

        # Append to list: the same dictionary object is shared by every
        # bucket whose metal appears in the structure.
        for metal in matched_metals:
            filtered_structures[(metal, 'O')].append(struc)

for pair in target_elements_groups:
    print("Found:",len(filtered_structures[pair]),' structures for ',pair)

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


Found: 1254  structures for  ('Co', 'O')
Found: 1697  structures for  ('Fe', 'O')
Found: 1819  structures for  ('V', 'O')
Found: 1836  structures for  ('Cu', 'O')
Found: 1221  structures for  ('Ni', 'O')
Found: 1418  structures for  ('Cr', 'O')
Found: 2251  structures for  ('Mn', 'O')
Found: 1605  structures for  ('Ti', 'O')


# Write OQMD Structures
Each output file is written with one individual JSON object per line. In the structures files every object is in the pymatgen Structure dictionary format; load it with pymatgen.core.structure.Structure.from_dict().

In [4]:
# Make sure the output directory exists before any file is opened, so that a
# missing folder does not abort the export after the filtering has finished.
os.makedirs(storage_directory, exist_ok=True)

# Write Collations (For future pre-processing use)
# One file per (metal, 'O') pair, one JSON object per line.
for pair in tqdm_notebook(target_elements_groups):
    file_name = "{}_{}_OQMD_collations.json".format(pair[0], pair[1])
    write_target = os.path.join(storage_directory, file_name)
    with open(write_target, 'w') as f:
        for struc in filtered_structures[pair]:
            cur_struc = Structure.from_dict(struc)
            if struc.get('baders', None):
                # Pair each Bader value with the species at the same
                # position; zip stops at the shorter of the two sequences.
                # NOTE(review): assumes the charges are stored in site order
                # -- confirm against the Bader data.
                cur_baders = [(str(specie), bader)
                              for specie, bader in zip(cur_struc.species, struc['baders'])]
            else:
                # No Bader data was found (or it was empty) for this entry.
                cur_baders = []
            cur_col = XAS_Collation(structure=cur_struc,
                                    oqmd_id=struc['id'],
                                    icsd_ids=struc.get('icsd', None),
                                    oqmd_bader=cur_baders)
            f.write(json.dumps(cur_col.as_dict()) + '\n')

# Write Structures (For optional use)
# Same layout as above, but each line is the annotated structure dictionary
# itself rather than an XAS_Collation.
for pair in tqdm_notebook(target_elements_groups):
    file_name = "{}_{}_OQMD_structures.json".format(pair[0], pair[1])
    write_target = os.path.join(storage_directory, file_name)
    with open(write_target, 'w') as f:
        for struc in filtered_structures[pair]:
            f.write(json.dumps(struc) + '\n')
            

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


