# Combining MP and OQMD Collations, Matching Like Bader Charges

## Goals: 
- Identify MP / OQMD collations with the same structure in order to compare Bader charges.
- Do this by structure matching and by ICSD number.
- Identify redundant OQMD structures in order to know which to prioritize computing spectra for.

## Inputs:
- {}\_{}\_MP_API+Spec+Symm_collations.json: The collations of MP structures which have been decorated with spectra and symmetrized.
- {}\_{}\_OQMD_collations.json: The collations of OQMD structures.

## Outputs: 
- {}\_{}\_MP_OQMD_combined_collations.json : Collations of MP / OQMD structures which have been pruned for redundant structures, with matched MP / OQMD Bader charges.

In [None]:
import os
from pymatgen.ext.matproj import MPRester
# Root of the local TRIXS data tree; the collation files read and written by
# this notebook live in its 'MP_OQMD_combined' subdirectory.
data_prefix = '/Users/steventorrisi/Documents/TRIXS/data/'
storage_directory = os.path.join(data_prefix, 'MP_OQMD_combined')

# Element pairs processed by this notebook. Each pair is used both as a
# dictionary key and to build the per-pair collation file names.
target_elements_groups = [('Co', 'O'), ('Fe', 'O'), ('V', 'O'), ('Cu', 'O'),
                          ('Ni', 'O'), ('Cr', 'O'), ('Mn', 'O'), ('Ti', 'O')]
target_metals = {'Co', 'Ni', 'Fe', 'Cr', 'V', 'Mn', 'Cu', 'Ti'}

# NOTE(security): an API key should not live in source control. Prefer
# exporting PMG_MAPI_KEY in the environment; the literal fallback is kept only
# so existing setups keep working and should be rotated and removed.
mpr = MPRester(os.environ.get('PMG_MAPI_KEY', '80n2gkFfpXbPxZJTxD'))

In [None]:
from pymatgen.ext.matproj import MPRester
from pymatgen.core import Structure
from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator
from trixs.spectra.core import XAS_Spectrum, XAS_Collation
from trixs.spectra.util import NumpyEncoder
from tqdm import tqdm, tqdm_notebook
from pprint import pprint
import json
import os
import numpy as np

# Module-level matcher shared by the ICSD-based and structure-based matching
# cells; species are compared with ElementComparator.
matcher = StructureMatcher(comparator=ElementComparator())
def get_unique_structures(structures, **kwargs):
    """Return the structures that do not match any earlier kept structure.

    Structures are visited in order. A structure is kept only if
    ``StructureMatcher.fit`` reports no match against every structure kept
    so far, so the first member of each group of matching structures is the
    one retained.

    Args:
        structures: Iterable of structures to de-duplicate.
        **kwargs: Extra keyword arguments forwarded to ``StructureMatcher``
            (on top of ``attempt_supercell=True`` and an
            ``ElementComparator`` comparator).

    Returns:
        list: The retained structures, in their original order.
    """
    unique_structs = []
    structure_matcher = StructureMatcher(attempt_supercell=True,
                                         comparator=ElementComparator(),
                                         **kwargs)
    for new_struct in tqdm_notebook(structures, desc='reducing structures'):
        # A generator (rather than a list) lets any() stop at the first match
        # instead of running the matcher against every kept structure.
        if not any(structure_matcher.fit(new_struct, struct)
                   for struct in unique_structs):
            unique_structs.append(new_struct)
    return unique_structs

Helper Functions

In [None]:
def determine_uniqueness(strucs1, struc2):
    """Return True if ``struc2`` matches none of the structures in ``strucs1``.

    Matching uses a ``StructureMatcher`` built with ``attempt_supercell=True``
    and an ``ElementComparator``; the search stops at the first match.
    """
    fitter = StructureMatcher(attempt_supercell=True,
                              comparator=ElementComparator())
    return not any(fitter.fit(known, struc2) for known in strucs1)

def trim_non_alpha(string):
    """Return ``string`` with every non-alphabetic character removed."""
    letters = []
    for char in string:
        if char.isalpha():
            letters.append(char)
    return ''.join(letters)

## Load the MP collations into 'unified collations' and OQMD ones to be sorted in after

In [None]:
def _load_collations(path):
    """Read one XAS_Collation per non-blank line of a JSON-lines file."""
    collations = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a stray newline at end of file).
                continue
            collations.append(XAS_Collation.from_dict(json.loads(line)))
    return collations


# unif_cols starts out holding the MP collations for each element pair;
# oqmd_cols holds the OQMD collations for the same pair.
unif_cols = {pair: [] for pair in target_elements_groups}
oqmd_cols = {pair: [] for pair in target_elements_groups}

# The description is set inside the loop: formatting it in the tqdm call would
# reference `pair` before the loop has bound it.
pair_progress = tqdm_notebook(target_elements_groups)
for pair in pair_progress:
    pair_progress.set_description('Parsing {}'.format(pair))
    file_prefix = os.path.join(storage_directory,
                               '{}_{}'.format(pair[0], pair[1]))
    unif_cols[pair].extend(
        _load_collations(file_prefix + '_MP_API+Spec+Symm_collations.json'))
    oqmd_cols[pair].extend(
        _load_collations(file_prefix + '_OQMD_collations.json'))

## Match Bader charges by ICSD

In [None]:
for pair in target_elements_groups:
    # Map every ICSD id to an MP collation that carries MP Bader charges.
    # Collations without ICSD ids are filed under the key None.
    mp_by_icsd = {None: None}
    mp_with_bader = [col for col in unif_cols[pair] if col.has_mp_bader()]
    for mp_col in mp_with_bader:
        for icsd in (mp_col.icsd_ids or [None]):
            mp_by_icsd[icsd] = mp_col

    # Look up each OQMD collation that has OQMD Bader charges by its ICSD id.
    oqmd_with_bader = [col for col in oqmd_cols[pair] if col.has_oqmd_bader()]
    for oqmd_col in oqmd_with_bader:
        cur_icsd = oqmd_col.icsd_ids
        if not cur_icsd:
            continue
        # The numeric id is taken from the text after the first '-'.
        icsd_num = int(cur_icsd.split('-')[1])
        mp_col = mp_by_icsd.get(icsd_num)
        if not mp_col:
            continue
        # Sharing an ICSD id is not enough: the structures must also match.
        # NOTE: an extra sites_are_ordered(oqmd_struc, mp_struc) check was
        # considered here but is not applied.
        if not matcher.fit(oqmd_col.structure, mp_col.structure):
            continue
        # Copy the OQMD identity and Bader charges onto the MP collation.
        mp_col.oqmd_id = oqmd_col.oqmd_id
        mp_col.associated_ids[oqmd_col.oqmd_id] = 'bader'
        mp_col.oqmd_bader = oqmd_col.oqmd_bader

    new_matches = [col for col in unif_cols[pair] if col.has_oqmd_bader()]
    print('For {}, we found {} MP structures and {} OQMD, '
          'with ICSD-based overlap of {}.'.format(
              pair, len(unif_cols[pair]), len(oqmd_cols[pair]),
              len(new_matches)))
   


# Match MP - OQMD by Structure Matching
## to associate Bader charges 
## and find redundant structures
We attempt to match OQMD Bader charges into MP structures for three reasons.

1. We count structures which weren't matched by ICSD previously.
2. If the structures match, that means we have a duplicate between MP and OQMD; thus, we will not carry the OQMD forward.
3. If the structures match, then the MP structure will get to have an OQMD bader charge associated; this will be useful for later regression.

This also means that we have a duplicate between OQMD and MP; so, we should not carry the OQMD structure forward.



In [None]:
# OQMD ids whose structure matches some MP structure (across all pairs).
redundant_oqmds = set()
for pair in target_elements_groups:
    newly_found = 0         # MP collations given an OQMD Bader charge here
    pair_redundant = set()  # OQMD ids matching an MP structure for this pair
    for col in tqdm_notebook(unif_cols[pair], desc=str(pair)):
        # Only OQMD collations with the same elements that have not already
        # been tied to an MP entry are worth running through the matcher.
        oqmd_bader_candidates = [x for x in oqmd_cols[pair]
                                 if col.elements == x.elements
                                 and x.mp_id is None]

        for candidate in oqmd_bader_candidates:
            if not matcher.fit(col.structure, candidate.structure):
                continue
            # A structural match marks the OQMD entry as a duplicate, whether
            # or not it carries Bader charges.
            pair_redundant.add(candidate.oqmd_id)
            if candidate.has_oqmd_bader():
                col.oqmd_bader = candidate.oqmd_bader
                col.associated_ids[candidate.oqmd_id] = 'bader'
                newly_found += 1
                candidate.mp_id = col.mp_id
                # Stop at the first matching candidate that supplies Bader
                # charges; matches without charges keep the search going.
                break

    redundant_oqmds |= pair_redundant
    # Carry forward only the OQMD collations that matched no MP structure.
    unif_cols[pair] += [oqmd_col for oqmd_col in oqmd_cols[pair]
                        if oqmd_col.oqmd_id not in redundant_oqmds]
    # Report the two counts separately: the redundant count includes matches
    # without Bader charges, which the Bader count does not.
    print("Found for {} {} redundant OQMD structures.".format(
        pair, len(pair_redundant)))
    print("Found for {} {} MP structures newly assigned OQMD Bader charges.".format(
        pair, newly_found))
    print("Found for {} {} total MP_OQMD collations.".format(
        pair, len(unif_cols[pair])))


In [None]:
# Print per-pair counts for the combined collations.
for pair in target_elements_groups:
    cols = unif_cols[pair]
    n_total = len(cols)
    print('------')
    print("Summary for {}".format(pair))
    print("Total collations:", n_total)
    # Collations with no MP id are the OQMD entries that were carried forward.
    print("Total Novel OQMD structures (Bader incl.):",
          sum(1 for c in cols if not c.mp_id))
    print("Number of Bader charges from MP:",
          sum(1 for c in cols if c.has_mp_bader()), '/', n_total)
    print("Number of Bader charges from OQMD:",
          sum(1 for c in cols if c.has_oqmd_bader()))
    print("Structures with both OQMD / MP Bader chg:",
          sum(1 for c in cols if c.has_oqmd_bader() and c.has_mp_bader()))
    print("Structures with any Bader chg:",
          sum(1 for c in cols if c.has_bader()))

In [None]:
# Write each pair's combined collations as JSON lines (one collation per line).
suffix = '_MP_OQMD_combined_collations.json'
for pair in tqdm_notebook(target_elements_groups):
    target = storage_directory + "/{}_{}".format(*pair) + suffix
    with open(target, 'w') as out_file:
        out_file.writelines(json.dumps(collation.as_dict()) + '\n'
                            for collation in unif_cols[pair])