The purpose of this script is to read .cif files downloaded from Springer Materials, filter out compositions that are not of interest, identify problematic or unusable entries, and write each composition together with its crystal structure(s) to a .csv file.

In [None]:
import glob
import os
import shutil
from collections import defaultdict
from itertools import combinations

import matplotlib as plt
import numpy as np
import pandas as pd
import pymatgen.core as mg
from CifFile import ReadCif
from pymatgen.io import cif as mg_cif
from tqdm import tqdm

# First define some helper functions

In [None]:
def hh_comp_rearrange(comp): # rearrange compositions to match with standard HH naming convention
    """Build a formula string with the elements ordered by periodic-table group.

    Elements are sorted by ascending ``group``; rare-earth elements are given
    the pseudo-group -1 so that they always come first.  Elements sharing a
    group are ordered by the elements' own comparison.  An amount equal to 1
    is omitted from the string, and whole-number amounts are written without
    a trailing ".0" (e.g. "Fe2O3" rather than "Fe2.0O3.0").

    Parameters
    ----------
    comp : mapping of element -> amount
        The keys must expose ``group`` and ``is_rare_earth_metal`` and be
        convertible with ``str()`` (presumably a pymatgen ``Composition``;
        confirm against callers).

    Returns
    -------
    str
        The rearranged formula.  An empty mapping gives an empty string.  If
        ``comp`` cannot be processed at all (for instance it is already a
        plain string), it is returned unchanged.
    """
    try:
        # Sort key is (group, element, amount); rare earths use -1 so they
        # are moved to the front of the composition.
        ordered = sorted(
            (-1 if element.is_rare_earth_metal else element.group, element, amount)
            for element, amount in comp.items()
        )

        pieces = []
        for _, element, amount in ordered:
            if amount == 1:
                pieces.append(str(element))
                continue
            # Drop the ".0" of whole-number float amounts.
            if isinstance(amount, float) and amount.is_integer():
                amount = int(amount)
            pieces.append(str(element) + str(amount))

        return ''.join(pieces)
    except Exception:
        # Deliberate best effort: anything that is not a usable composition is
        # handed back untouched.  Unlike a bare ``except`` this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return comp

def check_cif_equivalent(fname1,fname2): # check for identical / equivalent cif files
    """Return True when two structure files describe (nearly) the same structure.

    Both files are loaded with ``mg.Structure.from_file`` and converted with
    ``as_dict()``.  The structures count as equivalent when

    * every lattice-matrix entry and the cell volume agree within a relative
      tolerance of 0.05,
    * both contain the same number of sites, and
    * for each pair of sites taken in file order, every entry of the site
      dictionary except 'xyz' agrees: numeric entries within an absolute
      tolerance of 0.05, all other entries exactly.

    Limitations that follow from this scheme: sites are compared index by
    index, so the same structure listed in a different site order is reported
    as different, and coordinates are compared as plain numbers without any
    periodic wrap-around.

    Parameters
    ----------
    fname1, fname2 : str
        Paths of the two files to compare.

    Returns
    -------
    bool
    """
    cif1 = mg.Structure.from_file(fname1).as_dict()
    cif2 = mg.Structure.from_file(fname2).as_dict()

    tolerance = 0.05 # used as relative tolerance for the lattice, absolute for sites

    lattice1, lattice2 = cif1['lattice'], cif2['lattice']

    # check if lattice parameters are similar
    if not np.all(np.isclose(lattice1['matrix'], lattice2['matrix'], rtol=tolerance)):
        return False

    # check if volumes are similar
    if not np.isclose(lattice1['volume'], lattice2['volume'], rtol=tolerance):
        return False

    # check if number of sites is the same
    sites1, sites2 = cif1['sites'], cif2['sites']
    if len(sites1) != len(sites2):
        return False

    # check if sublattices are similar
    for site1, site2 in zip(sites1, sites2):
        for key, value1 in site1.items():
            if key == 'xyz': continue # skip absolute coordinates
            value2 = site2[key]
            try:
                # Numeric entries.  np.all also handles scalar and
                # multi-dimensional results, which building a set() from the
                # np.isclose output does not.
                differs = not np.all(np.isclose(value1, value2, atol=tolerance))
            except (TypeError, ValueError):
                # Non-numeric entries (strings, dicts, ...) are compared exactly.
                differs = value1 != value2
            if differs:
                return False

    return True

## Get the list of all downloaded CIFs and create the dictionary that collects the per-file results

In [None]:
# Column name -> list of values; the parsing loop appends one value per column for each kept file.
results = defaultdict(list)

# Every entry found directly inside the download folder.
downloaded_cifs = glob.glob('previous_downloads/*')

# Parse through each cif

In [None]:
# For every downloaded file record: composition, Hermann-Mauguin space group,
# parser error flag, file name and (if available) the prototype name.
# Files that are not three-element 1:1:1 compositions are skipped; files that
# cannot be parsed are kept and flagged with the error status 'FATAL'.
for cif in tqdm(downloaded_cifs):
    try:
        mg_comp = mg.Structure.from_file(cif).composition
        formula = mg_comp.reduced_formula
        mg_comp = mg.Composition(formula)
        # ignore compositions that do not have ABC stoichiometry or have unknown species in the cif file
        dummyspecies = [type(x) for x in mg_comp.elements]
        if set(mg_comp.values()) != {1.0} or mg.periodic_table.DummySpecie in dummyspecies or len(mg_comp.values()) != 3: continue

        cif_obj = mg_cif.CifParser(cif)
        cif_dict = cif_obj.as_dict()
        # NOTE(review): the second data block is assumed to hold the structure data — confirm against the file layout
        cif_key = list(cif_dict.keys())[1]
        hm_spacegroup = cif_dict[cif_key]['_symmetry_space_group_name_H-M']

        # get errors from parsing cif file
        error_status = cif_obj.has_errors

        # try to get the prototype name (optional, so any failure just means "unknown")
        try:
            cif_pycifrw = ReadCif(cif)
            cif_prototype = cif_pycifrw['sm_global']['_sm_phase_prototype']
        except Exception:
            cif_prototype = None

    except Exception: # in the case of corrupt cif files
        # ``except Exception`` instead of a bare ``except`` so that interrupting
        # the loop (Ctrl-C) stops it rather than recording a 'FATAL' file.
        formula = cif
        hm_spacegroup = None
        error_status = 'FATAL'
        cif_prototype = None

    # unify prototype names for easier processing
    if isinstance(cif_prototype, str):
        cif_prototype = cif_prototype.replace(' ','')

    if cif_prototype is not None and 'MgZn2' in cif_prototype and hm_spacegroup == "P63/mmc":
        cif_prototype = 'MgZn2'

    # NOTE: ``formula`` is a plain string here, which hh_comp_rearrange hands back unchanged.
    results['Composition'].append(hh_comp_rearrange(formula))
    results['Space Group'].append(hm_spacegroup)
    results['Errors'].append(error_status)
    # basename instead of split('/')[1]: independent of the path separator glob returns
    results['Filename'].append(os.path.basename(cif))
    results['Prototype'].append(cif_prototype)

# Write results to dataframe and check the dataframe for duplicate compositions and errors. Also, fix compositions.

In [None]:
# Accumulator for the per-composition summary (column name -> list of values).
processed_results = defaultdict(list)

# Table of the per-file records collected in ``results``.
df_results = pd.DataFrame(data=results)

### Go through results for duplicate compositions and errors

In [None]:
# Collapse the per-file table into one summary row per composition:
# the space groups found, the number of files, the representative file(s) and
# prototype(s) per space group, and an error code describing the situation.

# Compositions that already have a summary row (set -> O(1) membership test).
covered_unprocessed_formulas = set()

for _, row in tqdm(df_results.iterrows(),total=len(df_results)):

    comp = row['Composition']

    # skip if composition has already been analyzed
    if comp in covered_unprocessed_formulas: continue
    covered_unprocessed_formulas.add(comp)

    # corrupt cif files get a placeholder row and no further analysis
    if row['Errors'] == 'FATAL':
        error_code = 7
        processed_results['Composition'].append(comp)
        processed_results['Space Groups'].append('ERROR')
        processed_results['Replicates'].append('ERROR')
        processed_results['Error Code'].append(error_code)
        processed_results['Filename'].append([row['Filename']])
        processed_results['Prototype'].append('Error')
        continue

    # get all matches to the particular composition
    matches = df_results.loc[df_results['Composition'] == comp]

    # get set of space groups
    match_space_groups = sorted(set(matches['Space Group']))
    comp_new_filenames = []
    comp_prototypes = []

    for sg in match_space_groups:
        # find all instances of a given space group
        sg_rows = matches[matches['Space Group'] == sg]
        sg_paths = np.array(sg_rows['Filename'])
        sg_prototypes = np.array(sg_rows['Prototype'])

        # identify equivalent cifs: a file is "repeated" when it is equivalent
        # to a file that comes earlier in the list
        repeated_prototypes = set()
        for i0, i1 in combinations(range(len(sg_paths)),2):
            fname1 = f'previous_downloads/{sg_paths[i0]}'
            fname2 = f'previous_downloads/{sg_paths[i1]}'
            if check_cif_equivalent(fname1,fname2):
                repeated_prototypes.add(i1)

        # Keep every file that is not a repeat.  The filter is applied only
        # after ALL pairs were compared, so a file cannot be kept just because
        # its duplicate status was discovered late.  Index 0 is never a
        # repeat, so a single file (or a group of identical files) yields
        # exactly the first file.
        unique_prototypes = [idx for idx in range(len(sg_paths)) if idx not in repeated_prototypes]

        new_filename = list(sg_paths[unique_prototypes])
        prototype = list(sg_prototypes[unique_prototypes])
        # this next line implies that prototypes labeled the same name with the same space group are identical!
        if len(set(prototype)) == 1: prototype = [prototype[0]]

        comp_new_filenames.append(new_filename)
        comp_prototypes.append(prototype)


    # determine the error code
    ## 0 = no errors, single space group, single prototype
    ## 1 = no errors, single space group, multiple prototypes
    ## 2 = no errors, multiple space groups, (multiple prototypes)
    ## 3 = errors with at least one .cif, single space group, single prototype
    ## 4 = errors with at least one .cif, single space group, multiple prototypes
    ## 5 =  errors with at least one .cif, multiple space groups, (multiple prototypes)
    ## 6 = other error (never assigned: the cases above cover every combination)
    ## 7 = .cif file(s) couldn't be opened (FATAL)

    space_group_conflicts = len(match_space_groups) != 1
    # only meaningful when there is a single space group
    single_prototype = (not space_group_conflicts) and len(comp_prototypes[0]) == 1
    any_errors = True in list(matches['Errors'])

    # structure part of the code: 0 / 1 / 2 as listed above
    if space_group_conflicts: error_code = 2
    elif single_prototype: error_code = 0
    else: error_code = 1
    # codes 3-5 are the same three cases with at least one parser error
    if any_errors: error_code += 3

    processed_results['Composition'].append(hh_comp_rearrange(mg.Composition(comp)))
    processed_results['Space Groups'].append(list(match_space_groups))
    processed_results['Replicates'].append(len(matches))
    processed_results['Error Code'].append(error_code)
    processed_results['Filename'].append(str(comp_new_filenames))
    processed_results['Prototype'].append(str(comp_prototypes))

# Write processed results to .csv

In [None]:
# Turn the per-composition summary into a table and save it next to the other data files.
df_processed_results = pd.DataFrame(data=processed_results)
df_processed_results.to_csv(path_or_buf='../Data Files/springer_results.csv')