In [11]:
import os
from pymatgen.core import Structure
from collections import defaultdict

def get_compound_info(structure):
    """
    Extract compound name and space group from a pymatgen Structure object.
    """
    compound_name = structure.composition.reduced_formula
    space_group = structure.get_space_group_info()[0]
    return compound_name, space_group

def main(folder_path):
    compound_files = defaultdict(list)
    full_list = []
    
    # Iterate through all .cif files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".cif"):
            file_path = os.path.join(folder_path, filename)
            try:
                # Read the .cif file using pymatgen
                structure = Structure.from_file(file_path)
                compound_name, space_group = get_compound_info(structure)
                
                # Save the complete list of compound names and space groups
                full_list.append((compound_name, space_group))
                
                # Record the compound name, space group, and corresponding file name
                compound_files[(compound_name, space_group)].append(filename)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
                # Skip this compound if there's an error
                continue
    
    # Generate a list of unique compound names and space groups
    unique_compounds = list(set(full_list))
    
    # Find compounds with multiple .cif files and count them
    duplicates = {comp: (len(files), comp[1]) for comp, files in compound_files.items() if len(files) > 1}
    
    # Return the three lists
    return full_list, unique_compounds, duplicates



# Run the function and receive the returned lists
folder_path = '/Users/yanjunliu/Documents/MEAI_TSM/resources/cifs/CIF'  # Modify with your folder path
full_list, unique_compounds, duplicates = main(folder_path)


  CIF={'Bi': 2.0, 'Ce': 1.0, 'Zn': 0.47}
  PMG={'Ce': 2.0, 'Zn': 0.95, 'Bi': 4.0}
  ratios={'Zn': 2.021276595744681, 'Ce': 2.0, 'Bi': 2.0}
  CIF={'Ag': 1.5, 'Mn': 1.0, 'O': 2.0, 'Se': 2.0, 'Sr': 2.0}
  PMG={'Sr': 4.0, 'Mn': 2.0, 'Ag': 3.0676, 'Se': 4.0, 'O': 4.0}
  ratios={'Mn': 2.0, 'Se': 2.0, 'Sr': 2.0, 'O': 2.0, 'Ag': 2.0450666666666666}
Species occupancies sum to more than 1!
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode181530.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode150382.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Sb': 2.0, 'Sm': 1.0, 'Zn': 0.62}
  PMG={'Sm': 2.0, 'Zn': 1.23, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Sm': 2.0, 'Zn': 1.9838709677419355}
  CIF={'Ce': 1.0, 'Ga': 3.89, 'Zn': 1.05}
  PMG={'Ce': 4.0, 'Zn': 4.214000000000001, 'Ga': 15.960000000000003}
  ratios={'Zn': 4.0133333333333345, 'Ga': 4.102827763496144, 'Ce': 4.0}


Error reading NbSiSb_646436 - Kopie(umbauNbBTe).cif: 


  CIF={'Bi': 0.27, 'Pr': 2.0, 'Sb': 1.73, 'Te': 2.0}
  PMG={'Pr': 2.0, 'Sb': 1.712, 'Bi': 0.288, 'Te': 2.0}
  ratios={'Sb': 0.9895953757225433, 'Pr': 1.0, 'Bi': 1.0666666666666664, 'Te': 1.0}
  CIF={'Fe': 3.0, 'Ge': 1.0, 'Te': 2.0}
  PMG={'Fe': 5.648, 'Ge': 5.934, 'Te': 4.0}
  ratios={'Ge': 5.934, 'Te': 2.0, 'Fe': 1.8826666666666665}
  CIF={'Ni': 0.6, 'Sb': 2.0, 'Tb': 1.0}
  PMG={'Tb': 2.0, 'Ni': 1.192, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Tb': 2.0, 'Ni': 1.9866666666666666}


Error reading ICSD_CollCode415616.cif: 'NoneType' object is not subscriptable


  CIF={'As': 2.0, 'Pr': 1.0, 'Zn': 0.667}
  PMG={'Pr': 1.996, 'Zn': 1.322, 'As': 4.0}
  ratios={'Zn': 1.9820089955022488, 'Pr': 1.996, 'As': 2.0}
  CIF={'As': 1.0, 'Fe': 1.0, 'Li': 1.0}
  PMG={'Li': 1.994, 'Fe': 2.0, 'As': 1.99}
  ratios={'Li': 1.994, 'As': 1.99, 'Fe': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Cu': 4.0, 'O': 1.5, 'P': 4.0, 'Pr': 3.0}
  PMG={'Pr': 6.0, 'Cu': 8.0, 'P': 8.0, 'O': 2.976}
  ratios={'P': 2.0, 'Pr': 2.0, 'O': 1.984, 'Cu': 2.0}


Error reading ICSD_CollCode247686.cif: Invalid CIF file with no structures!


  CIF={'As': 1.43, 'Te': 0.46, 'Zr': 1.0}
  PMG={'Zr': 2.01, 'As': 2.85, 'Te': 0.91}
  ratios={'Zr': 2.01, 'As': 1.9930069930069931, 'Te': 1.9782608695652173}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading EntryWithCollCode168881 (1).cif: Invalid CIF file with no structures!


  CIF={'In': 2.0, 'Mg': 1.0, 'Te': 4.0}
  PMG={'Mg': 2.0100000000000002, 'In': 3.99, 'Te': 8.0}
  ratios={'Mg': 2.0100000000000002, 'In': 1.995, 'Te': 2.0}
  CIF={'Cd': 0.66, 'Pr': 1.0, 'Sb': 2.0}
  PMG={'Pr': 2.0, 'Cd': 1.33, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Cd': 2.015151515151515, 'Pr': 2.0}
  CIF={'Ni': 0.57, 'Sb': 2.0, 'Y': 1.0}
  PMG={'Y': 2.0, 'Ni': 1.134, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Ni': 1.9894736842105263, 'Y': 2.0}
  CIF={'Bi': 0.84, 'Ce': 1.0, 'Ni': 0.8, 'Sb': 1.16}
  PMG={'Ce': 2.0, 'Ni': 1.6046, 'Bi': 1.6858, 'Sb': 2.3142}
  ratios={'Ni': 2.00575, 'Sb': 1.995, 'Ce': 2.0, 'Bi': 2.006904761904762}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Nd': 1.0, 'Sb': 2.0, 'Zn': 0.61}
  PMG={'Nd': 2.0, 'Zn': 1.228, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Zn': 2.0131147540983605, 'Nd': 2.0}
  CIF={'Al': 3.0, 'Ce': 1.0, 'Cu': 0.96}
  PMG={'Ce': 2.0, 'Al': 6.08, 'Cu': 1.92}
  ratios={'Ce': 2.0, 'Cu': 2.0, 'Al': 2.026666666666667}
  CIF={'Pr': 1.0,

Error reading ICSD_CollCode4652.cif: Invalid CIF file with no structures!


  CIF={'Ba': 1.0, 'Bi': 2.0, 'Nb': 2.0, 'O': 9.0}
  PMG={'Ba': 2.02, 'Bi': 3.98, 'Nb': 4.0, 'O': 18.0}
  ratios={'Ba': 2.02, 'Nb': 2.0, 'O': 2.0, 'Bi': 1.99}
  CIF={'Pt': 1.2, 'Si': 2.8, 'Sr': 1.0}
  PMG={'Sr': 2.0, 'Si': 5.62, 'Pt': 2.3800000000000003}
  ratios={'Si': 2.007142857142857, 'Pt': 1.9833333333333336, 'Sr': 2.0}
  CIF={'Fe': 1.0, 'Se': 0.92}
  PMG={'Fe': 2.0, 'Se': 1.82}
  ratios={'Se': 1.9782608695652173, 'Fe': 2.0}


Error reading ICSD_CollCode168881 (1).cif: Invalid CIF file with no structures!


  CIF={'Bi': 2.2, 'Nb': 1.9, 'O': 9.0, 'Sc': 0.1, 'Sr': 0.8}
  PMG={'Sr': 1.6200000000000006, 'Bi': 4.38, 'Nb': 3.8, 'Sc': 0.2, 'O': 18.0}
  ratios={'Sc': 2.0, 'Nb': 2.0, 'Bi': 1.9909090909090907, 'Sr': 2.0250000000000004, 'O': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode73779.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Au': 6.0, 'Ba': 1.0, 'Zn': 7.0}
  PMG={'Ba': 4.0, 'Zn': 28.27199999999999, 'Au': 23.728000000000012}
  ratios={'Ba': 4.0, 'Zn': 4.038857142857141, 'Au': 3.9546666666666686}
  CIF={'Bi': 2.0, 'Ce': 1.0, 'Cu': 0.71}
  PMG={'Ce': 2.0, 'Cu': 1.411, 'Bi': 4.0}
  ratios={'Ce': 2.0, 'Bi': 2.0, 'Cu': 1.987323943661972}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'As': 1.67, 'Cu': 1.05, 'P': 0.33, 'Sm': 1.0}
  PMG={'Sm': 2.0, 'Cu': 2.0919999999999996, 'As': 3.3339999999999996, 'P': 0.6659999999999999}
  ratios={'Sm': 2.0, 'P': 2.018181818181818, 'As': 1.9964071856287424, 'Cu': 1.992380952380952}


Error reading ICSD_CollCode95613.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode193327.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Eu': 1.0, 'Ni': 1.53, 'Sb': 2.0}
  PMG={'Eu': 2.0, 'Ni': 3.064, 'Sb': 3.98}
  ratios={'Sb': 1.99, 'Eu': 2.0, 'Ni': 2.0026143790849673}


Error reading La2Fe4Sb5_test.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Cu': 1.0, 'S': 1.0, 'Tl': 1.0}
  PMG={'Tl': 2.0, 'Cu': 1.94, 'S': 2.0}
  ratios={'Tl': 2.0, 'S': 2.0, 'Cu': 1.94}


Error reading ICSD_CollCode173302.cif: Invalid CIF file with no structures!




Error reading EntryWithCollCode168881.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode409664.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode173301.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode162556.cif: Invalid CIF file with no structures!


  CIF={'Ba': 0.81, 'Cl': 1.0, 'F': 1.0, 'Sr': 0.19}
  PMG={'Sr': 0.388, 'Ba': 1.612, 'Cl': 2.0, 'F': 2.0}
  ratios={'Ba': 1.9901234567901234, 'F': 2.0, 'Sr': 2.042105263157895, 'Cl': 2.0}
  CIF={'Ba': 1.0, 'Bi': 3.0, 'Br': 1.0, 'Nb': 2.0, 'O': 11.0, 'Pb': 1.0}
  PMG={'Ba': 0.9899999999999999, 'Bi': 3.01, 'Nb': 2.0, 'Pb': 1.0, 'Br': 1.0, 'O': 11.0}
  ratios={'Ba': 0.9899999999999999, 'Nb': 1.0, 'Bi': 1.0033333333333332, 'Pb': 1.0, 'Br': 1.0, 'O': 1.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading EntryWithCollCode168882.cif: Invalid CIF file with no structures!


  CIF={'Ag': 2.8, 'Eu': 1.0, 'Na': 0.2, 'Te': 4.0}
  PMG={'Eu': 1.0, 'Ag': 2.79, 'Na': 0.21, 'Te': 4.0}
  ratios={'Eu': 1.0, 'Te': 1.0, 'Ag': 0.9964285714285716, 'Na': 1.0499999999999998}
  CIF={'Al': 6.0, 'Au': 2.25, 'Eu': 2.0, 'Si': 2.75}
  PMG={'Eu': 4.0, 'Al': 12.0, 'Au': 4.508, 'Si': 5.48}
  ratios={'Eu': 2.0, 'Si': 1.992727272727273, 'Au': 2.0035555555555558, 'Al': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'As': 2.0, 'Ba': 0.9, 'Fe': 2.0, 'Sn': 0.1}
  PMG={'Ba': 1.798, 'Fe': 4.0, 'Sn': 0.204, 'As': 4.0}
  ratios={'Ba': 1.9977777777777779, 'Sn': 2.0399999999999996, 'As': 2.0, 'Fe': 2.0}


Error reading ICSD_CollCode75793.cif: Invalid CIF file with no structures!


  CIF={'Fe': 1.13, 'S': 0.05, 'Te': 0.95}
  PMG={'Fe': 2.2460000000000004, 'Te': 1.896, 'S': 0.104}
  ratios={'S': 2.0799999999999996, 'Te': 1.9957894736842106, 'Fe': 1.987610619469027}
  CIF={'As': 1.0, 'Ca': 0.29, 'Fe': 1.0, 'O': 3.0, 'Sr': 1.71, 'V': 1.0}
  PMG={'Sr': 3.432, 'Ca': 0.5680000000000001, 'V': 2.0, 'Fe': 2.0, 'As': 2.0, 'O': 6.0}
  ratios={'V': 2.0, 'As': 2.0, 'Ca': 1.9586206896551728, 'Sr': 2.007017543859649, 'O': 2.0, 'Fe': 2.0}
  CIF={'Fe': 1.04, 'Se': 0.34, 'Te': 0.66}
  PMG={'Fe': 2.0700000000000003, 'Te': 1.31, 'Se': 0.688}
  ratios={'Te': 1.9848484848484849, 'Se': 2.0235294117647054, 'Fe': 1.9903846153846156}
'_atom_site_label'
No structure parsed for section 1 in CIF.
'_atom_site_label'
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode196412.cif: Invalid CIF file with no structures!


'_atom_site_label'
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Spacegroup from _symmetry_space_group_name_H-M used.
No _symmetry_equiv_pos_as_xyz type key found. Defaulting to P1.
  CIF={'Ba': 2.0, 'Cu': 0.89, 'Mn': 2.0, 'O': 4.0, 'S': 1.0}
  PMG={'Ba': 4.0, 'Mn': 4.0, 'Cu': 1.798, 'S': 2.0, 'O': 8.0}
  ratios={'Mn': 2.0, 'Cu': 2.020224719101124, 'Ba': 2.0, 'S': 2.0, 'O': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading La2Cu4Sb5_168881.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode75791.cif: Invalid CIF file with no structures!


  CIF={'Fe': 0.6, 'Sb': 2.0, 'U': 1.0}
  PMG={'U': 2.0, 'Fe': 1.22, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'U': 2.0, 'Fe': 2.033333333333333}
  CIF={'As': 1.0, 'Ca': 0.11, 'Fe': 1.0, 'O': 3.0, 'Sr': 1.89, 'V': 1.0}
  PMG={'Sr': 3.774, 'Ca': 0.22600000000000003, 'V': 2.0, 'Fe': 2.0, 'As': 2.0, 'O': 6.0}
  ratios={'V': 2.0, 'As': 2.0, 'Ca': 2.0545454545454547, 'Sr': 1.9968253968253968, 'O': 2.0, 'Fe': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode173295.cif: Invalid CIF file with no structures!


  CIF={'As': 1.0, 'Ca': 0.2, 'Fe': 1.0, 'O': 3.0, 'Sr': 1.8, 'V': 1.0}
  PMG={'Sr': 3.592, 'Ca': 0.40800000000000003, 'V': 2.0, 'Fe': 2.0, 'As': 2.0, 'O': 6.0}
  ratios={'V': 2.0, 'As': 2.0, 'Ca': 2.04, 'Sr': 1.9955555555555555, 'O': 2.0, 'Fe': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Fe': 1.0, 'Se': 0.44, 'Te': 0.56}
  PMG={'Fe': 2.0, 'Te': 1.1252, 'Se': 0.8748}
  ratios={'Te': 2.009285714285714, 'Se': 1.9881818181818183, 'Fe': 2.0}


Error reading ICSD_CollCode239734.cif: Invalid CIF file with no structures!
Error reading ThFeNAs.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Mn': 0.88, 'Pd': 2.87}
  PMG={'Mn': 3.504, 'Pd': 11.496}
  ratios={'Mn': 3.981818181818182, 'Pd': 4.005574912891986}
  CIF={'As': 2.0, 'La': 1.0, 'Zn': 0.66}
  PMG={'La': 4.0, 'Zn': 2.652, 'As': 8.0}
  ratios={'Zn': 4.0181818181818185, 'As': 4.0, 'La': 4.0}
  CIF={'As': 1.0, 'Fe': 1.0, 'Li': 1.0}
  PMG={'Li': 2.2, 'Fe': 2.0, 'As': 2.0}
  ratios={'Li': 2.2, 'As': 2.0, 'Fe': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode173296.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode163244.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Ag': 2.0, 'Co': 1.0, 'O': 2.0, 'Se': 2.0, 'Sr': 2.0}
  PMG={'Sr': 4.0, 'Co': 2.0, 'Ag': 3.928, 'Se': 4.0, 'O': 4.0}
  ratios={'Co': 2.0, 'Se': 2.0, 'Sr': 2.0, 'O': 2.0, 'Ag': 1.964}


Error reading ICSD_CollCode239733.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode168881.cif: Invalid CIF file with no structures!


  CIF={'Fe': 1.0, 'Se': 0.42, 'Te': 0.58}
  PMG={'Fe': 2.0, 'Se': 0.832, 'Te': 1.168}
  ratios={'Te': 2.013793103448276, 'Se': 1.980952380952381, 'Fe': 2.0}
  CIF={'Co': 1.0, 'Cu': 2.0, 'O': 2.0, 'Se': 2.0, 'Sr': 2.0}
  PMG={'Sr': 4.0, 'Co': 2.0, 'Cu': 3.98, 'Se': 4.0, 'O': 4.0}
  ratios={'Co': 2.0, 'Se': 2.0, 'Cu': 1.99, 'Sr': 2.0, 'O': 2.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Sb': 2.0, 'Ti': 3.0, 'Zr': 1.0}
  PMG={'Zr': 2.056, 'Ti': 5.944, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'Zr': 2.056, 'Ti': 1.9813333333333334}


Error reading ICSD_CollCode186542.cif: Invalid CIF file with no structures!
Error reading ICSD_CollCode170940.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Sb': 1.0, 'Ti': 1.75, 'Zr': 0.25}
  PMG={'Zr': 0.996, 'Ti': 7.004, 'Sb': 4.0}
  ratios={'Sb': 4.0, 'Zr': 3.984, 'Ti': 4.002285714285714}


Error reading ICSD_CollCode173296 (1).cif: Invalid CIF file with no structures!


  CIF={'Bi': 0.26, 'Ce': 1.0, 'Ni': 0.75, 'Sb': 1.74}
  PMG={'Ce': 2.0, 'Ni': 1.496, 'Bi': 0.53, 'Sb': 3.4699999999999998}
  ratios={'Ni': 1.9946666666666666, 'Sb': 1.9942528735632183, 'Ce': 2.0, 'Bi': 2.0384615384615383}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode169251.cif: Invalid CIF file with no structures!


  CIF={'Br': 1.0, 'Cu': 1.0, 'Te': 1.0}
  PMG={'Cu': 15.531999999999993, 'Te': 16.0, 'Br': 16.0}
  ratios={'Br': 16.0, 'Cu': 15.531999999999993, 'Te': 16.0}
  CIF={'In': 0.8, 'La': 1.0, 'Sb': 2.0}
  PMG={'La': 2.0, 'In': 1.624, 'Sb': 4.0}
  ratios={'Sb': 2.0, 'In': 2.03, 'La': 2.0}
  CIF={'As': 0.96, 'Ce': 1.0, 'Se': 1.03}
  PMG={'Ce': 2.0, 'As': 2.0, 'Se': 2.0}
  ratios={'Ce': 2.0, 'As': 2.0833333333333335, 'Se': 1.941747572815534}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading ICSD_CollCode152825.cif: Invalid CIF file with no structures!
Error reading EntryWithCollCode168881 (3).cif: Invalid CIF file with no structures!


  CIF={'F': 2.0, 'O': 7.0, 'Ru': 1.0, 'Sr': 3.0, 'Ti': 1.0}
  PMG={'Sr': 6.0, 'Ru': 2.0, 'Ti': 2.0, 'O': 14.039999999999997, 'F': 3.9600000000000017}
  ratios={'Ti': 2.0, 'F': 1.9800000000000009, 'Ru': 2.0, 'O': 2.0057142857142853, 'Sr': 2.0}
  CIF={'Pd': 0.6, 'Sb': 2.0, 'U': 1.0}
  PMG={'U': 2.0, 'Sb': 4.0, 'Pd': 1.206}
  ratios={'Sb': 2.0, 'U': 2.0, 'Pd': 2.0100000000000002}
  CIF={'As': 2.0, 'Ba': 0.683, 'Fe': 2.0, 'K': 0.244, 'Sn': 0.073}
  PMG={'Ba': 1.366, 'K': 0.488, 'Fe': 4.0, 'Sn': 0.148, 'As': 4.0}
  ratios={'Ba': 2.0, 'As': 2.0, 'Sn': 2.0273972602739727, 'K': 2.0, 'Fe': 2.0}
  CIF={'Co': 0.88, 'Ga': 3.0, 'Ge': 1.0, 'Y': 1.0}
  PMG={'Y': 4.0, 'Ga': 12.0, 'Co': 3.507200000000001, 'Ge': 3.999999999999999}
  ratios={'Ga': 4.0, 'Co': 3.9854545454545467, 'Ge': 3.999999999999999, 'Y': 4.0}
  CIF={'Ni': 5.76, 'Sn': 1.0, 'Te': 2.0}
  PMG={'Ni': 11.584000000000003, 'Sn': 2.0, 'Te': 4.0}
  ratios={'Sn': 2.0, 'Ni': 2.011111111111112, 'Te': 2.0}
  CIF={'Ho': 1.0, 'Ni': 0.6, 'Sb': 2.0}
  

Error reading MnTe2_longscan_add_prof.cif: Invalid CIF file with no structures!


  CIF={'As': 2.0, 'Ca': 0.82, 'Fe': 1.0, 'La': 0.18}
  PMG={'Ca': 1.636, 'La': 0.364, 'Fe': 2.0, 'As': 4.0}
  ratios={'Fe': 2.0, 'As': 2.0, 'La': 2.022222222222222, 'Ca': 1.9951219512195122}


Error reading EntryWithCollCode168881 (2).cif: Invalid CIF file with no structures!


  CIF={'As': 2.0, 'Ba': 0.606, 'Fe': 2.0, 'K': 0.349, 'Sn': 0.045}
  PMG={'Ba': 1.212, 'K': 0.698, 'Fe': 4.0, 'Sn': 0.088, 'As': 4.0}
  ratios={'Ba': 2.0, 'As': 2.0, 'Sn': 1.9555555555555555, 'K': 2.0, 'Fe': 2.0}
  CIF={'As': 1.01, 'Ce': 1.0, 'Se': 0.99}
  PMG={'Ce': 2.0, 'As': 2.0, 'Se': 2.0}
  ratios={'Ce': 2.0, 'As': 1.9801980198019802, 'Se': 2.0202020202020203}
  CIF={'Nd': 2.0, 'Pd': 1.72, 'Sb': 4.0}
  PMG={'Nd': 2.0, 'Sb': 4.0, 'Pd': 1.7}
  ratios={'Sb': 1.0, 'Pd': 0.9883720930232558, 'Nd': 1.0}
No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!


Error reading EntryWithCollCode170939.cif: Invalid CIF file with no structures!


No structure parsed for section 1 in CIF.
Species occupancies sum to more than 1!
  CIF={'Ag': 2.8, 'Eu': 1.0, 'Na': 0.2, 'Te': 4.0}
  PMG={'Na': 1.0, 'Eu': 1.0, 'Ag': 2.0, 'Te': 4.0}
  ratios={'Eu': 1.0, 'Te': 1.0, 'Ag': 0.7142857142857143, 'Na': 5.0}


Error reading ICSD_CollCode244424.cif: Invalid CIF file with no structures!
Error reading HfCuSi2_87174.cif: could not convert string to float: '?'


  CIF={'Cu': 0.5, 'Ga': 0.5835, 'In': 0.5835, 'Se': 2.0}
  PMG={'In': 2.3320000000000007, 'Cu': 2.012, 'Ga': 2.332, 'Se': 8.0}
  ratios={'Ga': 3.9965724078834617, 'Se': 4.0, 'In': 3.996572407883463, 'Cu': 4.024}
  CIF={'Au': 1.0, 'In': 3.0, 'Sr': 1.0}
  PMG={'Sr': 2.0, 'In': 6.024000000000001, 'Au': 1.976}
  ratios={'Sr': 2.0, 'In': 2.0080000000000005, 'Au': 1.976}


In [12]:
import pandas as pd

def clean_compound_name(name):
    # Remove spaces and parentheses
    return name.replace(" ", "").replace("(", "").replace(")", "")

def normalize_label(label):
    # Treat 'yes*' as 'yes' and 'no*' as 'no'
    return label.lower().replace("*", "")

def compare_compounds_and_count(excel_file, sheet_name, full_list):
    # Read the Excel file, assuming labels are in the first column and compound names are in the second column, starting from the fourth row
    df = pd.read_excel(excel_file, sheet_name=sheet_name, usecols=[0, 1], skiprows=3, header=None)
    
    # Clean and normalize the data
    df[0] = df[0].apply(normalize_label)
    df[1] = df[1].apply(clean_compound_name)
    
    # Convert the data into a dictionary, with compound names as keys and labels (yes/no) as values
    excel_compounds = dict(zip(df[1], df[0]))
    
    # Compare and generate a list of matched compounds and space groups, while counting the number of 'yes' and 'no'
    matched_compounds = [(comp, sg, excel_compounds[clean_compound_name(comp)]) for comp, sg in full_list if clean_compound_name(comp) in excel_compounds]
    
    yes_count = sum(1 for _, _, label in matched_compounds if label == 'yes')
    no_count = sum(1 for _, _, label in matched_compounds if label == 'no')
    
    return matched_compounds, yes_count, no_count

# Run the comparison and get the results
excel_file = '/Users/yanjunliu/Downloads/code_record/TSM/CorrectedData2023.xls'  # Modify with your Excel file path
sheet_name = 'Sheet1'  # Modify with your Excel sheet name
matched_compounds, yes_count, no_count = compare_compounds_and_count(excel_file, sheet_name, full_list)

# Print the matched compounds, space groups, and their labels
print("Matched Compounds, Space Groups, and Labels:")
for comp, sg, label in matched_compounds:
    print(f"Compound: {comp}, Space Group: {sg}, Label: {label}")

# Print the statistics
print(f"\nNumber of 'yes' compounds: {yes_count}")
print(f"Number of 'no' compounds: {no_count}")


Matched Compounds, Space Groups, and Labels:
Compound: ZrCuSiAs, Space Group: P4/nmm, Label: yes
Compound: NaCuTe, Space Group: P4/nmm, Label: no
Compound: PuS2, Space Group: P4/nmm, Label: yes
Compound: NaMnP, Space Group: P4/nmm, Label: no
Compound: GdTiSi, Space Group: P4/nmm, Label: no
Compound: DyScSb, Space Group: P4/nmm, Label: no
Compound: Ni2.86Te2, Space Group: P4/nmm, Label: no
Compound: HfSiTe, Space Group: P4/nmm, Label: yes
Compound: CsMnAs, Space Group: P4/nmm, Label: no
Compound: SmTiGe, Space Group: P4/nmm, Label: no
Compound: ThSbTe, Space Group: P4/nmm, Label: yes
Compound: Sr2CrFeAsO3, Space Group: P4/nmm, Label: no
Compound: PrCoGe, Space Group: P4/nmm, Label: no
Compound: UAsSe, Space Group: P4/nmm, Label: yes
Compound: KCoO2, Space Group: P4/nmm, Label: no
Compound: PrMnSi, Space Group: P4/nmm, Label: no
Compound: DyFeSi, Space Group: P4/nmm, Label: no
Compound: NaAlGe, Space Group: P4/nmm, Label: no
Compound: TmSe2, Space Group: P4/nmm, Label: yes
Compound: ZrSi

In [14]:
import numpy as np
aaa=[[1,-1],[2,-2]]
print(np.abs(aaa))

[[1 1]
 [2 2]]
