In [3]:
import pandas as pd
import numpy as np

## Read database

In [4]:
df = pd.read_csv('datasets/MP_db_conv_unit_cell_CA.csv')
df

Unnamed: 0,id,Formule,BG,IM,CA,a,b,c,alpha,beta,gamma
0,mp-19,Te,0.1856,False,1.073863e+06,4.601353,4.601353,5.900062,90.0,90.0,120.0
1,mp-149,Si,0.6105,False,1.133154e+05,5.443702,5.443702,5.443702,90.0,90.0,90.0
2,mp-239,BaS3,1.3913,False,6.254652e+04,6.951955,6.951955,4.216011,90.0,90.0,90.0
3,mp-241,CdF2,2.8977,False,1.582357e+03,5.401038,5.401038,5.401038,90.0,90.0,90.0
4,mp-252,BeTe,2.0173,False,1.010930e+04,5.662918,5.662918,5.662918,90.0,90.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...
935,mp-999472,NaLaSe2,2.2767,False,3.138570e+04,4.373467,4.373467,20.620553,90.0,90.0,120.0
936,mp-999474,NaHoSe2,1.8867,False,2.690055e+04,4.098760,4.098760,20.778108,90.0,90.0,120.0
937,mp-999488,NaDySe2,1.8635,False,3.031457e+04,4.116628,4.116628,20.748485,90.0,90.0,120.0
938,mp-999489,NaGdSe2,1.3585,False,1.219142e+05,4.170708,4.170708,20.737036,90.0,90.0,120.0


## Parsing function

In [5]:
def parse_chemical_formula(formula):
    """
    Parse a chemical formula and return a dictionary with elements as keys
    and the number of atoms as values.
    
    Examples:
    - H2O -> {'H': 2, 'O': 1}
    - K4[ON(SO3)2]2 -> {'K': 4, 'O': 14, 'N': 2, 'S': 4}
    """
    
    def parse_formula(formula, start_idx, multiplier):
        """
        Recursive helper function to parse a chemical formula.
        Returns a dictionary of elements and their counts, and the index where parsing ended.
        """
        element_counts = {}
        i = start_idx
        
        while i < len(formula):
            char = formula[i]
            
            # Handle closing brackets
            if char in ')}]':
                i += 1
                # Check if there's a number after the closing bracket
                num_str = ""
                while i < len(formula) and formula[i].isdigit():
                    num_str += formula[i]
                    i += 1
                    
                # If there's no number after the bracket, default to 1
                num = int(num_str) if num_str else 1
                return element_counts, i, num
            
            # Handle opening brackets (nested formula)
            elif char in '({[':
                # Parse the nested formula recursively
                nested_counts, new_i, nested_multiplier = parse_formula(formula, i + 1, multiplier)
                
                # Add nested counts to the current counts
                for element, count in nested_counts.items():
                    element_counts[element] = element_counts.get(element, 0) + count * nested_multiplier
                
                i = new_i
            
            # Handle element (starts with uppercase letter)
            elif char.isupper():
                # Get the element symbol (uppercase followed by lowercase letters)
                element = char
                i += 1
                while i < len(formula) and formula[i].islower():
                    element += formula[i]
                    i += 1
                
                # Get the number after the element
                num_str = ""
                while i < len(formula) and formula[i].isdigit():
                    num_str += formula[i]
                    i += 1
                
                # If there's no number after the element, default to 1
                num = int(num_str) if num_str else 1
                
                # Add to element counts
                element_counts[element] = element_counts.get(element, 0) + num * multiplier
            
            else:
                # Skip other characters
                i += 1
        
        return element_counts, i, 1
    
    # Start parsing the formula
    result, _, _ = parse_formula(formula, 0, 1)
    return result

# Test cases
test_formulas = [
    "H2O",
    "NaCl",
    "C6H12O6",
    "Ca(OH)2",
    "K4[ON(SO3)2]2",
    "Fe2(SO4)3"
]

# Run test cases
for formula in test_formulas:
    result = parse_chemical_formula(formula)
    print(f"{formula} -> {result}")

# Example usage as a function
if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1:
        formula = sys.argv[1]
        result = parse_chemical_formula(formula)
        print(result)
    else:
        print("Please provide a chemical formula as an argument.")
        print("Example: python chemical_parser.py H2O")

H2O -> {'H': 2, 'O': 1}
NaCl -> {'Na': 1, 'Cl': 1}
C6H12O6 -> {'C': 6, 'H': 12, 'O': 6}
Ca(OH)2 -> {'Ca': 1, 'O': 2, 'H': 2}
K4[ON(SO3)2]2 -> {'K': 4, 'O': 14, 'N': 2, 'S': 4}
Fe2(SO4)3 -> {'Fe': 2, 'S': 3, 'O': 12}
{'Users': 1, 'Andre': 1, 'App': 1, 'Data': 1, 'Roaming': 1}


### Testing this function

In [6]:
formula = 'NaHoSe2'
parse_chemical_formula(formula)

{'Na': 1, 'Ho': 1, 'Se': 2}

## Deleting rare earths

In [7]:
rare_earths = ['Sc','Y','La','Ce','Pr','Nd',
               'Pm','Sm','Eu','Gd','Tb','Dy',
               'Ho','Er','Tm','Yb','Lu']

#Function to check if a specific
#row contains a rare earth
def contains_rare_earth(row):
    ans = False
    elements = parse_chemical_formula(row['Formule']).keys()

    for elem in elements:
        if elem in rare_earths:
            ans = True

    return ans

In [8]:
index = np.random.randint(
    low=0,
    high=940
)
row = df.loc[index]
print(f'Contains rare earths?') 
print(contains_rare_earth(row))
print()
print(row)
#rare_earths = ['Sc','Y','La','Ce','Pr','Nd',
#              'Pm','Sm','Eu','Gd','Tb','Dy',
#              'Ho','Er','Tm','Yb','Lu']

Contains rare earths?
False

id           mp-1078419
Formule         Na3PSe4
BG               1.1169
IM                False
CA         36186.588813
a              7.274003
b              7.274003
c              7.274003
alpha              90.0
beta               90.0
gamma              90.0
Name: 601, dtype: object


In [9]:
df['contains_rare_earth'] = df.apply(contains_rare_earth,axis=1)

In [10]:
df['contains_rare_earth'].value_counts()

contains_rare_earth
False    658
True     282
Name: count, dtype: int64

## Is element column

In [11]:
#Function to check if a specific
#row is an element
def is_element(row):
    ans = False
    elements = list(parse_chemical_formula(row['Formule']).items())

    if len(elements) == 1 and elements[0][1]==1:
        ans = True

    return ans

Testing this function.

In [12]:
dummy_element = {
    'Formule':'H2O'
}

print(is_element(dummy_element))

False


Applying this function to the dataframe

In [13]:
df['is_element'] = df.apply(is_element,axis=1)
df

Unnamed: 0,id,Formule,BG,IM,CA,a,b,c,alpha,beta,gamma,contains_rare_earth,is_element
0,mp-19,Te,0.1856,False,1.073863e+06,4.601353,4.601353,5.900062,90.0,90.0,120.0,False,True
1,mp-149,Si,0.6105,False,1.133154e+05,5.443702,5.443702,5.443702,90.0,90.0,90.0,False,True
2,mp-239,BaS3,1.3913,False,6.254652e+04,6.951955,6.951955,4.216011,90.0,90.0,90.0,False,False
3,mp-241,CdF2,2.8977,False,1.582357e+03,5.401038,5.401038,5.401038,90.0,90.0,90.0,False,False
4,mp-252,BeTe,2.0173,False,1.010930e+04,5.662918,5.662918,5.662918,90.0,90.0,90.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,mp-999472,NaLaSe2,2.2767,False,3.138570e+04,4.373467,4.373467,20.620553,90.0,90.0,120.0,True,False
936,mp-999474,NaHoSe2,1.8867,False,2.690055e+04,4.098760,4.098760,20.778108,90.0,90.0,120.0,True,False
937,mp-999488,NaDySe2,1.8635,False,3.031457e+04,4.116628,4.116628,20.748485,90.0,90.0,120.0,True,False
938,mp-999489,NaGdSe2,1.3585,False,1.219142e+05,4.170708,4.170708,20.737036,90.0,90.0,120.0,True,False


## Number of elements column

In [14]:
def num_of_elements(row):
    elements = parse_chemical_formula(row['Formule']).keys()

    num_elem = len(elements)

    return num_elem

In [15]:
df['num_of_elements'] = df.apply(num_of_elements,axis=1)

## Resulting dataframe

In [16]:
df

Unnamed: 0,id,Formule,BG,IM,CA,a,b,c,alpha,beta,gamma,contains_rare_earth,is_element,num_of_elements
0,mp-19,Te,0.1856,False,1.073863e+06,4.601353,4.601353,5.900062,90.0,90.0,120.0,False,True,1
1,mp-149,Si,0.6105,False,1.133154e+05,5.443702,5.443702,5.443702,90.0,90.0,90.0,False,True,1
2,mp-239,BaS3,1.3913,False,6.254652e+04,6.951955,6.951955,4.216011,90.0,90.0,90.0,False,False,2
3,mp-241,CdF2,2.8977,False,1.582357e+03,5.401038,5.401038,5.401038,90.0,90.0,90.0,False,False,2
4,mp-252,BeTe,2.0173,False,1.010930e+04,5.662918,5.662918,5.662918,90.0,90.0,90.0,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,mp-999472,NaLaSe2,2.2767,False,3.138570e+04,4.373467,4.373467,20.620553,90.0,90.0,120.0,True,False,3
936,mp-999474,NaHoSe2,1.8867,False,2.690055e+04,4.098760,4.098760,20.778108,90.0,90.0,120.0,True,False,3
937,mp-999488,NaDySe2,1.8635,False,3.031457e+04,4.116628,4.116628,20.748485,90.0,90.0,120.0,True,False,3
938,mp-999489,NaGdSe2,1.3585,False,1.219142e+05,4.170708,4.170708,20.737036,90.0,90.0,120.0,True,False,3


## Distribution of element count

In [17]:
df['num_of_elements'].value_counts()

num_of_elements
3    690
2    168
4     72
1      7
5      3
Name: count, dtype: int64

## Distribution of single elements

In [18]:
df['is_element'].value_counts() #They are Te, Si and C

is_element
False    937
True       3
Name: count, dtype: int64

## Distribution of rare earths

In [19]:
df['contains_rare_earth'].value_counts()

contains_rare_earth
False    658
True     282
Name: count, dtype: int64

## Dropping single elements and rare earths

The resulting dataframe has 655 entries, which is consistent with the total of entries without rare earths (658) minus the single elements (3).

In [20]:
resulting_df = df[ (df['contains_rare_earth']==False) & (df['is_element']==False) ]
resulting_df

Unnamed: 0,id,Formule,BG,IM,CA,a,b,c,alpha,beta,gamma,contains_rare_earth,is_element,num_of_elements
2,mp-239,BaS3,1.3913,False,62546.518850,6.951955,6.951955,4.216011,90.0,90.000000,90.0,False,False,2
3,mp-241,CdF2,2.8977,False,1582.356544,5.401038,5.401038,5.401038,90.0,90.000000,90.0,False,False,2
4,mp-252,BeTe,2.0173,False,10109.303511,5.662918,5.662918,5.662918,90.0,90.000000,90.0,False,False,2
5,mp-315,PbF2,4.3938,False,2088.183714,5.915897,5.915897,5.915897,90.0,90.000000,90.0,False,False,2
6,mp-361,Cu2O,0.5127,False,241827.782769,4.246699,4.246699,4.246699,90.0,90.000000,90.0,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,mp-998552,BaHfO3,3.5394,False,2506.196783,4.169567,4.169567,4.170377,90.0,90.000000,90.0,False,False,3
912,mp-998739,MgTlF3,4.2135,False,2006.880778,5.763086,5.766610,4.077045,90.0,90.000000,90.0,False,False,3
913,mp-998745,TlZnF3,3.9179,False,2366.630022,4.117342,4.115025,4.124462,90.0,90.035864,90.0,False,False,3
914,mp-998761,TlNiF3,3.4347,False,14303.261275,4.087358,4.087358,4.087358,90.0,90.000000,90.0,False,False,3


## Exporting to csv file

In [21]:
resulting_df.to_csv('datasets/MPDbConvCellCANoRareEarthsNoElem.csv',index=False)