In [32]:
from matminer.datasets import load_dataset
from matminer.featurizers.composition import ElementProperty
from pymatgen.core import Composition
import matminer.datasets
import pandas as pd
from pymatgen.ext.matproj import MPRester
from ase import Atoms
from pymatgen.io.ase import AseAtomsAdaptor
from dscribe.descriptors import SOAP
import numpy as np
import re


## Band Gap energy
Load in the dft and experimental data. Convert dft compositional data to formula to match the experimental formula.

In [2]:
# load experimental data
df_exp = load_dataset('expt_gap_kingsbury')
df_exp = df_exp.dropna(subset=['likely_mpid'])
print(df_exp.shape)
display(df_exp.head())

# load DFT data
df_dft = load_dataset('jarvis_dft_3d')
df_dft = df_dft.drop(columns=['epsilon_x opt', 'epsilon_y opt', 'epsilon_z opt', 'shear modulus', 'bulk modulus', 'jid', 'epsilon_x tbmbj', 'epsilon_y tbmbj', 'epsilon_z tbmbj'])
df_dft['formula'] = df_dft['composition'].apply(lambda x: x.reduced_formula) # convert composition to formula
df_dft = df_dft.dropna()
print(df_dft.shape)
display(df_dft.head())

(2481, 3)


Unnamed: 0,likely_mpid,expt_gap,formula
1,mp-29717,0.0,Ag(W3Br7)2
5,mp-23558,0.0,Ag2BiO3
6,mp-9900,1.98,Ag2GeS3
8,mp-23485,2.47,Ag2HgI4
9,mp-1203404,3.06,Ag2Mo(I2O7)2




(7349, 8)


Unnamed: 0,structure,e_form,structure initial,gap tbmbj,mpid,gap opt,composition,formula
59,"[[ 8.9342299 3.79961599 -6.97400918] Mg, [ 3...",-2.896,"[[ 8.9825888 3.8061472 -7.01147537] Mg, [ 3...",7.8043,mp-6596,5.2711,"(Mg, Al, P, O)",MgAlPO5
68,"[[1.25762093 4.13616104 2.44785671] K, [3.7426...",-2.657,"[[1.26002845 4.20573168 2.5197491 ] K, [3.7544...",8.3622,mp-7324,5.6141,"(K, Be, P, O)",KBePO4
75,"[[3.1506963 4.30596127 3.56982179] As, [0.384...",0.025,"[[3.17888147 4.36479751 3.69558323] As, [0.381...",0.6495,mp-158,0.0221,(As),As
80,"[[0.37213617 3.67987527 0.87369073] H, [1.4889...",-1.184,"[[0.34436223 3.69323659 0.87470146] H, [1.4990...",0.0083,mp-24242,0.0038,"(H, O, F, Cu)",CuHOF
83,"[[ 0.92796672 1.82915421 10.16747299] O, [2.7...",-2.086,"[[ 0.938753 1.84002706 10.93783201] O, [2.8...",2.4642,mp-510584,1.9202,"(O, Mo)",MoO3


Create a dataframe of dft data for MP-ID's that are in both the experimental and dft datasets.

In [None]:
df_dft_filtered = df_dft[df_dft['mpid'].isin(df_exp['likely_mpid'])].reset_index(drop=True)
print(df_dft_filtered.shape)

(329, 8)


## Extract the total number of species in the dataset
This is necessary because our vector size for the algorithm must be the same shape. This ensures that the SOAP vector (see below) is of equal shape. Will will feed in all the species in a dataset. This will generate a lot of entries that are zero. This should be good for ML since its an additional source of information.

In [37]:
species_regex = r'[A-Z][a-z]*'  # Matches capital letter followed by lowercase letters

# Function to extract species from a single formula
def extract_species(formula):
    return set(re.findall(species_regex, formula))

# Function to extract species from a formula
def extract_species_from_df_column(df_column):
    # Apply the function to the dataframe column and get species
    all_species = set()
    species = df_column.apply(extract_species)
    # Collect all species into a single set
    for species_list in species:
        all_species.update(species_list)
    return list(all_species)

dft_unique_species = extract_species_from_df_column(df_dft['formula'])
print('Unique species in DFT data:', dft_unique_species)

exp_unique_species = extract_species_from_df_column(df_exp['formula'])
print('Unique species in experimental data:', exp_unique_species)

Unique species in DFT data: ['F', 'Pb', 'Mn', 'H', 'As', 'Hg', 'Sr', 'Ru', 'P', 'Ni', 'Tc', 'Ar', 'Co', 'Sc', 'B', 'Si', 'Yb', 'Ba', 'W', 'I', 'O', 'N', 'Zn', 'Nd', 'Sm', 'Lu', 'Cr', 'Ga', 'Na', 'Ta', 'S', 'Te', 'Pt', 'Mo', 'Pd', 'Ag', 'Pr', 'Tb', 'Pa', 'Re', 'Ho', 'Tm', 'Cs', 'Ne', 'Os', 'Ir', 'C', 'V', 'Sn', 'U', 'Sb', 'Rb', 'Y', 'Hf', 'Li', 'K', 'Cd', 'Th', 'Ti', 'Be', 'Mg', 'Ge', 'Rh', 'Nb', 'Fe', 'Zr', 'In', 'Kr', 'Tl', 'Cu', 'Se', 'Dy', 'Bi', 'Cl', 'Ca', 'Ce', 'He', 'Er', 'Al', 'Au', 'Br', 'La']
Unique species in experimental data: ['F', 'Pb', 'Mn', 'H', 'Hg', 'As', 'Sr', 'Ru', 'P', 'Ni', 'Tc', 'Co', 'Sc', 'B', 'Si', 'Yb', 'Xe', 'Ba', 'W', 'I', 'O', 'N', 'Zn', 'Nd', 'Sm', 'Lu', 'Cr', 'Ga', 'Na', 'Gd', 'Ta', 'S', 'Te', 'Pt', 'Mo', 'Pd', 'Ag', 'Pr', 'Tb', 'Re', 'Ho', 'Tm', 'Cs', 'Os', 'Ir', 'C', 'Sn', 'V', 'U', 'Sb', 'Rb', 'Y', 'Hf', 'Li', 'K', 'Cd', 'Th', 'Ti', 'Be', 'Mg', 'Eu', 'Ge', 'Rh', 'Fe', 'Nb', 'Zr', 'In', 'Tl', 'Cu', 'Se', 'Dy', 'Bi', 'Cl', 'Ca', 'Ce', 'Er', 'Al', 'Au', '

## Generate the SOAP descriptor

The **SOAP (Smooth Overlap of Atomic Positions)** descriptor is a powerful tool used in computational materials science to represent the local atomic environment of a material. It encodes information about the local symmetry and structure of atoms in a material, making it suitable for use in machine learning models that predict material properties.

### Key Features:
- **Local Environment Representation**: The SOAP descriptor captures the atomic environment around each atom in a structure, using a smooth overlap of atomic positions.
- **Spherical Harmonics**: The descriptor uses spherical harmonics to represent angular information, allowing it to effectively capture local symmetries.
- **Radial Basis Functions**: The radial part of the descriptor is represented using radial basis functions, which help model the distances between atoms.
- **Species Specific**: The descriptor can incorporate the chemical species present in the structure, allowing it to adapt to different materials.

### How It Works:
1. **Structure Representation**: The local atomic environment of each atom is represented by a combination of radial and angular functions.
2. **Cutoff Radius**: Only atoms within a certain cutoff radius contribute to the descriptor, ensuring computational efficiency.
3. **Smearing**: Gaussian smearing is used to smooth the atomic positions, ensuring a continuous representation.
4. **Periodic Structures**: SOAP can be used to model periodic structures such as crystals, making it suitable for large-scale materials simulations.


In [None]:
API_KEY = "cmeSCicjBiIc0Nk02lArDwjO4V0jyXKw" # get your API key by making a materials project account https://materialsproject.org/


def generate_soap_descriptor(mp_id, species_list, API_KEY=API_KEY):
    # 1. Connect to Materials Project
    mpr = MPRester(API_KEY)
    
    # 2. Fetch structure by MP-ID
    structure_pmg = mpr.get_structure_by_material_id(mp_id)

    # 3. Convert pymatgen structure to ASE Atoms
    ase_structure = AseAtomsAdaptor.get_atoms(structure_pmg)

    # 4. Check if species from provided list matches with the structure
    structure_species = list(set(ase_structure.get_chemical_symbols()))  # Unique elements in the structure
    
    # Ensure that the provided species list is a subset of the species in the structure
    if not all(species in structure_species for species in species_list):
        raise ValueError("The provided species list does not match the species in the structure.")

    # 5. Set up SOAP descriptor
    soap = SOAP(
        species=species_list,   # List of elements present (user-specified)
        periodic=True,          # Is the structure periodic? (crystals = True, molecules = False)
        r_cut=5.0,              # Cutoff radius (Å)
        n_max=8,                # Number of radial basis functions
        l_max=6,                # Maximum degree of spherical harmonics
        sigma=0.5,              # Width (smearing) of Gaussians placed on atoms (Å)
        sparse=False            # Should output be dense NumPy array? (False = full array; True = sparse matrix for memory saving)
    )

    # 6. Create SOAP descriptors
    soap_descriptors = soap.create(ase_structure)

    return soap_descriptors

mpids = df_dft['mpid'].tolist()
soap_descriptor = generate_soap_descriptor(mp_id= mpids[0], species_list=dft_unique_species)

Retrieving MaterialsDoc documents: 100%|██████████| 1/1 [00:00<00:00, 16912.52it/s]


ValueError: The provided species list does not match the species in the structure.