In [None]:
from matminer.datasets import load_dataset
import pandas as pd
from pymatgen.ext.matproj import MPRester
from ase import Atoms
from pymatgen.io.ase import AseAtomsAdaptor
from dscribe.descriptors import SOAP
import numpy as np
import pickle
import gzip
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from pymatgen.core import Composition
from mendeleev import element
from tqdm import tqdm  # optional, for progress bar


## Band Gap energy
Load the DFT and experimental datasets. Convert each DFT composition to its reduced formula so that it can be matched against the formulas in the experimental data.

In [None]:
# Experimental band gaps: keep only the rows that carry a 'likely_mpid' value.
df_exp = load_dataset('expt_gap_kingsbury').dropna(subset=['likely_mpid'])
print(df_exp.shape)
display(df_exp.head())

# DFT data: remove the columns that are not needed, derive a reduced formula
# from each composition object, then discard every row that still has a
# missing value.
unused_dft_columns = [
    'epsilon_x opt', 'epsilon_y opt', 'epsilon_z opt',
    'shear modulus', 'bulk modulus', 'jid',
    'epsilon_x tbmbj', 'epsilon_y tbmbj', 'epsilon_z tbmbj',
]
df_dft = (
    load_dataset('jarvis_dft_3d')
    .drop(columns=unused_dft_columns)
    .assign(formula=lambda frame: frame['composition'].apply(lambda comp: comp.reduced_formula))
    .dropna()
)
print(df_dft.shape)
display(df_dft.head())

Create a DataFrame of DFT data for the MP-IDs that appear in both the experimental and the DFT datasets.

In [None]:
# Rows of the DFT table whose MP-ID also occurs among the experimental 'likely_mpid' values.
has_experiment = df_dft['mpid'].isin(df_exp['likely_mpid'])
df_dft_filtered = df_dft.loc[has_experiment].reset_index(drop=True)
print(df_dft_filtered.shape)

# NOTE: the MP-ID list is taken from the full df_dft table, not from df_dft_filtered.
mpids = df_dft['mpid'].tolist()
print(mpids[:5])

## Generate the SOAP descriptor

The **SOAP (Smooth Overlap of Atomic Positions)** descriptor is a powerful tool used in computational materials science to represent the local atomic environment of a material. It encodes information about the local symmetry and structure of atoms in a material, making it suitable for use in machine learning models that predict material properties.

### Key Features:
- **Local Environment Representation**: The SOAP descriptor captures the atomic environment around each atom in a structure, using a smooth overlap of atomic positions.
- **Spherical Harmonics**: The descriptor uses spherical harmonics to represent angular information, allowing it to effectively capture local symmetries.
- **Radial Basis Functions**: The radial part of the descriptor is represented using radial basis functions, which help model the distances between atoms.
- **Species Specific**: The descriptor can incorporate the chemical species present in the structure, allowing it to adapt to different materials.

### How It Works:
1. **Structure Representation**: The local atomic environment of each atom is represented by a combination of radial and angular functions.
2. **Cutoff Radius**: Only atoms within a certain cutoff radius contribute to the descriptor, ensuring computational efficiency.
3. **Smearing**: Gaussian smearing is used to smooth the atomic positions, ensuring a continuous representation.
4. **Periodic Structures**: SOAP can be used to model periodic structures such as crystals, making it suitable for large-scale materials simulations.


In [None]:

def generate_soap_descriptor(mp_id, API_KEY=None):
    """Fetch a structure from the Materials Project and compute its SOAP descriptor.

    Parameters
    ----------
    mp_id : str
        Materials Project ID of the structure to fetch.
    API_KEY : str, optional
        Materials Project API key. When omitted, a notebook-level variable
        named ``API_KEYcm`` is used if one exists; otherwise ``None`` is
        passed to ``MPRester``, which then falls back to the key configured
        for pymatgen (e.g. the ``PMG_MAPI_KEY`` setting).

    Returns
    -------
    The value returned by ``SOAP.create`` for the fetched structure
    (dense output, since ``sparse=False``).
    """
    # The key is resolved at call time. The previous default referenced the
    # global ``API_KEYcm`` in the signature, which raised NameError as soon as
    # this cell ran on a kernel where that variable had not been defined.
    if API_KEY is None:
        API_KEY = globals().get("API_KEYcm")

    # 1-2. Connect to the Materials Project and fetch the structure by MP-ID.
    #      The context manager closes the client session once the download is done.
    with MPRester(API_KEY) as mpr:
        structure_pmg = mpr.get_structure_by_material_id(mp_id)

    # 3. Convert the pymatgen structure to ASE Atoms
    ase_structure = AseAtomsAdaptor.get_atoms(structure_pmg)

    # 4. Collect the unique elements present in this structure (sorted so the
    #    list passed to SOAP does not depend on set iteration order)
    structure_species = sorted(set(ase_structure.get_chemical_symbols()))

    # 5. Set up the SOAP descriptor for exactly those elements
    soap = SOAP(
        species=structure_species,   # Elements present in this structure
        periodic=True,          # Is the structure periodic? (crystals = True, molecules = False)
        r_cut=5.0,              # Cutoff radius (Å)
        n_max=8,                # Number of radial basis functions
        l_max=6,                # Maximum degree of spherical harmonics
        sigma=0.5,              # Width (smearing) of Gaussians placed on atoms (Å)
        sparse=False            # False = dense NumPy array; True = sparse matrix for memory saving
    )

    # 6. Create SOAP descriptors
    return soap.create(ase_structure)

# Example usage
# soap_descriptor = generate_soap_descriptor(mp_id=mpids[0])

# print(soap_descriptor.shape)



## Calculate Soap Descriptors
The code below generates SOAP descriptors for the MP-IDs in the `df_dft` dataset and collects them in a DataFrame.

In [None]:
# Generate one SOAP descriptor per unique formula.
#
# The final table keeps a single row per formula, so only the first df_dft
# entry of each formula is fetched. This guarantees that 'soap_descriptor'
# and 'mpid' in a row describe the same structure. (Previously the descriptor
# kept was the last one computed for a formula while the mpid kept was the
# first one listed, and every descriptor was also stored under its mpid as if
# the mpid were a formula.) A formula whose first entry fails is skipped.
df_soap_targets = (
    df_dft[['formula', 'mpid']]
    .dropna()
    .drop_duplicates(subset='formula', keep='first')
)

soap_records = []
for formula, mpid in tqdm(
    zip(df_soap_targets['formula'], df_soap_targets['mpid']),
    total=len(df_soap_targets),
    desc='SOAP descriptors',
):
    try:
        soap_descriptor = generate_soap_descriptor(mp_id=mpid)
    except Exception as e:
        # Best effort: report the failure and move on to the next structure.
        print(f"Skipping {mpid} due to error: {e}")
        continue
    soap_records.append(
        {'formula': formula, 'soap_descriptor': soap_descriptor, 'mpid': mpid}
    )

# Explicit column list keeps the expected schema even when nothing succeeded.
df_soap = pd.DataFrame(soap_records, columns=['formula', 'soap_descriptor', 'mpid'])
print(df_soap.shape)
save_path = '/Users/cadenmyers/billingelab/dev/ml4ms_bandgap_final/data/soap_descriptors_raw.pkl.gz'
# # Save as compressed pickle
# with gzip.open(save_path, 'wb') as f:
#     pickle.dump(df_soap, f)

In [None]:
# Preview the descriptor table. A bare expression that is not the last line of
# a cell is not rendered, so display() is called explicitly.
display(df_soap.head(10))

# Attach the 'gap opt' value of the first df_dft entry for each formula.
df_dft_dedup = df_dft.drop_duplicates(subset='formula', keep='first')
df_soap_raw = df_soap.merge(df_dft_dedup[['formula', 'gap opt']], on='formula', how='left')
print(df_soap_raw.shape)
df_soap_raw.head(10)


In [None]:

def pad_or_truncate_descriptor(descriptor, target_shape=(64, 800)):
    """Return a copy of a SOAP descriptor with exactly ``target_shape``.

    Rows and columns beyond ``target_shape`` are cut off; missing rows and
    columns are filled with zeros.

    Parameters
    ----------
    descriptor : array-like or None
        2-D descriptor. A 1-D input is treated as a single column (the same
        convention ``zero_pad_soap_descriptors`` uses). ``None`` yields an
        all-zero array.
    target_shape : tuple of int
        ``(rows, columns)`` of the returned array. Defaults to ``(64, 800)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``target_shape``.

    Raises
    ------
    ValueError
        If ``descriptor`` is neither 1-D nor 2-D.
    """
    padded = np.zeros(target_shape)
    if descriptor is None:
        return padded

    descriptor = np.asarray(descriptor)
    if descriptor.ndim == 1:
        # Previously a 1-D input crashed with IndexError on ``shape[1]``.
        descriptor = descriptor[:, np.newaxis]
    elif descriptor.ndim != 2:
        raise ValueError(
            f"Expected a 1-D or 2-D descriptor, got an array with {descriptor.ndim} dimensions"
        )

    # Size of the region shared by the input and the target.
    rows = min(target_shape[0], descriptor.shape[0])
    cols = min(target_shape[1], descriptor.shape[1])

    padded[:rows, :cols] = descriptor[:rows, :cols]
    return padded

# Padding step (currently disabled): it would apply pad_or_truncate_descriptor
# to the 'soap_descriptor' column, store the result in a new 'padded_soap'
# column and then drop 'soap_descriptor'. While these two lines stay commented
# out, df_soap_raw is left unchanged and has no 'padded_soap' column.
# df_soap_raw['padded_soap'] = df_soap_raw['soap_descriptor'].apply(pad_or_truncate_descriptor)
# df_soap_raw.drop(columns=['soap_descriptor'], inplace=True)
# Show the table's shape and its first ten rows.
print(df_soap_raw.shape)
display(df_soap_raw.head(10))

In [None]:
# # Save the padded descriptors
# save_path = '/Users/cadenmyers/billingelab/dev/ml4ms_bandgap_final/data/padded_soap_descriptors.pkl.gz'
# with gzip.open(save_path, 'wb') as f:
#     pickle.dump(df_soap_raw, f)

In [None]:
def find_largest_shape_indices_and_values(soap_descriptors):
    """
    Locate the descriptors with the most rows and with the most columns.

    Entries that are None or have fewer than two dimensions are ignored.
    When several descriptors share the largest size, the first one wins.

    Parameters:
    soap_descriptors (list or pd.Series): Numpy arrays representing SOAP descriptors.

    Returns:
    tuple: (index_max_x, max_x, index_max_y, max_y); an index is -1 and a
           size is 0 when no usable descriptor was found.
    """
    best_rows, best_cols = 0, 0
    row_position, col_position = -1, -1

    for position, array in enumerate(soap_descriptors):
        # Skip missing entries and anything that is not at least 2-D.
        if array is None or len(array.shape) < 2:
            continue

        n_rows, n_cols = array.shape
        if n_rows > best_rows:
            best_rows, row_position = n_rows, position
        if n_cols > best_cols:
            best_cols, col_position = n_cols, position

    return row_position, best_rows, col_position, best_cols


# Largest row count in df_soap['soap_descriptor'] and the positions of the
# tallest and widest descriptors. The fourth return value (the largest column
# count) is discarded here.
index_max_x, max_x, index_max_y, _ = find_largest_shape_indices_and_values(df_soap['soap_descriptor'])



def zero_pad_soap_descriptors(soap_descriptors, target_shape):
    """
    Bring every SOAP descriptor to a common shape.

    Each descriptor is copied into the top-left corner of a zero array of
    shape ``target_shape``; anything that does not fit is cut off. A 1-D
    descriptor is written into the first column. ``None`` becomes an
    all-zero array.

    Parameters:
    soap_descriptors (list or pd.Series): Numpy arrays representing SOAP descriptors.
    target_shape (tuple): (target_x, target_y) shape of every returned array.

    Returns:
    list: One zero-padded numpy array per input descriptor, in input order.
    """
    def _fit(array):
        canvas = np.zeros(target_shape)
        if array is None:
            return canvas

        n_rows = min(array.shape[0], target_shape[0])
        if len(array.shape) == 1:
            # Vector input occupies column 0 only.
            canvas[:n_rows, 0] = array[:n_rows]
        else:
            n_cols = min(array.shape[1], target_shape[1])
            canvas[:n_rows, :n_cols] = array[:n_rows, :n_cols]
        return canvas

    return [_fit(array) for array in soap_descriptors]

# Limit `max_y` to 800 because there is not much information past 800
# max_y = 800
# print(max_x, max_y)
# padded_descriptors = zero_pad_soap_descriptors(df_soap['soap_descriptor'], (max_x, max_y))

# Get periodic table properties
Get the elemental properties for each formula and save them in a DataFrame.

In [None]:
# Formulas to compute features for.
# NOTE(review): `soap_df` is not assigned in any cell shown above this one;
# presumably it is loaded from one of the saved descriptor pickles — confirm
# and add the loading cell so the notebook runs from a fresh kernel.
formula_list = soap_df['formula'].tolist()

# Attribute names that get_composition_features reads (via getattr) from each
# mendeleev element object.
properties = [
    "electronegativity",
    "atomic_radius",
    "ionenergies",
    "covalent_radius",
    "nvalence",  # number of valence electrons
]


# Aggregation function per formula
def get_composition_features(formula):
    """Aggregate elemental properties over the elements of a chemical formula.

    For every name in the module-level ``properties`` list, the value is read
    from each element of the formula and summarised as:

    * ``<prop>_mean`` - mean weighted by atomic fraction, taken over the
      elements for which the value is available (NaN if it is available for none),
    * ``<prop>_max`` / ``<prop>_min`` - largest / smallest available value,
    * ``<prop>_std`` - unweighted standard deviation of the available values.

    Parameters
    ----------
    formula : str
        Chemical formula understood by ``pymatgen.core.Composition``.

    Returns
    -------
    dict
        ``{"formula": formula, "<prop>_mean": ..., ...}``. If anything goes
        wrong, the error is printed and only ``{"formula": formula}`` is returned.
    """
    try:
        comp = Composition(formula)
        el_amt_dict = comp.get_el_amt_dict()
        total_atoms = comp.num_atoms

        data = {prop: [] for prop in properties}
        fractions = []

        for el, amt in el_amt_dict.items():
            try:
                elem = element(el)
            except Exception:
                # Best effort: an element that cannot be looked up is left out.
                # (`except Exception` instead of a bare `except` so that
                # KeyboardInterrupt still stops a long run.)
                continue

            fractions.append(amt / total_atoms)

            for prop in properties:
                val = getattr(elem, prop)

                # Some attributes are methods rather than plain values.
                if callable(val):
                    val = val()

                # Special case: ionenergies is a dict; use the entry stored
                # under key 1 (presumably the first ionization energy).
                if isinstance(val, dict):
                    val = val.get(1, np.nan)
                elif val is None:
                    val = np.nan

                data[prop].append(val)

        # Compute aggregated features
        fracs = np.array(fractions, dtype=float)
        features = {"formula": formula}
        for prop in properties:
            values = np.array(data[prop], dtype=float)

            # Weighted mean over the elements that actually have a value.
            # The weights are renormalised so that a missing value (or a
            # skipped element) no longer drags the mean toward zero.
            known = ~np.isnan(values)
            weight_total = fracs[known].sum()
            if weight_total > 0:
                weighted_avg = np.sum(values[known] * fracs[known]) / weight_total
            else:
                weighted_avg = np.nan

            max_val = np.nanmax(values)
            min_val = np.nanmin(values)
            std_val = np.nanstd(values)

            features[f"{prop}_mean"] = weighted_avg
            features[f"{prop}_max"] = max_val
            features[f"{prop}_min"] = min_val
            # features[f"{prop}_range"] = max_val - min_val
            features[f"{prop}_std"] = std_val

        return features

    except Exception as e:
        print(f"Failed on {formula}: {e}")
        return {"formula": formula}

# Run for all formulas with progress bar
# feature_list = [get_composition_features(f) for f in tqdm(formula_list)]
# features_df = pd.DataFrame(feature_list)
# print(features_df.shape)
# print(features_df.columns)
# features_df.head()

In [None]:
# Load the elemental-feature table from an Excel file instead of recomputing
# it (the cell that calls get_composition_features is commented out).
# NOTE(review): the path is absolute and machine-specific.
features_df = pd.read_excel('/Users/cadenmyers/billingelab/dev/ml4ms_bandgap_final/data/elemental_features.xlsx')

features_df.head()

In [None]:
# Flatten every padded descriptor into a 1-D vector and keep it in a new column.
flattened_descriptors = soap_df['padded_soap'].apply(lambda descriptor: descriptor.flatten())
soap_df['soap_flat'] = flattened_descriptors
# Shape of the first flattened vector, as a quick sanity check.
print(flattened_descriptors.iloc[0].shape)
soap_df.head()

In [None]:
# Left-join the elemental features onto the descriptor table by formula,
# keeping every row of soap_df.
merged_df = soap_df.merge(features_df, how='left', on='formula')
print(merged_df.shape)
merged_df.head()

In [None]:
# # Save the merged DataFrame as a compressed pickle
# save_path = '/Users/cadenmyers/billingelab/dev/ml4ms_bandgap_final/data/soap_and_atomic_features.pkl.gz'
# with gzip.open(save_path, 'wb') as f:
#     pickle.dump(merged_df, f)