In [4]:
import string
from pathlib import Path
import numpy as np
import pandas as pd
import os
import timeit
from typing import List
from pymatgen.io.cif import CifParser
# Old paper features
from matminer.featurizers.base import BaseFeaturizer
from matminer.featurizers.site import GaussianSymmFunc, SiteElementalProperty,AGNIFingerprints
from matminer.utils.data import MagpieData
from pymatgen.analysis.local_env import VoronoiNN
import os
# New Features
from matminer.featurizers.structure.bonding import BagofBonds, BondFractions, GlobalInstabilityIndex, StructuralHeterogeneity, MinimumRelativeDistances
from matminer.featurizers.structure.matrix import CoulombMatrix, SineCoulombMatrix, OrbitalFieldMatrix
from pymatgen.analysis import local_env

In [5]:
def _site_key(structure_key, atomidx: int) -> str:
    """Build the ``site_features`` dictionary key for one atom of one structure."""
    return "%s_%i" % (structure_key, atomidx)


def _add_site_features(featurizer, data: pd.DataFrame, site_features: dict, verbos: bool = False) -> None:
    """Run one site featurizer on every site of every structure, merging results into ``site_features`` in place."""
    # Column labels come from the featurizer's private label helper, exactly as the
    # original per-featurizer loops did.
    colnames = featurizer._generate_column_labels(multiindex=False, return_errors=False)
    for index, row in data.iterrows():
        structure = row["structure"]
        if verbos:
            print(index)
        for atomidx in range(structure.num_sites):
            feat = featurizer.featurize(structure, idx=atomidx)
            site_features[_site_key(index, atomidx)].update(dict(zip(colnames, feat)))


def _save_features(df_features: pd.DataFrame, saveto: str) -> None:
    """Write the feature table to ``saveto``, appending when the file already exists."""
    if os.path.isfile(saveto):
        # Append. The first write below includes the index column, so appended rows
        # must include it too; otherwise every value is shifted one column to the left
        # of its header. Assumes each batch yields the same columns in the same order.
        df_features.to_csv(saveto, mode='a', index=True, header=False)
    else:
        # New file: header plus index column (the bond names).
        df_features.to_csv(saveto)


def featurize_dataset(cifs: list, verbos=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Featurize crystal structures using elemental, geometric, and chemical descriptors for local environments.

    Each row of the result describes one pair of Voronoi-neighbouring sites (a "bond").
    The site features of both atoms are stored with ``_atom1``/``_atom2`` suffixes, where
    ``atom1`` is the site with the larger ``"site AtomicWeight"``; for the properties in
    ``delta_properties`` the ``atom1 - atom2`` difference is stored with a ``_diff`` suffix.

    Side effect: the resulting table is written to ``saveto``. If that file already
    exists the rows are appended without a header, otherwise a new file with a header is
    created. Progress messages are printed to stdout.

    :param cifs: list of paths to crystal structures in CIF format. Structures are keyed
        by file name, so two paths sharing a file name overwrite each other.
    :param verbos: if True, also print each structure key while site features are computed
    :param saveto: filename to save the generated features
    :return: DataFrame indexed by bond name (``<structure>_Atom<i>_Bond<n>``); empty, and
        nothing is written, when ``cifs`` is empty.
    """
    if len(cifs) == 0:
        # Nothing to featurize. Skip the file write: an empty frame would create a
        # header-less CSV that later batches would then append to.
        return pd.DataFrame()

    ## Process Input Files
    features = {}
    for cif in cifs:
        cif_path = Path(cif)
        features[cif_path.name] = {
            "structure": CifParser(cif).get_structures()[0],
            "structure_name": cif_path.name,
            "structure_path": str(cif_path.parent),
        }
    data = pd.DataFrame.from_dict(features).T

    ### SITE PROPERTIES ###
    # These will be paired as features
    ## 0. Initialize the dictionary for each site
    print("Assembling site property dictionary")
    site_features = {}
    for index, row in data.iterrows():
        for atomidx in range(row["structure"].num_sites):
            site_features[_site_key(index, atomidx)] = {
                "structure_name": row["structure_name"],
                "structure_path": row["structure_path"],
            }

    ## 1. Site Elemental Property
    print("site elemental properties")
    property_list = ("Number", "AtomicWeight", "Row", "Column", "Electronegativity", "CovalentRadius")
    _add_site_features(SiteElementalProperty(properties=property_list), data, site_features, verbos)

    ## 2. AGNI
    print("AGNI")
    _add_site_features(AGNIFingerprints(cutoff=5, directions=[None]), data, site_features, verbos)

    ## 3. Gaussian Symmetry Functions
    print("GSF")
    _add_site_features(GaussianSymmFunc(cutoff=5), data, site_features, verbos)

    ### BOND PAIRS AND BOND PROPERTIES ###
    print("Generating bond library")
    # For each structure key: a list of (i, j) site-index tuples with j > i, and a
    # parallel list of (n_verts, face_dist) tuples taken from the Voronoi 'poly_info'.
    structures_bonds = {}  # Store bond pairs
    bond_properties = {}  # Store bond properties
    voronoi = VoronoiNN()  # one finder reused for every structure
    for index, row in data.iterrows():
        print(index)
        structure = row["structure"]
        pairs = []
        props = []
        for atomidx in range(structure.num_sites):
            # NOTE(review): this queries the neighbour finder once per site; a single
            # whole-structure query may be considerably faster - confirm it yields
            # identical neighbours before switching.
            for info in voronoi.get_nn_info(structure, atomidx):
                # Keep only j > i so that each index pair is recorded from one side only.
                if info['site_index'] > atomidx:
                    pairs.append((atomidx, info['site_index']))
                    props.append((info['poly_info']['n_verts'], info['poly_info']['face_dist']))
        structures_bonds[index] = pairs
        bond_properties[index] = props

    # Build Dataframe by bonds
    print("Copying over data to final dataframe")
    delta_properties = ["site Electronegativity", "site AtomicWeight"]  # For these properties, take the difference as a feature
    meta_keys = ("structure_path", "structure_name")
    bond_features = {}  # Final dictionary, format: bond_features['material_bond#']["feature_name"] = data
    for index in data.index:
        print(index)
        bond_len_sum = 0
        named_lengths = []  # (bond_name, bond_len) for the normalisation pass below
        for bond_idx, bond in enumerate(structures_bonds[index]):
            bond_name = "%s_Atom%i_Bond%i" % (index, bond[0], bond_idx)
            site1 = site_features[_site_key(index, bond[0])]
            site2 = site_features[_site_key(index, bond[1])]

            # Order the pair with the heavier element first (ties put site2 first).
            if site1["site AtomicWeight"] > site2["site AtomicWeight"]:
                heavy, light = site1, site2
            else:
                heavy, light = site2, site1

            entry = {
                "structure_name": site1["structure_name"],
                "structure_path": site1["structure_path"],
            }
            for k in site1:
                if k in meta_keys:
                    continue
                if k in delta_properties:
                    entry[k+"_diff"] = heavy[k] - light[k]
                entry[k+"_atom1"] = heavy[k]
                entry[k+"_atom2"] = light[k]

            # Insert bond properties
            # NOTE(review): these are 'n_verts' and 'face_dist' of the shared Voronoi
            # facet, stored under the column names below - confirm they carry the
            # intended meaning (coordination number / bond length).
            coord_num, bond_len = bond_properties[index][bond_idx]
            entry["coordination_number"] = coord_num
            entry["bond_length"] = bond_len
            bond_features[bond_name] = entry

            bond_len_sum += bond_len
            named_lengths.append((bond_name, bond_len))

        # Each bond's share of the summed 'bond_length' values within its structure
        # (stored under the historical column name "volume_fraction").
        for bond_name, bond_len in named_lengths:
            bond_features[bond_name]["volume_fraction"] = bond_len/bond_len_sum

    ### SAVE FILE
    print("Saving to File!")
    df_features = pd.DataFrame.from_dict(bond_features).T
    _save_features(df_features, saveto)

    return df_features


In [3]:
# Batching files to reduce memory use
BATCH_SIZE = 5

# Load all CIF files in directory
file_type = "_super.cif"  # Use files with this ending in input_dir
input_dir = "test_supercell_volume_cubic/"  # Input data directory
output_dir = "features/"
filename = "volume_cubic_features.csv"  # Output filename for features
output_path = os.path.join(output_dir, filename)  # Where featurize_dataset actually writes

# exist_ok avoids the separate isdir check and also creates missing parent directories
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so batches are reproducible
files = sorted(os.listdir(input_dir))
cif_files = [os.path.join(input_dir, file) for file in files if file.endswith(file_type)]

# Featurize all structures
n_batches = int(np.ceil(len(cif_files)/BATCH_SIZE))
# Clean up any previous runs. This must target the real output path: featurize_dataset
# appends to an existing file, so a stale file would otherwise keep growing on re-runs.
if os.path.isfile(output_path):
    os.remove(output_path)

for b in range(n_batches):
    print("Starting batch ", b)
    # Define which files go into this batch
    idx_start = b*BATCH_SIZE
    idx_end = min((b+1)*BATCH_SIZE, len(cif_files))
    # default_timer reads a clock; timeit.timeit() with no arguments would instead
    # benchmark an empty statement and return that duration.
    start = timeit.default_timer()
    data_frame = featurize_dataset(cif_files[idx_start:idx_end], saveto=output_path)
    end = timeit.default_timer()
    print("Time elapsed: ", end - start)

print("Files processed: ", len(cif_files))

Starting batch  0




Assembling site property dictionary
site elemental properties
AGNI
GSF
Generating bond library
10682_super.cif
12734_super.cif
127779_super.cif
130688_super.cif


KeyboardInterrupt: 



Assembling site property dictionary
site elemental properties
AGNI
GSF
Generating bond library
10682_super.cif
12734_super.cif
127779_super.cif
130688_super.cif


KeyboardInterrupt: 

In [3]:
   ## 1. Bag of Bonds
# NOTE: this cell contains no executable logic. Both experiments below are disabled by
# wrapping them in bare triple-quoted string literals, so running the cell does nothing.
# The snippets refer to `data`, `verbos`, `site_features` and `property_list`, which in
# the visible notebook exist only as locals inside featurize_dataset, so they would have
# to be moved into that function (or given those objects) before they could run.
# First snippet: fits BagofBonds per structure and prints the resulting bag, then calls
# BondFractions.fit_transform per structure without storing the result.
"""
print("bag of bonds")
BB = BagofBonds()
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    BB.fit([structure])
    feat = BB.bag(structure)
    print(feat)
    site = list(feat.keys())
    print(site[0])
    print(structure[site])

print("bond fraction")
BF = BondFractions()
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    feat = BF.fit_transform([structure])
    #print(feat)
    #print(BF.feature_labels())
"""
## 5. site difference stats 
# Second snippet: same per-site loop shape as the site featurizers in featurize_dataset,
# merging each site's features into `site_features`.
# NOTE(review): `LocalPropertyStatsNew` is neither imported nor defined in the visible
# cells - confirm where it comes from before re-enabling this.
"""
print("LPD")
LPD = LocalPropertyStatsNew(properties=property_list)
colnames = LPD._generate_column_labels(multiindex=False, return_errors=False)
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    for atomidx in range(structure.num_sites):
        feat = LPD.featurize(structure, idx=atomidx)
        site_name = "%s_%i" % (index, atomidx)
        site_features[site_name].update(dict(zip(colnames, feat)))
"""
    