In [2]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import timeit
from pymatgen.io.cif import CifParser
# Bond Features
from matminer.featurizers.site import GaussianSymmFunc, SiteElementalProperty,AGNIFingerprints

# Structure Features
from matminer.featurizers.structure.bonding import GlobalInstabilityIndex, StructuralHeterogeneity
from matminer.featurizers.structure.composite import JarvisCFID
from matminer.featurizers.structure.order import StructuralComplexity, MaximumPackingEfficiency


# Cutoff radius in angstroms for the neighbor search in featurize_bonds;
# site pairs found within this distance are treated as bonded.
BOND_MAX_DIST = 2.85  # Max distance for a bond in angstroms

In [3]:
def featurize_bonds(cifs: list, verbose=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Featurize every bond of the given crystal structures.

    Each site is described with elemental, AGNI-fingerprint and Gaussian
    symmetry-function descriptors. Site pairs found within ``BOND_MAX_DIST``
    are treated as bonds, and each bond becomes one row that holds the
    features of both sites (heavier atom stored as ``_atom1``), the difference
    of selected site properties, the coordination number of the bond's
    lower-index site, the bond length, and the bond's share of the summed bond
    length of its structure (stored as ``volume_fraction``).

    :param cifs: list of paths to crystal structures in CIF format
    :param verbose: print each step of the processing
    :param saveto: output path prefix; rows are written to ``saveto + "_bond.csv"``
        and appended without a header when that file already exists
    :return: DataFrame with one row per bond, indexed by
        ``<cif file name>_Atom<site index>_Bond<bond number>``
    """

    ## Process Input Files
    if verbose: print("Parsing CIFs")
    structures = {}  # cif file name -> parsed structure and provenance, in input order
    for cif in cifs:
        cif_path = Path(cif)
        # NOTE(review): get_structures() is called with its defaults; in pymatgen this
        # presumably means primitive=True, which may reduce a supercell CIF to a
        # primitive cell - confirm this is intended for "*_super.cif" inputs.
        structure = CifParser(cif).get_structures()[0]
        structures[cif_path.name] = {
            "structure": structure,
            "structure_name": cif_path.name,
            "structure_path": str(cif_path.parent),
        }

    ### SITE PROPERTIES ###
    # These will be paired as features
    ## 1. Initialize the dictionary for each site
    if verbose: print("Assembling site property dictionary")
    site_features = {}
    for name, entry in structures.items():
        for atomidx in range(entry["structure"].num_sites):
            site_features["%s_%i" % (name, atomidx)] = {
                "structure_name": entry["structure_name"],
                "structure_path": entry["structure_path"],
            }

    ## Loop through Site featurizers
    property_list = ("Number", "AtomicWeight", "Row", "Column", "Electronegativity", "CovalentRadius")  # For SiteElementalProperty function
    site_feature_functions = [SiteElementalProperty(properties=property_list), AGNIFingerprints(cutoff=5, directions=[None]), GaussianSymmFunc(cutoff=5)]

    for featurizer in site_feature_functions:
        if verbose: print("Using: ", featurizer)
        colnames = featurizer.feature_labels()  # public equivalent of the private label helper
        for name, entry in structures.items():
            structure = entry["structure"]
            if verbose: print(name)
            for atomidx in range(structure.num_sites):
                feat = featurizer.featurize(structure, idx=atomidx)
                site_features["%s_%i" % (name, atomidx)].update(dict(zip(colnames, feat)))

    ### BOND PAIRS AND BOND PROPERTIES ###
    if verbose: print("Generating bond library")
    structures_bonds = {}  # Store bond pairs
    bond_properties = {}  # Store bond properties
    for name, entry in structures.items():
        if verbose: print(name)
        # (center_indices, points_indices, offset_vectors, distances)
        centers, points, _, distances = entry["structure"].get_neighbor_list(BOND_MAX_DIST)
        centers = np.asarray(centers, dtype=np.int64)
        # Number of neighbors found around each center site, counted once per
        # structure instead of rescanning the whole neighbor list for every bond.
        neighbor_counts = np.bincount(centers)
        pairs = []
        properties = []
        for center, point, distance in zip(centers, points, distances):
            if center < point:  # Don't double count bonds
                pairs.append((center, point))
                # Bond properties (coord-num, bond-len)
                properties.append((int(neighbor_counts[center]), distance))
        structures_bonds[name] = pairs
        bond_properties[name] = properties

    # Build Dataframe by bonds
    if verbose: print("Copying over data to final dataframe")
    delta_properties = ["site Electronegativity", "site AtomicWeight"]  # For these properties, take the difference as a feature
    provenance_keys = ("structure_path", "structure_name")
    bond_features = {}  # Final dictionary for saving features format: bond_features['material_bond#']["feature_name"] = data
    for name in structures:
        bond_len_sum = 0
        if verbose: print(name)
        bond_names = []
        for bond_idx, (bond, (coord_num, bond_len)) in enumerate(zip(structures_bonds[name], bond_properties[name])):
            bond_name = "%s_Atom%i_Bond%i" % (name, bond[0], bond_idx)
            bond_names.append(bond_name)
            site1 = site_features["%s_%i" % (name, bond[0])]
            site2 = site_features["%s_%i" % (name, bond[1])]

            # Order putting heavier element first (ties keep the second site as atom1)
            if site1["site AtomicWeight"] > site2["site AtomicWeight"]:
                heavy, light = site1, site2
            else:
                heavy, light = site2, site1

            # Key insertion order is kept stable because rows may be appended
            # to an existing CSV without a header.
            row = {
                "structure_name": site1["structure_name"],
                "structure_path": site1["structure_path"],
            }
            for k in site1:
                if k in provenance_keys:
                    continue
                if k in delta_properties:
                    row[k+"_diff"] = heavy[k] - light[k]
                row[k+"_atom1"] = heavy[k]
                row[k+"_atom2"] = light[k]

            # Insert bond properties
            row["coordination_number"] = coord_num
            row["bond_length"] = bond_len
            bond_features[bond_name] = row
            bond_len_sum += bond_len  # TODO (original author): There's a bug somewhere around here

        # Now add each bond's fraction of the structure's summed bond length
        # (stored under the original "volume_fraction" column name)
        for bond_name, (_, bond_len) in zip(bond_names, bond_properties[name]):
            bond_features[bond_name]["volume_fraction"] = bond_len/bond_len_sum

    ### SAVE FILE
    bond_feat_df = pd.DataFrame.from_dict(bond_features).T
    out_path = saveto+"_bond.csv"
    if bond_feat_df.empty:
        # Nothing to write; creating a header-less file here would corrupt later appends
        pass
    elif os.path.isfile(out_path):  # Append
        bond_feat_df.to_csv(out_path, mode='a', header=False)
    else:  # New file
        bond_feat_df.to_csv(out_path)

    return bond_feat_df


In [11]:
def featurize_structure(cifs: list, verbose=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Compute structure-level descriptors for the given crystal structures.

    Every CIF is parsed and passed through the StructuralComplexity,
    JarvisCFID (with its chemical, RDF, charge, ADF, DDF and nearest-neighbor
    descriptor groups switched off) and MaximumPackingEfficiency featurizers.

    :param cifs: list of paths to crystal structures in CIF format
    :param verbose: print each step of the processing
    :param saveto: output path prefix; rows are written to
        ``saveto + "_structure.csv"`` and appended without a header when that
        file already exists
    :return: DataFrame with one row per structure, indexed by CIF file name
    """
    ## Process Input Files
    if verbose: print("Parsing CIFs")
    structures = {}  # cif file name -> parsed structure and provenance, in input order
    for cif in cifs:
        cif_path = Path(cif)
        # NOTE(review): get_structures() is called with its defaults; in pymatgen this
        # presumably means primitive=True, which may reduce a supercell CIF to a
        # primitive cell - confirm this is intended for "*_super.cif" inputs.
        structure = CifParser(cif).get_structures()[0]
        structures[cif_path.name] = {
            "structure": structure,
            "structure_name": cif_path.name,
            "structure_path": str(cif_path.parent),
        }

    ### STRUCTURE PROPERTIES ###
    ## 1. Initialize the dictionary for each structure
    if verbose: print("Assembling Structure property dictionary")
    structure_features = {
        name: {
            "structure_name": entry["structure_name"],
            "structure_path": entry["structure_path"],
        }
        for name, entry in structures.items()
    }

    ## Structure Featurizers
    structure_feature_functions = [ StructuralComplexity(), JarvisCFID(use_chem=False, use_rdf=False, use_chg=False, use_adf=False, use_ddf=False, use_nn=False), MaximumPackingEfficiency()]
    # TODO: add [StructuralHeterogeneity(stats=('range', 'avg_dev')), MaximumPackingEfficiency()]
    # Column labels do not depend on the structure, so look them up once per featurizer
    labelled_featurizers = [(featurizer, featurizer.feature_labels()) for featurizer in structure_feature_functions]
    for name, entry in structures.items():
        structure = entry["structure"]
        for featurizer, colnames in labelled_featurizers:
            if verbose: print(featurizer)
            feat = featurizer.featurize(structure)
            structure_features[name].update(dict(zip(colnames, feat)))
            # TODO: Structural Complexity only first entry, select certain features from others

    structure_feat_df = pd.DataFrame.from_dict(structure_features).T
    out_path = saveto+"_structure.csv"
    if structure_feat_df.empty:
        # Nothing to write; creating a header-less file here would corrupt later appends
        pass
    elif os.path.isfile(out_path):  # Append
        structure_feat_df.to_csv(out_path, mode='a', header=False)
    else:  # New file
        structure_feat_df.to_csv(out_path)

    return structure_feat_df

In [12]:
def remove_files(filename):
    """Delete the bond and structure feature CSVs left by a previous run.

    :param filename: output path prefix; ``filename + "_bond.csv"`` and
        ``filename + "_structure.csv"`` are removed when they exist as files
    """
    for suffix in ("_bond.csv", "_structure.csv"):
        stale_output = filename + suffix
        if not os.path.isfile(stale_output):
            continue  # nothing from a previous run for this output
        os.remove(stale_output)

## Test Featurizing with a single file

In [13]:
# Start from a clean slate: both featurizers append to their CSV if it already exists.
remove_files('test_feat')
# Featurize one CIF with each featurizer; outputs go to test_feat_bond.csv and test_feat_structure.csv.
bond_test = featurize_bonds(['supercells_data/15284_super.cif'], saveto='test_feat')
struc_test = featurize_structure(['supercells_data/15284_super.cif'], saveto='test_feat')
# Preview the structure-level features.
struc_test.head()

Unnamed: 0,jml_density,jml_log_vpa,jml_pack_frac,jml_vpa,max packing efficiency,structural complexity per atom,structural complexity per cell,structure_name,structure_path
15284_super.cif,11.854676,0.763711,2.47272,6.800585,0.44692,1.370951,6.854753,15284_super.cif,supercells_data


## Featurize all data in target folder in a batch system to manage memory better
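Note: the featurizer functions append to `<filename>_bond.csv` / `<filename>_structure.csv` when those files already exist, so make sure output files from previous runs are deleted before starting a new run.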

In [None]:
# Batching files to reduce memory use
BATCH_SIZE = 5

# Load all CIF files in directory
file_type = "_super.cif"  # Use files with this ending in input_dir
input_dir = "supercells_data/"  # Input data directory
output_dir = "features/"  # Output directory
filename = "features2"  # Output filename for features, no file extension

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# featurize_structure appends to an existing CSV, so remove the stale output of a
# previous run; otherwise every rerun would add duplicate rows.
structure_csv = output_dir+filename+"_structure.csv"
if os.path.isfile(structure_csv):
    os.remove(structure_csv)  # Remove existing file

# Sorted so that batch membership is the same on every run (os.listdir order is arbitrary)
files = sorted(os.listdir(input_dir))
cif_files = [input_dir+file for file in files if file.endswith(file_type)]

# Featurize all structures
n_batches = int(np.ceil(len(cif_files)/BATCH_SIZE))

# Solve in batches to limit memory use
print("{} Batches Total: ".format(n_batches))
for b in range(n_batches):
    print("Starting batch ", b)
    # Define which files are in each batch
    idx_start = int(b*BATCH_SIZE)
    idx_end = int(min((b+1)*BATCH_SIZE, len(cif_files)))
    start = timeit.default_timer()
    # Bond featurization is disabled for this run; if it is re-enabled, a stale
    # <filename>_bond.csv must be removed first for the same reason as above.
    #bond_df = featurize_bonds(cif_files[idx_start:idx_end], saveto=output_dir+filename, verbose=False)
    struc_df = featurize_structure(cif_files[idx_start:idx_end], saveto=output_dir+filename, verbose=False)
    print("Time elapsed: ", timeit.default_timer() - start)

print("Files processed: ", len(cif_files))

109 Batches Total: 
Starting batch  0




Time elapsed:  106.54788049962372
Starting batch  1




Time elapsed:  186.8581594005227
Starting batch  2




Time elapsed:  222.9998145001009
Starting batch  3




Time elapsed:  213.87902210000902
Starting batch  4




Time elapsed:  190.0914074005559
Starting batch  5




Time elapsed:  33.39806959964335
Starting batch  6




Time elapsed:  345.1488674003631
Starting batch  7




Time elapsed:  172.22175259981304
Starting batch  8




Time elapsed:  123.56311890017241
Starting batch  9




Time elapsed:  14.755867999978364
Starting batch  10




Time elapsed:  425.2668192004785
Starting batch  11




Time elapsed:  203.3850723998621
Starting batch  12




Time elapsed:  49.72375490050763
Starting batch  13
Time elapsed:  97.50387770030648
Starting batch  14




Time elapsed:  106.34120159968734
Starting batch  15




Time elapsed:  36.531112999655306
Starting batch  16




Time elapsed:  139.92432100046426
Starting batch  17




Time elapsed:  122.60213569924235
Starting batch  18




Time elapsed:  34.23407819960266
Starting batch  19
Time elapsed:  174.12370989937335
Starting batch  20




Time elapsed:  273.23746789991856
Starting batch  21
Time elapsed:  362.32868899963796
Starting batch  22
Time elapsed:  30.405936500057578
Starting batch  23




Time elapsed:  298.9472751999274
Starting batch  24




Time elapsed:  177.87313399929553
Starting batch  25




Time elapsed:  2.415377600118518
Starting batch  26




Time elapsed:  10.106378600001335
Starting batch  27
Time elapsed:  43.68761119991541
Starting batch  28




Time elapsed:  217.42960129957646
Starting batch  29
Time elapsed:  2.219573499634862
Starting batch  30




Time elapsed:  34.02298180013895
Starting batch  31




Time elapsed:  13.486468899995089
Starting batch  32




Time elapsed:  19.076334999874234
Starting batch  33
Time elapsed:  2.8200999004766345
Starting batch  34




Time elapsed:  15.00035039987415
Starting batch  35




Time elapsed:  4.265710500068963
Starting batch  36




Time elapsed:  296.9421705994755
Starting batch  37




Time elapsed:  51.413422600366175
Starting batch  38




Time elapsed:  38.307588699273765
Starting batch  39




Time elapsed:  41.57796800043434
Starting batch  40




Time elapsed:  27.33416340034455
Starting batch  41




Time elapsed:  67.69070569984615
Starting batch  42




Time elapsed:  23.027568800374866
Starting batch  43




Time elapsed:  95.45575390011072
Starting batch  44




Time elapsed:  90.6120744002983
Starting batch  45




Time elapsed:  22.40599639993161
Starting batch  46




Time elapsed:  64.30059000011533
Starting batch  47




Time elapsed:  17.045019800774753
Starting batch  48
Time elapsed:  4.838813999667764
Starting batch  49




Time elapsed:  71.19688769988716
Starting batch  50




Time elapsed:  45.64385540038347
Starting batch  51




Time elapsed:  108.03221359942108
Starting batch  52




Time elapsed:  460.73208730015904
Starting batch  53




Time elapsed:  243.22957830037922
Starting batch  54




Time elapsed:  84.74351109936833
Starting batch  55




Time elapsed:  226.55841069947928
Starting batch  56




Time elapsed:  50.31738369911909
Starting batch  57




Time elapsed:  91.00799690000713
Starting batch  58




Time elapsed:  147.24600119981915
Starting batch  59
Time elapsed:  101.63020360004157
Starting batch  60




Time elapsed:  153.64846150018275
Starting batch  61




Time elapsed:  59.45085039921105
Starting batch  62




Time elapsed:  136.46887939982116
Starting batch  63




Time elapsed:  84.23466289974749
Starting batch  64




Time elapsed:  234.1550519000739
Starting batch  65




Time elapsed:  76.99335059989244
Starting batch  66




Time elapsed:  91.30404889956117
Starting batch  67




Time elapsed:  134.7625331999734
Starting batch  68




Time elapsed:  109.91594369988889
Starting batch  69




Time elapsed:  78.8962662992999
Starting batch  70




Time elapsed:  132.3246817998588
Starting batch  71




Time elapsed:  308.79970130044967
Starting batch  72




Time elapsed:  177.49884929973632
Starting batch  73




Time elapsed:  170.98918209969997
Starting batch  74




Time elapsed:  294.4138519996777
Starting batch  75




Time elapsed:  0.7878027996048331
Starting batch  76




Time elapsed:  57.775939599610865
Starting batch  77




Time elapsed:  136.3982887994498
Starting batch  78




Time elapsed:  142.1676896000281
Starting batch  79


