In [1]:
#import string
from pathlib import Path
import numpy as np
import pandas as pd
import sys, os
import timeit
#from typing import List
from pymatgen.io.cif import CifParser
# Old paper features
#from matminer.featurizers.base import BaseFeaturizer
from matminer.featurizers.site import GaussianSymmFunc, SiteElementalProperty,AGNIFingerprints
#from matminer.utils.data import MagpieData
from pymatgen.analysis.local_env import VoronoiNN
import os

BOND_MAX_DIST = 2.6  # Max distance for a bond in angstroms
# New Features
#from matminer.featurizers.structure.bonding import BagofBonds, BondFractions, GlobalInstabilityIndex, StructuralHeterogeneity, MinimumRelativeDistances
#from matminer.featurizers.structure.matrix import CoulombMatrix, SineCoulombMatrix, OrbitalFieldMatrix
#from pymatgen.analysis import local_env


In [8]:
def _add_site_features(featurizer, data: pd.DataFrame, site_features: dict, verbose: bool = False) -> None:
    """Run one site featurizer over every site of every structure and merge the results in place.

    :params featurizer: object exposing ``featurize(structure, idx=...)`` and ``_generate_column_labels``
    :params data: dataframe with one row per structure and a "structure" column
    :params site_features: dict keyed by "<row index>_<site index>"; each entry is updated in place
    :params verbose: print the row index of each structure as it is processed
    """
    # NOTE(review): _generate_column_labels is a private matminer method; the public
    # feature_labels() presumably returns the same labels for these arguments - confirm before switching.
    colnames = featurizer._generate_column_labels(multiindex=False, return_errors=False)
    for index, row in data.iterrows():
        structure = row["structure"]
        if verbose: print(index)
        for atomidx in range(structure.num_sites):
            feat = featurizer.featurize(structure, idx=atomidx)
            site_features["%s_%i" % (index, atomidx)].update(dict(zip(colnames, feat)))


def featurize_dataset(cifs: list, verbose=False, saveto: str = "features.csv") -> pd.DataFrame:
    """Featurize crystal structures using elemental, geometric, and chemical descriptors for local environments.

    Every pair of sites closer than BOND_MAX_DIST becomes one output row ("bond"). A row holds the
    site features of both atoms (the heavier atom is stored as "_atom1"), the difference of selected
    site properties, and a few bond properties.

    :params cifs: list of paths to crystal structures in cif format
    :params verbose: print the steps
    :params saveto: filename to save the generated features; rows are appended without a header
        if the file already exists, otherwise a new file with a header is created
    :return: dataframe with one row per bond, indexed by "<cif name>_Atom<i>_Bond<n>"
    """

    ## Process Input Files
    if verbose: print("Parsing CIFs")
    features = {}
    for cif in cifs:
        cif_path = Path(cif)
        structure_name = cif_path.name  # NOTE: files sharing a name in different folders overwrite each other
        features[structure_name] = {
            "structure": CifParser(cif).get_structures()[0],
            "structure_name": structure_name,
            "structure_path": str(cif_path.parent),
        }
    data = pd.DataFrame.from_dict(features).T

    ### SITE PROPERTIES ###
    # These will be paired as features
    ## 0. Initialize the dictionary for each site
    if verbose: print("Assembling site property dictionary")
    site_features = {}
    for index, row in data.iterrows():
        for atomidx in range(row["structure"].num_sites):
            site_features["%s_%i" % (index, atomidx)] = {
                "structure_name": row["structure_name"],
                "structure_path": row["structure_path"],
            }

    ## 1. Site Elemental Property
    if verbose: print("site elemental properties")
    property_list = ("Number", "AtomicWeight", "Row", "Column", "Electronegativity", "CovalentRadius")
    _add_site_features(SiteElementalProperty(properties=property_list), data, site_features, verbose)

    ## 2. AGNI
    if verbose: print("AGNI")
    _add_site_features(AGNIFingerprints(cutoff=5, directions=[None]), data, site_features, verbose)

    ## 3. Gaussian Symmetry Functions
    if verbose: print("GSF")
    _add_site_features(GaussianSymmFunc(cutoff=5), data, site_features, verbose)

    ### BOND PAIRS AND BOND PROPERTIES ###
    if verbose: print("Generating bond library")
    structures_bonds = {}  # Store bond pairs
    bond_properties = {}  # Store bond properties
    for index, row in data.iterrows():
        if verbose: print(index)
        structure = row["structure"]
        neighbors = structure.get_neighbor_list(BOND_MAX_DIST)  # (center_indices, points_indices, offset_vectors, distances)
        centers, points, distances = neighbors[0], neighbors[1], neighbors[3]

        # Count the neighbors of every center atom once, instead of re-scanning the
        # whole neighbor list for each bond.
        coordination = {}
        for center in centers:
            coordination[int(center)] = coordination.get(int(center), 0) + 1

        structures_bonds[index] = []
        bond_properties[index] = []
        for center, point, distance in zip(centers, points, distances):
            if center < point:  # Don't double count bonds
                # Bonded indices
                structures_bonds[index].append((center, point))
                # Bond properties (coord-num of the lower-index atom, bond-len)
                bond_properties[index].append((coordination[int(center)], distance))

    # Build Dataframe by bonds
    if verbose: print("Copying over data to final dataframe")
    delta_properties = ["site Electronegativity", "site AtomicWeight"]  # For these properties, take the difference as a feature
    meta_keys = ("structure_path", "structure_name")  # Copied once per bond, not per atom
    bond_features = {}  # Final dictionary for saving features format: bond_features['material_bond#']["feature_name"] = data
    for index, row in data.iterrows():
        if verbose: print(index)
        bond_len_sum = 0
        bond_names = []
        for bond_idx, (bond, (coord_num, bond_len)) in enumerate(zip(structures_bonds[index], bond_properties[index])):
            bond_name = "%s_Atom%i_Bond%i" % (index, bond[0], bond_idx)
            bond_names.append(bond_name)
            site1 = site_features["%s_%i" % (index, bond[0])]
            site2 = site_features["%s_%i" % (index, bond[1])]

            # Order putting heavier element first; on a tie the higher-index site is "atom1"
            if site1["site AtomicWeight"] > site2["site AtomicWeight"]:
                atom1, atom2 = site1, site2
            else:
                atom1, atom2 = site2, site1

            entry = {
                "structure_name": site1["structure_name"],
                "structure_path": site1["structure_path"],
            }
            # Add Site features to dictionary
            for k in site1:
                if k in meta_keys:
                    continue
                if k in delta_properties:
                    entry[k+"_diff"] = atom1[k] - atom2[k]
                entry[k+"_atom1"] = atom1[k]
                entry[k+"_atom2"] = atom2[k]

            # Insert bond properties
            entry["coordination_number"] = coord_num
            entry["bond_length"] = bond_len
            bond_features[bond_name] = entry
            bond_len_sum += bond_len

        # "volume_fraction" is each bond's length divided by the summed bond length of its structure
        for bond_name in bond_names:
            bond_features[bond_name]["volume_fraction"] = bond_features[bond_name]["bond_length"]/bond_len_sum

    ### SAVE FILE
    df_features = pd.DataFrame.from_dict(bond_features).T
    if df_features.empty:
        # Nothing to write. Creating the file here would leave it without a proper header,
        # and every later call appends without one.
        return df_features
    if os.path.isfile(saveto):  # Append
        df_features.to_csv(saveto, mode='a', header=False)
    else:  # New file
        df_features.to_csv(saveto)

    return df_features


## Test Featurizing with a single file

In [9]:
# Smoke test on a single structure. featurize_dataset appends to `saveto` when that file
# already exists, so re-running this cell adds duplicate rows to test_feat.csv.
featurize_dataset(['supercells_data/15284_super.cif'], saveto='test_feat.csv')

Saving to File!


Unnamed: 0,structure_name,structure_path,site Number_atom1,site Number_atom2,site AtomicWeight_diff,site AtomicWeight_atom1,site AtomicWeight_atom2,site Row_atom1,site Row_atom2,site Column_atom1,...,G4_0.005_1.0_1.0_atom2,G4_0.005_1.0_-1.0_atom1,G4_0.005_1.0_-1.0_atom2,G4_0.005_4.0_1.0_atom1,G4_0.005_4.0_1.0_atom2,G4_0.005_4.0_-1.0_atom1,G4_0.005_4.0_-1.0_atom2,coordination_number,bond_length,volume_fraction
15284_super.cif_Atom1_Bond0,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667
15284_super.cif_Atom1_Bond1,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667
15284_super.cif_Atom1_Bond2,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667
15284_super.cif_Atom1_Bond3,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667
15284_super.cif_Atom1_Bond4,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667
15284_super.cif_Atom1_Bond5,15284_super.cif,supercells_data,26.0,8.0,39.8456,55.845,15.9994,4.0,2.0,8.0,...,7.58149,3.59042,2.622812,3.157582,3.895691,0.623596,0.327758,6,1.9495,0.166667


## Featurize all data in the target folder in batches to limit memory use
Note: be sure to delete any previous features.csv file before running, because the function appends to the file if it already exists

In [13]:
# Batching files to reduce memory use
BATCH_SIZE = 5  # Number of CIF files passed to featurize_dataset per call

# Load all CIF files in directory
file_type = "_super.cif"  # Use files with this ending in input_dir
input_dir = "supercells_data/"  # Input data directory
output_dir = "features/"  # Output directory
filename = "features.csv"  # Output filename for features
output_path = os.path.join(output_dir, filename)  # File that every batch appends to

# featurize_dataset appends when the output file exists, so start from a clean file.
# The previous checks tested isdir() on the file path and looked for `filename` in the
# working directory, so an old features file in output_dir was never removed.
os.makedirs(output_dir, exist_ok=True)
if os.path.isfile(output_path):  # Clean up any previous runs
    os.remove(output_path)

# Sort so that batch contents do not depend on the arbitrary order of os.listdir
files = sorted(os.listdir(input_dir))
cif_files = [os.path.join(input_dir, file) for file in files if file.endswith(file_type)]

# Featurize all structures
n_batches = int(np.ceil(len(cif_files)/BATCH_SIZE))

print("{} Batches Total: ".format(n_batches))
for b in range(n_batches):
    print("Starting batch ", b)
    # Define which files belong to this batch
    idx_start = b*BATCH_SIZE
    idx_end = min((b+1)*BATCH_SIZE, len(cif_files))
    start = timeit.default_timer()
    data_frame = featurize_dataset(cif_files[idx_start:idx_end], saveto=output_path)
    print("Time elapsed: ", timeit.default_timer() - start)

print("Files processed: ", len(cif_files))

109 Batches Total: 
Starting batch  0




Saving to File!
Time elapsed:  15.834772899746895
Starting batch  1




Saving to File!
Time elapsed:  30.958941199816763
Starting batch  2




Saving to File!
Time elapsed:  48.47410449991003
Starting batch  3




Saving to File!
Time elapsed:  48.20701939985156
Starting batch  4




Saving to File!
Time elapsed:  33.048618000000715
Starting batch  5




Saving to File!
Time elapsed:  5.007014899980277
Starting batch  6




Saving to File!
Time elapsed:  106.1624608002603
Starting batch  7




Saving to File!
Time elapsed:  28.552425399888307
Starting batch  8




Saving to File!
Time elapsed:  25.434288000222296
Starting batch  9




Saving to File!
Time elapsed:  1.7077071997337043
Starting batch  10




Saving to File!
Time elapsed:  146.22171639977023
Starting batch  11




Saving to File!
Time elapsed:  46.829804399982095
Starting batch  12




Saving to File!
Time elapsed:  7.097039999905974
Starting batch  13
Saving to File!
Time elapsed:  15.988384600263089
Starting batch  14




Saving to File!
Time elapsed:  16.827006600331515
Starting batch  15




Saving to File!
Time elapsed:  5.63878390006721
Starting batch  16




Saving to File!
Time elapsed:  19.854101699776947
Starting batch  17




Saving to File!
Time elapsed:  27.253403400070965
Starting batch  18




Saving to File!
Time elapsed:  5.382491699885577
Starting batch  19
Saving to File!
Time elapsed:  49.7216119999066
Starting batch  20




Saving to File!
Time elapsed:  57.391573700122535
Starting batch  21
Saving to File!
Time elapsed:  125.62258609989658
Starting batch  22
Saving to File!
Time elapsed:  4.8110075001604855
Starting batch  23




Saving to File!
Time elapsed:  100.88134530000389
Starting batch  24




Saving to File!
Time elapsed:  33.50944930035621
Starting batch  25




Saving to File!
Time elapsed:  0.3805967001244426
Starting batch  26




Saving to File!
Time elapsed:  1.439369599800557
Starting batch  27
Saving to File!
Time elapsed:  6.120155700016767
Starting batch  28




Saving to File!
Time elapsed:  43.25491929985583
Starting batch  29
Saving to File!
Time elapsed:  0.3634585998952389
Starting batch  30




Saving to File!
Time elapsed:  5.748871400021017
Starting batch  31




Saving to File!
Time elapsed:  1.9997656997293234
Starting batch  32




Saving to File!
Time elapsed:  4.516792099922895
Starting batch  33
Saving to File!
Time elapsed:  0.48768569994717836
Starting batch  34




Saving to File!
Time elapsed:  2.452595899812877
Starting batch  35




Saving to File!
Time elapsed:  0.5773113998584449
Starting batch  36




Saving to File!
Time elapsed:  78.63905119989067
Starting batch  37




Saving to File!
Time elapsed:  6.222334300167859
Starting batch  38




Saving to File!
Time elapsed:  4.512644799891859
Starting batch  39




Saving to File!
Time elapsed:  5.09362219972536
Starting batch  40




Saving to File!
Time elapsed:  3.043943800032139
Starting batch  41




Saving to File!
Time elapsed:  8.29106150008738
Starting batch  42




Saving to File!
Time elapsed:  2.957256400026381
Starting batch  43




Saving to File!
Time elapsed:  11.922840099781752
Starting batch  44




Saving to File!
Time elapsed:  18.789618300274014
Starting batch  45




Saving to File!
Time elapsed:  3.0222664000466466
Starting batch  46




Saving to File!
Time elapsed:  8.958409099839628
Starting batch  47




Saving to File!
Time elapsed:  2.308460199739784
Starting batch  48
Saving to File!
Time elapsed:  0.6186286001466215
Starting batch  49




Saving to File!
Time elapsed:  9.287925499957055
Starting batch  50




Saving to File!
Time elapsed:  5.649377999827266
Starting batch  51




Saving to File!
Time elapsed:  14.01557550020516
Starting batch  52




Saving to File!
Time elapsed:  96.31988970004022
Starting batch  53




Saving to File!
Time elapsed:  40.57678799983114
Starting batch  54




Saving to File!
Time elapsed:  15.54531899979338
Starting batch  55




Saving to File!
Time elapsed:  44.83042890019715
Starting batch  56




Saving to File!
Time elapsed:  7.615060599986464
Starting batch  57




Saving to File!
Time elapsed:  13.390086700208485
Starting batch  58




Saving to File!
Time elapsed:  43.27200910029933
Starting batch  59
Saving to File!
Time elapsed:  20.076902700122446
Starting batch  60




Saving to File!
Time elapsed:  34.61645309999585
Starting batch  61




Saving to File!
Time elapsed:  9.126204600092024
Starting batch  62




Saving to File!
Time elapsed:  23.914512400049716
Starting batch  63




Saving to File!
Time elapsed:  13.023337999824435
Starting batch  64




Saving to File!
Time elapsed:  37.52907490031794
Starting batch  65




Saving to File!
Time elapsed:  10.156881700269878
Starting batch  66




Saving to File!
Time elapsed:  12.832925300113857
Starting batch  67




Saving to File!
Time elapsed:  22.191649800166488
Starting batch  68




Saving to File!
Time elapsed:  18.311538300011307
Starting batch  69




Saving to File!
Time elapsed:  11.749955200124532
Starting batch  70




Saving to File!
Time elapsed:  22.581083699595183
Starting batch  71




Saving to File!
Time elapsed:  65.76808970002457
Starting batch  72




Saving to File!
Time elapsed:  35.775646999944
Starting batch  73




Saving to File!
Time elapsed:  34.263707200065255
Starting batch  74




Saving to File!
Time elapsed:  95.24871309986338
Starting batch  75
Saving to File!




Time elapsed:  0.2150483001023531
Starting batch  76




Saving to File!
Time elapsed:  8.664956000167876
Starting batch  77




Saving to File!
Time elapsed:  22.760689900256693
Starting batch  78




Saving to File!
Time elapsed:  26.63541720015928
Starting batch  79




Saving to File!
Time elapsed:  19.863362000323832
Starting batch  80




Saving to File!
Time elapsed:  53.760095700155944
Starting batch  81




Saving to File!
Time elapsed:  0.6908510997891426
Starting batch  82




Saving to File!
Time elapsed:  78.32298680022359
Starting batch  83




Saving to File!
Time elapsed:  176.970617600251
Starting batch  84




Saving to File!
Time elapsed:  24.825603100005537
Starting batch  85




Saving to File!
Time elapsed:  59.47477600025013
Starting batch  86




Saving to File!
Time elapsed:  79.92517849989235
Starting batch  87




Saving to File!
Time elapsed:  48.58434840012342
Starting batch  88




Saving to File!
Time elapsed:  36.394547999836504
Starting batch  89
Saving to File!
Time elapsed:  105.56187650002539
Starting batch  90




Saving to File!
Time elapsed:  280.45905769988894
Starting batch  91




Saving to File!
Time elapsed:  28.693943900056183
Starting batch  92




Saving to File!
Time elapsed:  36.03228939976543
Starting batch  93




Saving to File!
Time elapsed:  10.928208800032735
Starting batch  94




Saving to File!
Time elapsed:  5.0170090002939105
Starting batch  95




Saving to File!
Time elapsed:  9.515094999689609
Starting batch  96




Saving to File!
Time elapsed:  3.85476309992373
Starting batch  97




Saving to File!
Time elapsed:  10.25018860027194
Starting batch  98




Saving to File!
Time elapsed:  15.896392999682575
Starting batch  99




Saving to File!
Time elapsed:  50.68608390027657
Starting batch  100




Saving to File!
Time elapsed:  15.241548999678344
Starting batch  101
Saving to File!
Time elapsed:  13.320395899936557
Starting batch  102




Saving to File!
Time elapsed:  103.65178920002654
Starting batch  103




Saving to File!
Time elapsed:  17.01253419974819
Starting batch  104




Saving to File!
Time elapsed:  13.531726500019431
Starting batch  105




Saving to File!
Time elapsed:  20.048045299947262
Starting batch  106




Saving to File!
Time elapsed:  0.8928411998786032
Starting batch  107




Saving to File!
Time elapsed:  3.874232300091535
Starting batch  108
Saving to File!
Time elapsed:  4.528739899862558
Files processed:  541


In [3]:
   ## 1. Bag of Bonds
# Disabled experiments kept for reference: the code below sits inside bare triple-quoted
# strings, so none of it runs. It also refers to names that are not defined at notebook
# level here (`data`, `site_features`, `property_list`, `verbos`, `LocalPropertyStatsNew`);
# the BagofBonds/BondFractions import in the first cell is commented out.
"""
print("bag of bonds")
BB = BagofBonds()
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    BB.fit([structure])
    feat = BB.bag(structure)
    print(feat)
    site = list(feat.keys())
    print(site[0])
    print(structure[site])

print("bond fraction")
BF = BondFractions()
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    feat = BF.fit_transform([structure])
    #print(feat)
    #print(BF.feature_labels())
"""
## 5. site difference stats
"""
print("LPD")
LPD = LocalPropertyStatsNew(properties=property_list)
colnames = LPD._generate_column_labels(multiindex=False, return_errors=False)
for index, row in data.iterrows():
    structure = row["structure"]
    if verbos:
        print(index)
    for atomidx in range(structure.num_sites):
        feat = LPD.featurize(structure, idx=atomidx)
        site_name = "%s_%i" % (index, atomidx)
        site_features[site_name].update(dict(zip(colnames, feat)))
"""
    