In [None]:
### Imports ###
import os
import pandas as pd
import smact
from smact import Species  # Import Species class from smact library for representing species
from smact.oxidation_states import Oxidation_state_probability_finder  # Import Oxidation_state_probability_finder class from smact library for finding oxidation states
from pymatgen.core.composition import Composition  # Import Composition class from pymatgen library for representing compositions
from pymatgen.core.structure import Structure  # Import Structure class from pymatgen library for representing crystal structures
from pymatgen.analysis.structure_prediction.substitution_probability import SubstitutionPredictor
from pymatgen.ext.matproj import MPRester
from pymatgen.analysis.bond_valence import BVAnalyzer
from pymatgen.io.cif import CifWriter
import json

## Loading and Describing Useful Band Gaps Data

This code block loads a CSV file containing filtered useful band gaps, renames the `pretty_formula` column to `formula_pretty`, and prints the first few rows to confirm the change. It also provides a statistical summary of the dataset for an overview of its contents.


In [None]:
# Read the filtered band-gap table and rename the formula column in a single
# chained expression.
useful_BGs_used = (
    pd.read_csv('drive/MyDrive/data/Bandgaps/filtered_useful_BGs.csv')
    .rename(columns={'pretty_formula': 'formula_pretty'})
)

# Plain-text preview of the leading rows, to confirm the renamed column.
print(useful_BGs_used.head())

# Final expression of the cell: summary statistics shown as the cell output.
useful_BGs_used.describe()

## Filtering and Calculating Sustainability Scores for Useful Band Gaps

This code block filters out compounds containing undesired elements and calculates a sustainability score for each remaining composition. 

1. **Filter Unwanted Elements**: Removes compounds containing specific unwanted elements from the dataset.
2. **Calculate Sustainability Scores**: Computes a sustainability score for each remaining compound as the weight-fraction-weighted sum of the Herfindahl-Hirschman Index (`HHI_r`) values of its constituent elements.
3. **Create DataFrame**: Converts the filtered list of compounds with their sustainability scores back into a pandas DataFrame.


In [None]:
# Elements excluded from the screening: any compound containing at least one
# of them is dropped.
unwanted_els = ['Be','Hg','Pb','Tl','Pr','Nd','Sm','Gd','Dy','Ho','Er','Tm','Lu','Hf','Ta']

# One plain dict per row (column name -> value). Asking pandas for records
# directly avoids transposing the whole frame just to iterate over its rows.
all_comps = useful_BGs_used.to_dict('records')

# Keep only the rows whose composition contains none of the unwanted elements.
# `any` stops at the first unwanted element instead of scanning the remainder.
wanted_comps = [
    row
    for row in all_comps
    if not any(
        el.symbol in unwanted_els
        for el in Composition(row['composition_obj']).elements
    )
]

# Work out sustainability score (based on HHI) for each composition
def sus_calc(comp):
    """Return the weight-fraction-weighted sum of ``HHI_r`` over ``comp``.

    Args:
        comp: Composition-like object exposing ``elements`` (items with a
            ``symbol`` attribute) and ``get_wt_fraction(element)``.

    Returns:
        The sum, over every element of ``comp``, of its weight fraction
        multiplied by ``smact.Element(symbol).HHI_r``; ``0`` if ``comp`` has
        no elements.
    """
    return sum(
        comp.get_wt_fraction(element) * smact.Element(element.symbol).HHI_r
        for element in comp.elements
    )

# Attach the HHI-based score to every retained record; the dicts are updated
# in place, so `wanted_comps` itself carries the new 'sus_factor' key.
for record in wanted_comps:
    record.update(sus_factor=sus_calc(Composition(record['composition_obj'])))

# Rebuild a DataFrame from the scored records.
filtered_useful_BGs = pd.DataFrame(wanted_comps)

In [None]:
# Order the compounds by ascending sustainability score and renumber the rows.
filtered_useful_BGs = (
    filtered_useful_BGs
    .sort_values(by='sus_factor', ascending=True)
    .reset_index(drop=True)
)

# Formulas to query, in score order. Several rows may share one formula, and
# the fetch step asks the API for *all* entries of a formula, so a repeated
# formula would be queried again and its structures collected twice.
# `dict.fromkeys` keeps the first occurrence of each formula and preserves order.
selected_formulas = list(dict.fromkeys(filtered_useful_BGs['formula_pretty']))

## Fetching and Decorating Structures with Oxidation States

This code block fetches crystal structures for given formulas from the Materials Project API and decorates them with oxidation states. It uses concurrent processing to handle multiple formulas efficiently.

1. **Initialize API**: Sets up the connection to the Materials Project API with the provided API key.
2. **Fetch and Decorate Structures**: Defines a function to fetch entries for a given formula, and decorate the structures with oxidation states using `BVAnalyzer`.
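3. **Batch Processing**: Submits formulas to the worker pool in batches of `batch_size`. Note that every decorated structure is still appended to a single in-memory list, so batching limits how many tasks are queued at once, not the total memory held by the results.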
4. **Concurrent Processing**: Utilizes a thread pool executor to fetch and decorate structures concurrently, improving efficiency.
5. **Collect Results**: Collects all decorated structures into a single list and prints the total number of structures processed.

This approach ensures efficient and concurrent fetching and decorating of crystal structures, handling large datasets in a manageable way.


In [None]:
import concurrent.futures

# Read the Materials Project API key from the environment so the secret does
# not have to be stored in the notebook. `MP_API_KEY` is tried first, then
# `PMG_MAPI_KEY`; the original "XXXX" placeholder is kept as a last resort so
# the cell behaves as before when neither variable is set.
api_key = os.environ.get("MP_API_KEY") or os.environ.get("PMG_MAPI_KEY") or "XXXX"
mpr = MPRester(api_key)

def fetch_and_decorate(formula):
    """Fetch every entry for ``formula`` and decorate its structure with oxidation states.

    Uses the module-level ``mpr`` client to retrieve the entries and a
    ``BVAnalyzer`` to add oxidation states to each entry's structure.

    Args:
        formula: Formula string passed to ``mpr.get_entries``.

    Returns:
        A list with one decorated structure per entry that could be decorated.
        Entries whose decoration raises are reported with ``print`` and
        skipped. If anything else in the call raises (including the fetch
        itself), the error is reported and an empty list is returned.
    """
    try:
        entries = mpr.get_entries(formula)
        # The analyzer is built with the same (default) arguments for every
        # entry, so create it once per call rather than once per entry.
        bva = BVAnalyzer()
        decorated_structures = []
        for entry in entries:
            structure = entry.structure
            try:
                decorated_structures.append(
                    bva.get_oxi_state_decorated_structure(structure)
                )
            except Exception as e:
                # Best effort: one undecoratable entry must not discard the rest.
                print(f"Oxidation states could not be added for {formula}: {e}")
        return decorated_structures
    except Exception as e:
        print(f"Failed to fetch structure for {formula}: {e}")
        return []

# Generator to yield batches of formulas to process
def formula_batches(formulas, batch_size):
    """Lazily yield consecutive slices of ``formulas`` of length ``batch_size``.

    Args:
        formulas: Sliceable sequence supporting ``len``.
        batch_size: Step between slice starts and maximum slice length.

    Yields:
        ``formulas[start:start + batch_size]`` for each start offset; the last
        slice may be shorter than ``batch_size``.
    """
    starts = range(0, len(formulas), batch_size)
    yield from (formulas[start:start + batch_size] for start in starts)

# List to hold all decorated structures
all_decorated_structures = []

batch_size = 10  # Number of formulas submitted to the pool at a time
max_workers = 4  # Number of worker threads running fetch_and_decorate concurrently

# One thread pool is created up front and reused for every batch, instead of
# starting and shutting down a fresh pool per batch.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    for batch in formula_batches(selected_formulas, batch_size):
        # executor.map yields results in input order; draining it here means a
        # batch is fully collected before the next one is submitted.
        for result in executor.map(fetch_and_decorate, batch):
            all_decorated_structures.extend(result)

print(f"Total structures fetched and decorated: {len(all_decorated_structures)}")

In [None]:
from pymatgen.analysis.structure_prediction.substitutor import Substitutor

# Initialize the substitution predictor with a probability threshold
substitutor = Substitutor(threshold=1e-5)

# List to hold new predicted structures
predicted_structures = []

# Iterate through the fetched structures, asking the substitutor for predictions
# one structure at a time. Any exception is printed and the structure skipped,
# so a failure never stops the loop.
#
# NOTE(review): pymatgen documents this method as
# `pred_from_structures(target_species, structures_list, remove_duplicates, remove_existing)`,
# where `structures_list` is a list of {'structure': ..., 'id': ...} dicts.
# The call below passes a single positional list of Structure objects and no
# target species. If that signature applies to the installed version, every call
# raises, the `except` below swallows it, and `predicted_structures` stays empty.
# TODO confirm against the installed pymatgen and decide which target species
# are intended.
for structure in all_decorated_structures:
    try:
        substitutions = substitutor.pred_from_structures([structure], remove_duplicates=True, remove_existing=False)
        predicted_structures.extend(substitutions)
    except Exception as e:
        print(f"Substitution prediction failed for a structure: {e}")

# A count of 0 here together with one failure message per structure would be
# consistent with the signature concern noted above.
print(f"Total predicted structures: {len(predicted_structures)}")

In [None]:
import os
import json

# Output folder for serialized structures; created if it is missing.
save_dir = 'drive/MyDrive/data/Structures'
os.makedirs(save_dir, exist_ok=True)

# Destination file inside that folder.
save_path = os.path.join(save_dir, 'predicted_structures.json')

# Turn every predicted structure into its dictionary form for serialization.
predicted_structures_dicts = [struct.as_dict() for struct in predicted_structures]

# Write the whole list as indented JSON.
with open(save_path, 'w') as out_file:
    json.dump(predicted_structures_dicts, out_file, indent=4)

print(f"Predicted structures saved to {save_path}")

In [None]:
# Read the saved list of structure dictionaries and rebuild pymatgen Structure
# objects from them in one step.
with open('drive/MyDrive/data/Structures/predicted_structures.json', 'r') as f:
    tetra_element_oxide_for_calc = [
        Structure.from_dict(struct_dict) for struct_dict in json.load(f)
    ]

In [None]:
# Probability model used to score each compound.
ox = Oxidation_state_probability_finder()

# Symbols that are kept when scoring: smact's metals plus oxygen.
metals_and_anions = smact.metals + ['O']

# Running count of compounds that are not below the threshold.
num_passes = 0

for struc in tetra_element_oxide_for_calc:
    # Species of every site whose element symbol is a metal or oxygen.
    species = [
        site.specie for site in struc if site.specie.symbol in metals_and_anions
    ]

    # Probability assigned to this set of species.
    prob = ox.compound_probability(species)

    # Anything that is not strictly below 0.005 counts as a pass.
    if not (prob < 0.005):
        num_passes += 1
        continue

    # Below the threshold: report the offending species list.
    print(species)
    print('Below threshold!')

# Report how many compounds passed the oxidation state probability test.
print('number of compounds to pass the oxidation state probability test: {}'.format(num_passes))