The purpose of this script is to read each downloaded .csv file from ASM and identify 1-1-1 compositions with reported phase separation

In [None]:
import numpy as np
import pandas as pd
import pymatgen.core as mg
import glob
from collections import defaultdict

## Get list of all downloaded composition directories and create dictionary to collect the final results

In [None]:
# Column-name -> list accumulator; the parsing loop appends one value per column
# for every composition it records, so the lists stay aligned row-by-row.
results = defaultdict(list)

# Every entry directly under previous_downloads/ (the parsing loop treats each
# one as a directory named after a composition).
downloaded_comps = glob.glob('previous_downloads/*')

## Define function to clean up APD phase formatting

In [None]:
def clean_APD_phase(phase):
    """Return ``phase`` with every known APD annotation substring removed.

    The substrings are deleted one after another, in the order listed, using a
    plain case-sensitive ``str.replace``; whatever is left is returned.

    Args:
        phase: APD phase label as a string.

    Returns:
        The label with all listed substrings stripped out.
    """
    # ORDER MATTERS: a longer token must come before any shorter token it
    # contains (e.g. 'orth2' before 'orth', 'ht1'/'ht2' before 'ht').
    # Otherwise the shorter removal leaves a stray fragment such as '2',
    # which breaks downstream formula parsing.
    annotation_tokens = (
        'orth2', 'stab', 'orth', 'ht1', 'ht2', 'vol', '~', '(', ')',
        'ht', 'rt', 'cal', 'ars', 'rhom', '[', ']',
    )
    cleaned = phase
    for token in annotation_tokens:
        cleaned = cleaned.replace(token, '')
    return cleaned

# Parse through each comp's files

In [None]:
# For every downloaded composition directory:
#   1. read the composition log (first file after sorting) and select the
#      reports whose attempted composition range covers the 1-1-1 composition;
#   2. scan each selected report for a phase whose composition equals the
#      1-1-1 composition;
#   3. record the reported phase (or 'Phase separation' if none was found),
#      the temperatures of the reports that did NOT list the 1-1-1 phase, and
#      whether any row failed to parse.
for comp_dir in downloaded_comps:
    downloaded_files = sorted(glob.glob(f'{comp_dir}/*'))
    comp = comp_dir.split('/')[-1]  # directory name is used as the composition string

    comp_log = pd.read_csv(downloaded_files[0])  # first file should always be comp log after sorting

    # Boolean mask over comp_log rows: True where the report covers the
    # 1-1-1 composition. Starts with the rows already marked 'full'.
    relevant_reports = comp_log[' Attempted Composition Range'] == 'full'
    relevant_temperatures = []

    for log_idx, log_row in comp_log.iterrows():
        if not relevant_reports[log_idx]:
            # Not a full-range report: it is still relevant if the composition
            # is one of the '-'-separated entries of the attempted range.
            attempted_range = log_row[' Attempted Composition Range'].split('-')
            if comp not in attempted_range:
                continue
            relevant_reports[log_idx] = True  # mark if 1-1-1 composition is covered
        relevant_temperatures.append(log_row[' Temperature (Kelvin)'])

    # Same row order as relevant_temperatures, so positions line up.
    relevant_report_filenames = list(comp_log[relevant_reports]['Filename'])

    if not relevant_report_filenames:  # skip compositions where the 1-1-1 composition range was not reported
        continue

    reported_phase = 'Phase separation'  # assume phase separation
    parse_error = False
    drop_temps = []  # positions (into relevant_temperatures) of reports listing the 1-1-1 phase
    mg_111 = mg.Composition(comp)  # loop-invariant, so built once per composition

    # NOTE: the report position and the row index use distinct names. The
    # earlier version reused `i` for both, so the position stored in
    # drop_temps was really the last row label of the report.
    for report_idx, report_fname in enumerate(relevant_report_filenames):
        try:
            report = pd.read_csv(f'{comp_dir}/{report_fname}')
        except Exception:
            # Best effort: a missing/unreadable report is skipped and its
            # temperature is left in place.
            continue

        found_111_in_report = False

        for _, row in report.iterrows():
            try:
                try:
                    phase = row['Formula']
                    mg_phase = mg.Composition(phase.replace('~', ''))  # need to slightly pre-format formula
                except Exception:
                    # No usable 'Formula' value: fall back to the APD phase
                    # label, whose column name differs between reports.
                    try:
                        phase = row['APD phase label, published phase label'].split('\n----\n')[0].split(' ')[0]  # take APD phase label
                    except Exception:
                        phase = row['APD phase label'].split('\n----\n')[0].split(' ')[0]  # take APD phase label

                    mg_phase = mg.Composition(clean_APD_phase(phase))  # need to slightly format formula

                if mg_phase == mg_111:  # if 1-1-1 phase is actually reported
                    found_111_in_report = True
                    try:
                        reported_phase = row['Pearson symbol, space group'].split('\r\n')[-1].replace('~', '')
                    except Exception:
                        reported_phase = 'NaN'
            except Exception:
                parse_error = True  # mark a composition True if some phases not identifiable

        # Drop this report's temperature only if THIS report listed the 1-1-1
        # phase. The earlier test used the cumulative reported_phase, which
        # also dropped every later report once one hit had been seen.
        if found_111_in_report:
            drop_temps.append(report_idx)

    try:
        temperatures = np.delete(relevant_temperatures, drop_temps)
    except Exception:
        temperatures = 'N/A'

    results['Composition'].append(comp)
    results['1-1-1 Reported Phase'].append(reported_phase)
    results['Phase Separating Temperatures (Kelvin)'].append(temperatures)
    results['Parse Error'].append(parse_error)

## Convert results into pandas dataframe

In [None]:
# Each key of `results` becomes a column; the per-key lists become the rows.
pd_results = pd.DataFrame(results)

## Save compositions reported to phase separate into .csv for 2nd round of checks for experimental reports

In [None]:
# Keep rows where no 1-1-1 phase was found (still the 'Phase separation'
# placeholder) and every phase in the reports could be parsed.
phase_separating = pd_results.loc[
    pd_results['1-1-1 Reported Phase'].eq('Phase separation')
    & pd_results['Parse Error'].eq(False)
]
phase_separating.to_csv('../Data Files/ASM_phase_separating.csv')

## Remove compositions that appear in the combined ICSD and Springer lists, since some experimentally reported compositions were initially missed

In [None]:
# Values of the 'Composition' column of the combined results file, as a plain list.
experimentally_reported = pd.read_csv('../Data Files/combined_results.csv')['Composition'].tolist()

In [None]:
def is_this_experimentally_reported(composition, exp_list=None):
    """Return True if ``composition`` is contained in ``exp_list``.

    Args:
        composition: Value to look up (compared with ``in``).
        exp_list: Container of experimentally reported compositions. When
            omitted (``None``), the module-level ``experimentally_reported``
            is looked up at call time. Resolving it at call time, rather than
            as a default bound when the function is defined, means reloading
            ``experimentally_reported`` is picked up without redefining this
            function.

    Returns:
        bool: membership of ``composition`` in the container.
    """
    if exp_list is None:
        exp_list = experimentally_reported
    return composition in exp_list

In [None]:
# Flag every candidate that already appears in the experimentally reported list.
# phase_separating is a boolean-filtered selection of pd_results, so writing a
# column into it in place triggers pandas' SettingWithCopyWarning. .assign()
# returns a new DataFrame with the extra column instead.
phase_separating = phase_separating.assign(
    **{'Experimentally Reported': phase_separating['Composition'].apply(is_this_experimentally_reported)}
)

In [None]:
# Discard the candidates flagged as experimentally reported, then save the rest.
phase_separating = phase_separating.loc[~phase_separating['Experimentally Reported']]
phase_separating.to_csv('../Data Files/actual_phase_separating.csv')