In [1]:
import os
import json
import pandas as pd
import numpy as np

# TODO: for a/b/c TEC from the same paper, average values for volumetric TEC?

In [2]:
def generate_labels(file_list, input_dir):
    """Build a table of thermal-expansion labels from per-sample JSON files.

    Each file is expected to hold a mapping with ``data["sample"]["material"]["phase_id"]``
    and a list ``data["sample"]["measurement"]``; only the first measurement and
    its first condition are read.

    Parameters
    ----------
    file_list : iterable of str
        File names (relative to ``input_dir``). The row label is the part of
        the name before the first ``".json"``.
    input_dir : str
        Directory containing the files. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``phase_id``, ``thermal_expansion`` and
        ``temperature`` (in that order).

    Raises
    ------
    ValueError
        If the first measurement's property is neither a linear nor a volume
        thermal expansion coefficient.
    """
    labels = {}
    for file in file_list:
        # The context manager closes the handle even if parsing raises.
        # os.path.join keeps the path valid with or without a trailing separator.
        with open(os.path.join(input_dir, file)) as f:
            data = json.load(f)

        name = file.split(".json")[0]
        measurement = data["sample"]["measurement"][0]
        labels[name] = {"phase_id": data["sample"]["material"]["phase_id"]}

        # Find TEC, scaled by 1e6 (i.e. expressed in 1e-6/K units,
        # assuming the stored scalar is in 1/K -- TODO confirm against the data source)
        property_name = measurement["property"]["name"]
        scalar = measurement["property"]["scalar"]
        if property_name == "linear thermal expansion coefficient":
            labels[name].update({"thermal_expansion": 1e6*scalar})
        elif property_name == "volume thermal expansion coefficient":
            # Convert volume TEC to linear TEC by dividing by 3
            labels[name].update({"thermal_expansion": (1e6/3)*scalar})
        else:
            raise ValueError(f"Unexpected property found: {property_name}")

        # Save the temperature condition of the measurement
        condition = measurement["condition"][0]
        if "scalar" in condition:
            labels[name].update({"temperature": condition["scalar"]})
        else:
            # If a range of temperatures is given, record the mean of the listed values
            # TODO: dont just use mean of data range?
            labels[name].update({"temperature": np.mean(condition["range"])})

    # Transpose so that each file becomes a row and each label a column
    labels_df = pd.DataFrame.from_dict(labels).T
    return labels_df

## Compile all JSON files into a single labels file

In [3]:
# File Management
output_dir = "labels/"  # Output labels
file_type = ".json"  # Extension of the input sample files

# (input directory of json files, name of the labels file written to output_dir)
label_jobs = [
    ("data_volume/", "labels_volume.csv"),
    ("data_linear/", "labels_linear.csv"),
]

# Make Output Directory if needed; exist_ok replaces the repeated
# check-then-create blocks and cannot fail if the directory already exists.
os.makedirs(output_dir, exist_ok=True)

# The volume and linear data sets go through the exact same steps, so run them
# in one loop. The loop variables keep the names the original script used, so
# after the loop they hold the values of the last (linear) job.
for input_dir, filename in label_jobs:
    # Load all JSON files in
    files = os.listdir(input_dir)
    json_files = [file for file in files if file.endswith(file_type)]

    labels_df = generate_labels(json_files, input_dir)
    labels_df.to_csv(output_dir+filename)

## Now Combine and Sort out duplicates by matching with Vol TECs

In [4]:
# Why is this step needed? Many linear TEC values are directional in the lattice,
# leaving 3x different labels for many structures. Instead we use the volume TEC
# for these entries.

# Columns that together identify "the same measurement" in both tables
key_columns = ['phase_id', 'temperature', 'entry']


def _load_label_table(csv_path):
    """Read a labels CSV, add the 'entry' column and sort by phase_id (descending)."""
    table = pd.read_csv(csv_path, index_col=0)
    # 'entry' is the part of the row label before the first '-'
    table['entry'] = table.index.astype(str).str.split('-').str[0]
    return table.sort_values(by=['phase_id'], ascending=False)


def _match_keys(table):
    """Return one hashable (phase_id, temperature, entry) tuple per row of `table`."""
    # NaN never compares equal to itself, so missing values are mapped to None;
    # rows with the same gaps then still match, as they did with Series.equals.
    return [
        tuple(None if pd.isna(value) else value for value in record)
        for record in table[key_columns].itertuples(index=False, name=None)
    ]


# Load and process Volume and Linear TEC Labels
vol_df = _load_label_table("labels/labels_volume.csv")
lin_df = _load_label_table("labels/labels_linear.csv")

# Group the linear row labels by key once, instead of rescanning every linear
# row for every volume row. Row order inside each group follows lin_df.
lin_rows_by_key = {}
for label, key in zip(lin_df.index, _match_keys(lin_df)):
    lin_rows_by_key.setdefault(key, []).append(label)

# Check if a volume TEC can be used to replace 3+ directional linear TECs
duplicate_matches = 0
rows_to_drop = []
for key in _match_keys(vol_df):
    matches = lin_rows_by_key.get(key, [])
    if len(matches) >= 3:
        print(matches)
        duplicate_matches += 1
        rows_to_drop.extend(matches)
        # These rows are removed; a repeated volume key must not count them again
        del lin_rows_by_key[key]
lin_df = lin_df.drop(rows_to_drop)

print("Duplicates matched with Vol TEC: ", duplicate_matches)

# Now combine and see how many duplicates are left
labels_df = pd.concat((lin_df, vol_df))
labels_df = labels_df.sort_index()
labels_df.to_csv("labels/labels.csv")

['P1127173-2', 'P1127173-4', 'P1127173-3']
['P1110928-2', 'P1110928-3', 'P1110928-1']
['P1118650-1', 'P1118650-2', 'P1118650-3']
['P1317442-3', 'P1317442-4', 'P1317442-5']
['P1317441-2', 'P1317441-3', 'P1317441-4']
['P1712101-2', 'P1712101-4', 'P1712101-3']
['P1913322-7', 'P1913322-8', 'P1913322-9']
['P1712100-4', 'P1712100-3', 'P1712100-2']
['P1911327-2', 'P1911327-3', 'P1911327-4']
['P1811326-4', 'P1811326-5', 'P1811326-6']
['P1822379-2', 'P1822379-3', 'P1822379-1']
['P1128949-7', 'P1128949-8', 'P1128949-9']
['P1108804-7', 'P1108804-6', 'P1108804-5']
['P1108804-4', 'P1108804-3', 'P1108804-2']
['P1108806-4', 'P1108806-5', 'P1108806-6']
['P1108806-9', 'P1108806-7', 'P1108806-8']
['P1123568-4', 'P1123568-3', 'P1123568-2']
['P1520222-5', 'P1520222-4', 'P1520222-3']
['P1520221-3', 'P1520221-2', 'P1520221-4']
['P1110927-4', 'P1110927-3', 'P1110927-2']
['P1903898-7', 'P1903898-4', 'P1903898-5', 'P1903898-6', 'P1903898-3', 'P1903898-2']
['P1712806-8', 'P1712806-7', 'P1712806-6', 'P1712806-5'

## Now sort through duplicates not found in volume TEC

In [5]:
# Now combine duplicate entries with no corresponding volume label.
# Rows are scanned in index order; consecutive rows sharing the same
# (phase_id, temperature, entry) form a run that is collapsed into its first row.
duplicate_key_columns = ['phase_id', 'temperature', 'entry']


def _collapse_run(frame, run_rows, run_tecs):
    """Collapse one run of duplicate rows into its first row and return the new frame."""
    print(run_rows)
    # Toss all but the first entry which will be modified
    frame = frame.drop(run_rows[1:])
    # Make this entry the average of all entries, assuming these represent
    # thermal expansion in different directions
    if len(run_rows) == 2:
        # Here, typically [1,0,0] and [0,0,1] planes are listed with it implied
        # the [0,1,0] plane is same as first
        frame.at[run_rows[0], 'thermal_expansion'] = (2/3)*run_tecs[0] + (1/3)*run_tecs[1]
    else:
        # May list 3 primary planes as well as other planes; any additional
        # value may be for other planes, so only the first three are averaged
        frame.at[run_rows[0], 'thermal_expansion'] = np.mean(run_tecs[0:3])
    return frame


replacements = 0  # Count number of times duplicates will be replaced to report
matches = []
match_tecs = []
prev_entry = None  # No previous row yet; also keeps an empty table from failing
# labels_df.index is captured once here, so reassigning labels_df inside the
# loop does not disturb the iteration; only already-visited rows are dropped.
for row in labels_df.index:
    # Look the key up by column name rather than by column position
    entry = labels_df.loc[row, duplicate_key_columns]
    tec = float(labels_df.loc[row, 'thermal_expansion'])
    if prev_entry is not None and entry.equals(prev_entry):
        matches.append(row)
        match_tecs.append(tec)
    else:
        if len(matches) >= 2:
            labels_df = _collapse_run(labels_df, matches, match_tecs)
            replacements += 1
        matches = [row]
        match_tecs = [tec]
    prev_entry = entry

# Flush the final run: without this, duplicates at the very end of the table
# would never be reduced because no differing row follows them.
if len(matches) >= 2:
    labels_df = _collapse_run(labels_df, matches, match_tecs)
    replacements += 1

print("Duplicates reduced by averaging TEC: ", replacements)

# Save combined file
labels_df.to_csv("labels/labels_reduced.csv")

# Rows whose key still occurs more than once (e.g. duplicates that were not adjacent)
duplicated_rows = labels_df[labels_df[['entry', 'phase_id', 'temperature']].duplicated(keep=False)]
print("Number of Remaining Duplicated TECs: ", len(duplicated_rows))
duplicated_rows.to_csv("labels/duplicates.csv")

['P1100654-21', 'P1100654-22', 'P1100654-23']
['P1101182-3', 'P1101182-4']
['P1101183-10', 'P1101183-9']
['P1102312-1', 'P1102312-2', 'P1102312-3']
['P1102563-2', 'P1102563-3']
['P1105079-2', 'P1105079-3']
['P1106073-8', 'P1106073-9']
['P1106495-1', 'P1106495-2']
['P1106495-3', 'P1106495-4', 'P1106495-5']
['P1106597-5', 'P1106597-6']
['P1106598-4', 'P1106598-5']
['P1106599-5', 'P1106599-6']
['P1106993-4', 'P1106993-5', 'P1106993-6']
['P1106995-5', 'P1106995-6', 'P1106995-7']
['P1106996-4', 'P1106996-5', 'P1106996-6']
['P1107461-2', 'P1107461-3', 'P1107461-4', 'P1107461-5']
['P1107462-5', 'P1107462-6', 'P1107462-7', 'P1107462-8']
['P1107463-5', 'P1107463-6', 'P1107463-7']
['P1107464-4', 'P1107464-5', 'P1107464-6']
['P1108115-3', 'P1108115-4']
['P1110929-2', 'P1110929-3']
['P1110930-2', 'P1110930-3']
['P1111612-1', 'P1111612-2', 'P1111612-3']
['P1111771-3', 'P1111771-4', 'P1111771-5']
['P1111868-4', 'P1111868-5', 'P1111868-6']
['P1111869-6', 'P1111869-7', 'P1111869-8']
['P1111870-3', 'P1