In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

## Load in Data

In [2]:
# Read the labels, the bond-level features, the structure-level features and
# the CIF metadata. Every file uses its first CSV column as the index.
labels_df, features_bond_df, features_struc_df, cif_meta_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "labels/labels.csv",
        "features/features_extra_bond.csv",
        "features/features_extra_structure.csv",
        "features/cif_metadata.csv",
    )
)

In [3]:
# Preview the first five rows of the structure-level feature table
features_struc_df.head(n=5)

Unnamed: 0,structure_name,structure_path,structural complexity per atom,structural complexity per cell,jml_pack_frac,jml_vpa,jml_density,jml_log_vpa,max packing efficiency,density,...,C1 polyhedra frac,C2 polyhedra frac,C3 polyhedra frac,C4 polyhedra frac,C5 polyhedra frac,C6 polyhedra frac,C7 polyhedra frac,C8 polyhedra frac,Low DOF polyhedra frac,High DOF polyhedra frac
56196_super.cif,56196_super.cif,supercells_data,8.294621,2604.510915,2.46538,5.441979,11.767896,0.825026,0.440784,5.441979436559926 g cm^-3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
52946_super.cif,52946_super.cif,supercells_data,10.691744,17684.143781,2.66221,5.490813,14.327853,0.525541,0.261041,5.490813487319241 g cm^-3,...,0.001751,0.056918,0.224168,0.11296,0.0,0.037653,0.082312,0.484238,0.178634,0.821366
18989_super.cif,18989_super.cif,supercells_data,1.370951,6.854753,2.47349,5.13637,11.8638,0.804442,0.44692,5.136370180598363 g cm^-3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50120_super.cif,50120_super.cif,supercells_data,1.370951,6.854753,2.76403,6.806522,15.86371,0.879566,0.44692,6.806521771594858 g cm^-3,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
77566_super.cif,77566_super.cif,supercells_data,1.921928,19.219281,3.03669,3.057759,20.83626,0.370867,0.33206,4.952992640506285 g cm^-3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [4]:
# Number of rows in the structure-level feature table
features_struc_df.shape[0]

739

Check for NaN values from Feature Generator

In [5]:
# Report the per-column NaN counts of both feature tables ...
for frame in (features_struc_df, features_bond_df):
    print(frame.isna().sum())

# ... then discard every row that contains at least one NaN.
features_struc_df = features_struc_df.dropna(axis=0, how="any")
features_bond_df = features_bond_df.dropna(axis=0, how="any")

structure_name                    0
structure_path                    0
structural complexity per atom    2
structural complexity per cell    2
jml_pack_frac                     0
jml_vpa                           0
jml_density                       0
jml_log_vpa                       0
max packing efficiency            0
density                           0
vpa                               0
packing fraction                  0
number of elements                0
entropy of mixing                 0
C1 polyhedra frac                 6
C2 polyhedra frac                 6
C3 polyhedra frac                 6
C4 polyhedra frac                 6
C5 polyhedra frac                 6
C6 polyhedra frac                 6
C7 polyhedra frac                 6
C8 polyhedra frac                 6
Low DOF polyhedra frac            6
High DOF polyhedra frac           6
dtype: int64
structure_name                  0
structure_path                  0
site Number_atom1               0
site Number_atom2    

In [6]:
# Choose Features
# Bond-level feature columns: everything except the two leading file-name
# columns and the trailing bond volume-fraction column.
columns_all = features_bond_df.columns[2:-1].tolist()
# Values of the last bond-table column (the bond volume fractions)
feature_weights = features_bond_df.iloc[:, -1].to_numpy()
# Structure-level feature columns: everything after the two file-name columns
struc_columns = features_struc_df.columns[2:].tolist()

# Accumulators filled while pairing each label with its features.
# Labels are copied over to each bond (just for correlation).
y = []
weights = []  # per-structure bond weights
combined_data = []  # bond features (features_bond) combined with label data
struc_data = []  # structure-level features (feature_structures) per label
labels_skipped = 0  # number of labels that did not have a structure associated with them
max_length = 0  # largest number of bonds found in a single structure
all_phases = set()  # phases seen so far, used to count unique structures
# Names of every feature column: [bond-level names, structure-level names]
feature_labels = [
    columns_all + ["temperature", "bond frac"],
    struc_columns + ["temperature", "dil_data", "xrd_data"],
]
sample_class_labels = []  # class labels taken from the CIF metadata

skipped_phases = set()

def string2float(s):
    """Parse the leading number out of a string such as ``"5.44 g cm^-3"``.

    Parameters
    ----------
    s : object
        A string whose first whitespace-separated token is a number, or any
        non-string value.

    Returns
    -------
    pandas.Series or object
        For a string, a one-element ``pd.Series`` holding the parsed float.
        Any non-string input is returned unchanged.

    Raises
    ------
    ValueError
        If ``s`` is a string that is empty/blank or whose first token is not
        a valid float.
    """
    if isinstance(s, str):
        # split() without an argument ignores leading and repeated whitespace,
        # so " 5.44  g cm^-3" parses the same as "5.44 g cm^-3".
        tokens = s.split()
        if not tokens:
            raise ValueError("cannot parse a number from an empty string")
        return pd.Series(float(tokens[0]))
    return s

# For every label, use the listed phase to pair the label with its bond-level
# and structure-level features.
#
# Everything belonging to one label is computed first and only appended to the
# accumulators once all of it succeeded. This keeps `y`, `weights`,
# `combined_data`, `struc_data` and `sample_class_labels` aligned index by
# index: a label whose structure features cannot be cast to numbers is skipped
# as a whole instead of leaving `struc_data` one entry short.
# NOTE(review): this assumes 'structure_name' is unique in features_struc_df,
# i.e. one structure row per label - confirm against the featurizer output.
meta_columns = ["material_class", "symmetry_int_table_number", "space_goup",
                "crystal_system", "num_elements", "composition_class"]

for idx in labels_df.index:
    phase = int(labels_df.loc[idx, 'phase_id'])
    structure_file = str(phase) + '_super.cif'
    # Select all rows that belong to this phase
    feat = features_bond_df[features_bond_df['structure_name'] == structure_file]
    feat_struc = features_struc_df[features_struc_df['structure_name'] == structure_file]

    if len(feat) == 0 or len(feat_struc) == 0:  # feature data must exist in both files
        print("Phases ", phase, " not found in features data")
        labels_skipped += 1
        skipped_phases.add(phase)
        continue

    temperature = labels_df.loc[idx, "temperature"]

    # Bond weights as a (num_bonds, 1) float tensor
    bond_weights = torch.tensor(feat['volume_fraction'].to_numpy(dtype="float32")).view(-1, 1)

    # .assign() returns a new frame, so no column is written onto a slice of
    # the source tables (avoids pandas' SettingWithCopyWarning).
    feat = feat[columns_all].assign(temperature=temperature)  # temperature as a feature
    bond_tensor = torch.tensor(feat.to_numpy(dtype="float32"))

    feat_struc = feat_struc[struc_columns].assign(
        temperature=temperature,
        dil_data=labels_df.loc[idx, "dil_data"],  # how the data was (supposedly) collected
        xrd_data=labels_df.loc[idx, "xrd_data"],
    )

    try:
        # TODO: fix this in featurizer
        # The featurizer returns some values (e.g. density) as strings with a
        # unit suffix; strip the unit before casting.
        for col in feat_struc.columns.to_list():
            feat_struc[col] = feat_struc[col].apply(string2float)
        struc_tensor = torch.tensor(feat_struc.to_numpy(dtype="float32"))
    except (TypeError, ValueError):
        # Skip the whole label so the accumulators stay aligned.
        print("Struc failed to cast! ", feat_struc.values)
        labels_skipped += 1
        skipped_phases.add(phase)
        continue

    class_labels = cif_meta_data.loc[phase, meta_columns].values.flatten().tolist()

    # All pieces are available: commit this label.
    weights.append(bond_weights)
    combined_data.append(bond_tensor)  # features_bond.csv plus temperature from labels.csv
    struc_data.append(struc_tensor)
    y.append(labels_df.loc[idx, "thermal_expansion"])
    sample_class_labels.append(class_labels)
    all_phases.add(phase)
    max_length = max(max_length, len(feat))

print("Total Number of Labels: ", len(y))
print("Skipped {} structures".format(labels_skipped))
print("Total Unique structures", len(all_phases))
print("Max Number of Bonds: ", max_length)
# Recast y as a (num_labels, 1) torch tensor
y = torch.tensor(y, dtype=torch.float).view(-1, 1)

Phases  122321  not found in features data
Phases  26534  not found in features data
Phases  175446  not found in features data
Phases  86000  not found in features data
Phases  86170  not found in features data
Phases  10874  not found in features data
Phases  105027  not found in features data
Phases  170350  not found in features data
Phases  95569  not found in features data
Phases  95569  not found in features data
Phases  95569  not found in features data
Phases  95569  not found in features data
Phases  94093  not found in features data
Phases  94093  not found in features data
Phases  94093  not found in features data
Phases  77247  not found in features data
Phases  77247  not found in features data
Phases  77247  not found in features data
Phases  106509  not found in features data
Phases  106509  not found in features data
Phases  106509  not found in features data
Phases  77246  not found in features data
Phases  77246  not found in features data
Phases  95569  not found in

In [7]:
# Sanity check: show the structure-level feature frame left over from the last
# iteration of the pairing loop above.
feat_struc

Unnamed: 0,structural complexity per atom,structural complexity per cell,jml_pack_frac,jml_vpa,jml_density,jml_log_vpa,max packing efficiency,density,vpa,packing fraction,...,C4 polyhedra frac,C5 polyhedra frac,C6 polyhedra frac,C7 polyhedra frac,C8 polyhedra frac,Low DOF polyhedra frac,High DOF polyhedra frac,temperature,dil_data,xrd_data
13150_super.cif,0.918296,5.509775,2.57288,3.925697,13.10357,0.319029,0.371196,3.925697,13.10357,0.319029,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,298.0,1,0


In [8]:
# Re-scale every per-structure weight tensor whose entries do not already sum
# to one (within 1e-6).
# TODO: figure out why this is necessary
for pos, bond_w in enumerate(weights):
    total = bond_w.sum()
    if abs(total - 1) > 1e-6:
        weights[pos] = bond_w / total

In [9]:
# Build and Normalize X
# Statistics are taken over all bonds of all structures (concatenated once).
all_bonds = torch.cat(combined_data)
X_mean = torch.mean(all_bonds, dim=0)
X_std = torch.std(all_bonds, dim=0)
# A constant feature has zero spread; divide by 1 instead of 0 so the column
# becomes all zeros rather than NaN (same guard as used for X_struc).
X_std[X_std == 0] = 1

# Each entry of X holds the standardized bond features of one structure with
# the (unnormalized) bond weight appended as the last column.
X = []
for x, w in zip(combined_data, weights):
    X.append(torch.cat(((x - X_mean) / X_std, w), dim=1))

In [10]:
# Build and Normalize X_struc
struc_stack = torch.cat(struc_data)
X_struc_mean = struc_stack.mean(dim=0)
X_struc_std = struc_stack.std(dim=0)

# The last two columns are one-hot data labels: leave them unnormalized
X_struc_mean[-2:] = 0
X_struc_std[-2:] = 1
# Columns with zero spread are divided by 1 instead of 0
X_struc_std[X_struc_std == 0] = 1

# Standardize every structure row and stack the rows into a single tensor
X_struc = torch.cat([(row - X_struc_mean) / X_struc_std for row in struc_data])

In [11]:
# Normalize Y: standardize the labels and keep the statistics for later use.
# Note that `y` is overwritten, so running this cell a second time would
# recompute the statistics from the already-normalized values.
y_mean, y_std = y.mean(), y.std()
y = (y - y_mean) / y_std

In [12]:
# Standard deviation of the labels as computed before y was normalized
y_std

tensor(8.7882)

In [13]:
# Mean of the labels as computed before y was normalized
y_mean

tensor(12.9118)

In [14]:
# Save the feature/label tensors together with their normalization statistics
# and the descriptive labels in a single file.
torch.save(
    {
        'X': X,
        'y': y,
        'X_struc': X_struc,
        'X_mean': X_mean,
        'X_std': X_std,
        'X_struc_mean': X_struc_mean,
        'X_struc_std': X_struc_std,
        'y_mean': y_mean,
        'y_std': y_std,
        'sample_class_labels': sample_class_labels,
        'feature_labels': feature_labels,
    },
    'features/data_tensors_extra.pth',
)