In [21]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

In [36]:
# Read the three input tables; the first CSV column of the labels file becomes its index.
labels_df = pd.read_csv("labels/labels_reduced.csv", index_col=0)
features_bond_df = pd.read_csv("features/features_bond.csv")
features_struc_df = pd.read_csv("features/features_structure.csv")

# Feature column names, picked by position.
# NOTE(review): these slices depend on the CSV column order. Confirm that the
# first three bond columns are identifiers and that the last one holds the
# bond volume fraction used as the weight.
columns_all = list(features_bond_df.columns[3:-1])
struc_columns = list(features_struc_df.columns[:-2])

# Per-bond weights: the values of the last column of the bond table.
feature_weights = features_bond_df.iloc[:, -1].to_numpy()

# Structure id for every bond row: the part of 'structure_name' before '_super'.
feature_phase_ids = (
    features_bond_df['structure_name']
    .astype(str)
    .str.split('_super')
    .str[0]
    .to_list()
)

In [24]:
# Preview the first five rows of the per-bond feature table.
features_bond_df.head()

Unnamed: 0.1,Unnamed: 0,structure_name,structure_path,site Number_atom1,site Number_atom2,site AtomicWeight_diff,site AtomicWeight_atom1,site AtomicWeight_atom2,site Row_atom1,site Row_atom2,...,G4_0.005_1.0_1.0_atom2,G4_0.005_1.0_-1.0_atom1,G4_0.005_1.0_-1.0_atom2,G4_0.005_4.0_1.0_atom1,G4_0.005_4.0_1.0_atom2,G4_0.005_4.0_-1.0_atom1,G4_0.005_4.0_-1.0_atom2,coordination_number,bond_length,volume_fraction
0,10004_super.cif_Atom0_Bond0,10004_super.cif,supercells_data,62.0,8.0,134.3606,150.36,15.9994,6.0,2.0,...,7.729945,2.356312,2.636161,3.98593,4.03554,0.183682,0.274427,8,2.348969,0.018402
1,10004_super.cif_Atom0_Bond1,10004_super.cif,supercells_data,62.0,8.0,134.3606,150.36,15.9994,6.0,2.0,...,7.753143,2.356312,2.747697,3.98593,4.025568,0.183682,0.348856,8,2.322674,0.018196
2,10004_super.cif_Atom0_Bond2,10004_super.cif,supercells_data,62.0,8.0,134.3606,150.36,15.9994,6.0,2.0,...,7.729945,2.356312,2.636161,3.98593,4.03554,0.183682,0.274427,8,2.348969,0.018402
3,10004_super.cif_Atom0_Bond3,10004_super.cif,supercells_data,62.0,8.0,134.3606,150.36,15.9994,6.0,2.0,...,7.729945,2.356312,2.636161,3.98593,4.03554,0.183682,0.274427,8,2.581681,0.020225
4,10004_super.cif_Atom0_Bond4,10004_super.cif,supercells_data,62.0,8.0,134.3606,150.36,15.9994,6.0,2.0,...,7.729945,2.356312,2.636161,3.98593,4.03554,0.183682,0.274427,8,2.580842,0.020218


In [25]:
# Preview the last five rows of the per-bond feature table.
features_bond_df.tail()

Unnamed: 0.1,Unnamed: 0,structure_name,structure_path,site Number_atom1,site Number_atom2,site AtomicWeight_diff,site AtomicWeight_atom1,site AtomicWeight_atom2,site Row_atom1,site Row_atom2,...,G4_0.005_1.0_1.0_atom2,G4_0.005_1.0_-1.0_atom1,G4_0.005_1.0_-1.0_atom2,G4_0.005_4.0_1.0_atom1,G4_0.005_4.0_1.0_atom2,G4_0.005_4.0_-1.0_atom1,G4_0.005_4.0_-1.0_atom2,coordination_number,bond_length,volume_fraction
2384481,86345_super.cif_Atom529_Bond1453,86345_super.cif,supercells_data,8.0,8.0,0.0,15.9994,15.9994,2.0,2.0,...,3.701065,1.178408,1.157956,1.922302,1.905861,0.097303,0.092867,6,2.940812,0.000789
2384482,86345_super.cif_Atom529_Bond1454,86345_super.cif,supercells_data,8.0,8.0,0.0,15.9994,15.9994,2.0,2.0,...,3.701065,1.231454,1.157956,2.015004,1.905861,0.10331,0.092867,6,2.992393,0.000803
2384483,86345_super.cif_Atom530_Bond1455,86345_super.cif,supercells_data,8.0,8.0,0.0,15.9994,15.9994,2.0,2.0,...,3.701282,1.178247,1.158057,1.922077,1.905949,0.097272,0.092876,6,2.940812,0.000789
2384484,86345_super.cif_Atom530_Bond1456,86345_super.cif,supercells_data,8.0,8.0,0.0,15.9994,15.9994,2.0,2.0,...,3.701282,1.231488,1.158057,2.015068,1.905949,0.103288,0.092876,6,2.992418,0.000803
2384485,86345_super.cif_Atom531_Bond1457,86345_super.cif,supercells_data,8.0,8.0,0.0,15.9994,15.9994,2.0,2.0,...,3.701284,1.178241,1.158054,1.92208,1.905953,0.09727,0.092876,6,2.940812,0.000789


## Violin Plots of Different Features

In [39]:
def violin_plots(variables, units, data, out_dir="figs"):
    """Draw, save and show one violin plot per requested column.

    Parameters
    ----------
    variables : sequence of str
        Column names of ``data`` to plot. Each name is also used in the plot
        title, as the x-axis label and in the output file name.
    units : sequence of str
        y-axis label for each entry of ``variables`` (paired by position).
    data : pandas.DataFrame
        Table that holds the columns named in ``variables``.
    out_dir : str, optional
        Directory that receives ``<variable>_violin.png``. It is created when
        it does not exist yet. Defaults to ``"figs"``.
    """
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    for v, u in zip(variables, units):
        # Drop missing values: NaNs would otherwise end up in the violin's
        # density estimate.
        values = data[v].dropna().to_numpy()
        if values.size == 0:
            print("Skipping {}: no non-missing values to plot".format(v))
            continue
        fig, axes = plt.subplots()
        axes.violinplot(dataset=values)
        axes.set_title('Data Distribution for {}'.format(v))
        axes.set_ylabel(u)
        axes.set_xlabel(v)
        # Save through the figure handle instead of pyplot's "current figure".
        fig.savefig(os.path.join(out_dir, v + "_violin.png"))
        plt.show()
        # Release the figure so that long variable lists do not pile up open figures.
        plt.close(fig)

In [None]:
    
# One entry per source table: (columns to plot, matching y-axis labels, table).
violin_specs = [
    # Bond Data
    (
        ['site AtomicWeight_atom1', 'bond_length', 'site CovalentRadius_atom1', 'site Electronegativity_atom1', 'coordination_number'],
        ['amu', 'Angstroms', 'picometers', 'Electronegativity', 'Coord #'],
        features_bond_df,
    ),
    # Labels
    (
        ['thermal_expansion', 'temperature'],
        ['10^-6/K', 'Kelvin'],
        labels_df,
    ),
    # Structure
    (
        ['structural complexity per atom', 'jml_density'],
        ['bits/atom', 'JarvisML Density (?)'],
        features_struc_df,
    ),
]

# Same three calls as before, in the same order: bond data, labels, structure.
for variables, units, source_df in violin_specs:
    violin_plots(variables, units, source_df)

## Average bond features for each structure, weighted by bond volume fraction

In [8]:
def _weighted_feature_sums(bond_df, feature_columns, weights, phase_ids):
    """Sum the weight-scaled bond feature rows of every structure.

    Parameters
    ----------
    bond_df : pandas.DataFrame
        One row per bond; must contain every column in ``feature_columns``.
    feature_columns : list of str
        Numeric feature columns to aggregate.
    weights : array-like of float
        One weight per row of ``bond_df``, in row order.
    phase_ids : sequence of str
        Structure id of every row of ``bond_df``, in row order.

    Returns
    -------
    dict
        Maps each structure id to a 1-D float array with one entry per
        feature column. Keys appear in order of first occurrence.

    Raises
    ------
    KeyError
        If a requested feature column is missing from ``bond_df``.
    """
    weights = np.asarray(weights, dtype=float)
    column_positions = bond_df.columns.get_indexer(feature_columns)
    if (column_positions < 0).any():
        # get_indexer marks unknown labels with -1, which iloc would silently
        # read as "last column"; fail loudly instead.
        raise KeyError("feature columns missing from the bond table")

    # Group row positions (not index labels) so the weights and ids line up
    # with the rows even if the frame's index is not 0..n-1.
    row_positions = pd.Series(np.arange(len(bond_df)))
    group_keys = np.asarray(phase_ids, dtype=object)

    sums = {}
    for phase, positions in row_positions.groupby(group_keys, sort=False):
        rows = positions.to_numpy()
        block = bond_df.iloc[rows, column_positions].to_numpy(dtype=float)
        # (n_bonds,) @ (n_bonds, n_features) -> weighted sum over the bonds
        sums[phase] = weights[rows] @ block
    return sums


# Describe each structure by the weighted sum of its bond features.
# NOTE(review): this equals a weighted average only if the weights of one
# structure sum to one — TODO confirm for the volume fractions used here.
d = len(columns_all)  # Number of features considered
features_dict = _weighted_feature_sums(
    features_bond_df, columns_all, feature_weights, feature_phase_ids
)


## Copy over features for each label

In [9]:
# Pair every label row with the aggregated features of its structure (used for
# the correlation analysis only). Labels whose phase has no features are
# reported and skipped.
y = list()              # thermal expansion of every kept label
temps = list()          # temperature of every kept label
kept_index = list()     # label index value of every kept label
feature_rows = list()   # feature vector of every kept label
combined_data = dict()  # label index -> feature vector, kept for any later cell that reads it
labels_skipped = 0

# Read the three label columns once instead of doing scalar .loc lookups per row.
label_rows = zip(
    labels_df.index,
    labels_df['phase_id'],
    labels_df['thermal_expansion'],
    labels_df['temperature'],
)
for idx, phase_id, tec, temp_value in label_rows:
    # Feature keys are integer-like strings, so normalise e.g. 10004.0 -> "10004".
    phase = str(int(phase_id))
    if phase in features_dict:
        combined_data[idx] = features_dict[phase]
        kept_index.append(idx)
        feature_rows.append(features_dict[phase])
        y.append(tec)
        temps.append(temp_value)
    else:
        print("Phases ", phase, " not found in structures")
        labels_skipped += 1

print("Total Number of Labels: ", len(labels_df))
print("Labels Skipped Total: ", labels_skipped)

# New dataframe with features and labels. It is built from the parallel lists,
# so the row count always matches `temps` and `y`, even when a label index
# value occurs more than once.
feature_matrix = np.asarray(feature_rows, dtype=float).reshape(len(feature_rows), len(columns_all))
combined_df = pd.DataFrame(feature_matrix, index=kept_index, columns=columns_all)
combined_df['temperature'] = temps
combined_df['thermal_expansion'] = y

Phases  13996  not found in structures
Phases  131156  not found in structures
Phases  130566  not found in structures
Phases  71777  not found in structures
Phases  71777  not found in structures
Phases  71776  not found in structures
Phases  71776  not found in structures
Phases  142888  not found in structures
Phases  127780  not found in structures
Phases  122321  not found in structures
Phases  59916  not found in structures
Phases  9782  not found in structures
Phases  12354  not found in structures
Phases  70111  not found in structures
Phases  70110  not found in structures
Phases  70180  not found in structures
Phases  70109  not found in structures
Phases  70109  not found in structures
Phases  77642  not found in structures
Phases  128280  not found in structures
Phases  70179  not found in structures
Phases  15966  not found in structures
Phases  15933  not found in structures
Phases  120633  not found in structures
Phases  120634  not found in structures
Phases  26534  not

## Find Correlation Coefficients for Features

In [None]:
# Columns shown in the correlation heat maps; the target, thermal expansion, goes last.
select_columns = [
    "site AtomicWeight_atom1",
    "site Electronegativity_diff",
    "site CovalentRadius_atom1",
    "AGNI eta=8.00e-01_atom1",
    "AGNI eta=6.80e+00_atom2",
    "G2_0.05_atom1",
    "coordination_number",
    "bond_length",
    "temperature",
    "thermal_expansion",
]

# For every feature, print its name followed by its correlation coefficient with the TEC.
for feature_name in columns_all:
    print(feature_name)
    feature_and_tec = combined_df[[feature_name, 'thermal_expansion']].to_numpy().T
    print(np.corrcoef(feature_and_tec)[0][1])


In [None]:
# Pearson correlation matrix of the selected columns, drawn as an annotated heat map.
pearson = np.corrcoef(combined_df[select_columns].to_numpy().T)
fig, ax = plt.subplots()
# Pin the colour scale to the full correlation range [-1, 1] so that this plot
# can be compared with other correlation heat maps.
im = ax.imshow(pearson, aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label="Correlation coefficient")
# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(select_columns)), labels=select_columns)
ax.set_yticks(np.arange(len(select_columns)), labels=select_columns)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Annotate every cell with its value. Use dark text on the bright half of the
# colour scale (e.g. the diagonal), where white text is unreadable.
for (i, j), value in np.ndenumerate(pearson):
    ax.text(j, i, "{:.2f}".format(value),
            ha="center", va="center",
            color="k" if im.norm(value) > 0.5 else "w")
ax.set_title("Pearson Correlation Matrix")
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs("figs", exist_ok=True)
fig.savefig("figs/"+"pearson.png")
plt.show()


In [None]:
# Spearman rank correlation matrix of the selected columns, drawn as an annotated heat map.
spearman = scipy.stats.spearmanr(combined_df[select_columns].to_numpy()).statistic
fig, ax = plt.subplots()
# Pin the colour scale to the full correlation range [-1, 1] so that this plot
# can be compared with other correlation heat maps.
im = ax.imshow(spearman, aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label="Correlation coefficient")
# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(select_columns)), labels=select_columns)
ax.set_yticks(np.arange(len(select_columns)), labels=select_columns)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Annotate every cell with its value. Use dark text on the bright half of the
# colour scale (e.g. the diagonal), where white text is unreadable.
for (i, j), value in np.ndenumerate(spearman):
    ax.text(j, i, "{:.2f}".format(value),
            ha="center", va="center",
            color="k" if im.norm(value) > 0.5 else "w")

ax.set_title("Spearman Correlation Matrix")
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs("figs", exist_ok=True)
fig.savefig("figs/"+"spearman.png")
plt.show()


## List Common Elements and Space Groups

## Violin Plots of TECs, Overall and for Different Subgroups