In [50]:
import glob
import os
import pickle as pkl
import re
import sys

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [51]:
# Read the pickled NREL ABX3 database from the datasets directory.
# NOTE: unpickling runs arbitrary code from the file, so only load a trusted copy.
with open("../../datasets/nrel_abx3_db.pkl", "rb") as db_file:
    nrel_db = pkl.load(db_file)

In [52]:
# Collect every database key whose label is not empty.
features = [label for label in nrel_db.keys() if len(label) > 0]

# Drop the first label and the last six labels from the feature list.
relevant_features = features[1:-6]

print(f"N Features: {len(features)}")
print(f"N Relevant Features: {len(relevant_features)}")

# Display the retained labels as the cell output.
relevant_features

N Features: 30
N Relevant Features: 23


['sorted formula     sorted chemical formula',
 'ΔHf (eV/atom)     final enthalpy, eV/atom',
 'Stb     qhull stability',
 'ΔHGS enthalpy wrt ground state',
 'ΔHDecomp enthalpy wrt decomp',
 'ΔEGS energyperatom wrt ground state',
 'ICSD SG     ICSD space group number',
 'final SG     final space group number',
 'Etot (eV/Atom)     total energy, per atom, eV/Atom',
 'minID     id having min energy for this formula',
 'ΔEcbm     gwvd: Ecbm - parentEcbm: difference in conduction band minimum energy, eV',
 'ΔEvbm     gwvd: Evbm - parentEvbm: difference in valence band maximum energy, eV',
 'εe(IP)     static electronic dielectric constant (independent particle)',
 'εe(TDDFT)     static electronic dielectric constant (TD-DFT)',
 'εe(post-lopt)     static electronic dielectric constant (from post-lopt)',
 'Eg (eV)     bandgap, eV',
 'Eg,d (eV)     direct bandgap, eV',
 'me*/m0 DOS effective mass for electrons, T=1000K',
 'mh*/m0 DOS effective mass for holes, T=1000K',
 'netCharge     net char

In [53]:
# Replace every match of the empty regex pattern with NaN and rebind `nrel_db`.
# NOTE(review): presumably this is meant to turn empty-string cells into missing
# values so the `notna()` counts below reflect populated entries — confirm, since
# how pandas treats an empty regex pattern is not visible from this notebook.
nrel_db = nrel_db.replace(regex='', value=np.nan)

In [54]:
# Keep only the columns whose label is not the empty string.
keep_mask = nrel_db.columns != ''
nrel_clean = nrel_db[nrel_db.columns[keep_mask]]

In [55]:
def _is_genuine_bool(column):
    """Return True when ``column`` has bool dtype or holds only real booleans."""
    if column.dtype == bool:
        return True
    if len(column) == 0:
        return False
    return bool(column.map(lambda value: isinstance(value, (bool, np.bool_))).all())


def _try_lossless_cast(column, target):
    """Cast ``column`` to ``target``; return None if the cast fails or alters values."""
    try:
        if target is bool:
            # ``astype(bool)`` accepts almost anything (any non-empty string is
            # truthy), so it is only allowed for columns that already hold booleans.
            return column.astype(bool) if _is_genuine_bool(column) else None
        as_float = column.astype(float)
        if target is float:
            return as_float
        as_int = column.astype(int)
    except (ValueError, TypeError, OverflowError):
        return None
    # ``astype(int)`` silently truncates fractions; reject the cast when it does.
    if np.array_equal(as_int.to_numpy(), as_float.to_numpy()):
        return as_int
    return None


def cast_df(df, debug=False):
    """Return a new frame with each column of ``df`` cast to a concrete type.

    Every column is tried, in order, as ``bool`` (only when it already holds
    genuine booleans), ``int`` (only when no fractional part would be lost) and
    ``float``. The first cast that succeeds is kept; columns that fit none of
    these are converted with ``astype(str)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is not modified.
    debug : bool, optional
        When True, print each column label and the type it was cast to.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index and column labels as ``df``.
    """
    converted = []
    # Columns are read by position so that repeated labels still yield one
    # Series per column.
    for position, col_label in enumerate(df.columns):
        if debug:
            print(col_label)
        column = df.iloc[:, position]
        for t in (bool, int, float):
            casted_col = _try_lossless_cast(column, t)
            if casted_col is not None:
                if debug:
                    print(f"col {col_label} cast as {t}")
                break
        else:
            # No numeric or boolean interpretation fits: keep the text form.
            casted_col = column.astype(str)
        converted.append(casted_col)

    if not converted:
        return pd.DataFrame(index=df.index)

    # Assemble once instead of inserting columns one at a time.
    final_df = pd.concat(converted, axis=1)
    final_df.columns = df.columns
    return final_df

In [63]:
# Persist the cleaned frame next to the notebook so later sessions can reuse it.
with open("nrel_clean.pkl", "wb") as out_file:
    pkl.dump(nrel_clean, out_file)

In [67]:
# Select the independent-particle dielectric constant column and count the
# rows that actually carry a value.
static_ep_label = 'εe(IP)     static electronic dielectric constant (independent particle)'
static_ep = nrel_clean[static_ep_label]
len(static_ep.dropna())

47

In [70]:
# Same count for the TD-DFT dielectric constant column.
static_ep_tddft = nrel_clean['εe(TDDFT)     static electronic dielectric constant (TD-DFT)']
len(static_ep_tddft.dropna())


47

In [71]:
# Count the rows that report an electron DOS effective mass.
eff_mass = nrel_clean['me*/m0 DOS effective mass for electrons, T=1000K']
len(eff_mass.dropna())

1174

In [74]:
# Map each column label to the index labels of its non-missing rows.
nonzero_idxs = {}
for k, x in nrel_clean.items():
    nonzero_idxs[k] = x.dropna().index

In [76]:
# Number of populated rows per column, shown as a Series for easy reading.
nz_lens = {label: len(row_index) for label, row_index in nonzero_idxs.items()}
pd.Series(nz_lens)

id     MatDB ID                                                                        1639
sorted formula     sorted chemical formula                                             1639
ΔHf (eV/atom)     final enthalpy, eV/atom                                               991
Stb     qhull stability                                                                 991
ΔHGS enthalpy wrt ground state                                                          991
ΔHDecomp enthalpy wrt decomp                                                            991
ΔEGS energyperatom wrt ground state                                                     991
ICSD SG     ICSD space group number                                                    1318
final SG     final space group number                                                  1639
Etot (eV/Atom)     total energy, per atom, eV/Atom                                     1592
minID     id having min energy for this formula                                 