In [9]:
import glob
import os
import pickle as pkl
import re
import sys

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Load the pickled object stored in "nrel_clean.pkl" into `nrel_db`.
# Other cells read `nrel_db.columns` and call `.loc` / `.notna()` on it, i.e.
# they treat it as a pandas DataFrame.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this file if it comes from a trusted source.
with open("nrel_clean.pkl", "rb") as f:
    nrel_db = pkl.load(f)

In [4]:
# Candidate feature names: every key of nrel_db whose label is non-empty.
features = [name for name in nrel_db.keys() if len(name) > 0]
# Positional trim: skip the first label and the last six.
relevant_features = features[1:-6]
# Report both counts, one per line.
print(
    f"N Features: {len(features)}",
    f"N Relevant Features: {len(relevant_features)}",
    sep="\n",
)
relevant_features

N Features: 30
N Relevant Features: 23


['sorted formula     sorted chemical formula',
 'ΔHf (eV/atom)     final enthalpy, eV/atom',
 'Stb     qhull stability',
 'ΔHGS enthalpy wrt ground state',
 'ΔHDecomp enthalpy wrt decomp',
 'ΔEGS energyperatom wrt ground state',
 'ICSD SG     ICSD space group number',
 'final SG     final space group number',
 'Etot (eV/Atom)     total energy, per atom, eV/Atom',
 'minID     id having min energy for this formula',
 'ΔEcbm     gwvd: Ecbm - parentEcbm: difference in conduction band minimum energy, eV',
 'ΔEvbm     gwvd: Evbm - parentEvbm: difference in valence band maximum energy, eV',
 'εe(IP)     static electronic dielectric constant (independent particle)',
 'εe(TDDFT)     static electronic dielectric constant (TD-DFT)',
 'εe(post-lopt)     static electronic dielectric constant (from post-lopt)',
 'Eg (eV)     bandgap, eV',
 'Eg,d (eV)     direct bandgap, eV',
 'me*/m0 DOS effective mass for electrons, T=1000K',
 'mh*/m0 DOS effective mass for holes, T=1000K',
 'netCharge     net char

In [7]:
# Map each column label to the row labels at which that column holds a value.
# Despite the name, the criterion is "not NaN" (notna), not "non-zero".
nonzero_idxs = {
    column: nrel_db[column].loc[nrel_db[column].notna()].index
    for column in nrel_db.columns
}

In [8]:
# Number of populated rows per column, taken from the index lists collected in
# nonzero_idxs, shown as a Series for a compact per-column overview.
nz_lens = {column: len(rows) for column, rows in nonzero_idxs.items()}
pd.Series(nz_lens)

id     MatDB ID                                                                        1639
sorted formula     sorted chemical formula                                             1639
ΔHf (eV/atom)     final enthalpy, eV/atom                                               991
Stb     qhull stability                                                                 991
ΔHGS enthalpy wrt ground state                                                          991
ΔHDecomp enthalpy wrt decomp                                                            991
ΔEGS energyperatom wrt ground state                                                     991
ICSD SG     ICSD space group number                                                    1318
final SG     final space group number                                                  1639
Etot (eV/Atom)     total energy, per atom, eV/Atom                                     1592
minID     id having min energy for this formula                                 

In [45]:
# Keep only the rows that have a TD-DFT dielectric constant ...
has_tddft_eps = nrel_db['εe(TDDFT)     static electronic dielectric constant (TD-DFT)'].notna()
nrel_preusable = nrel_db[has_tddft_eps]
# ... then flag, per column, whether it is populated in every remaining row.
useable_columns = nrel_preusable.notna().all(axis=0)

In [60]:
# Retain only the columns flagged as fully populated in useable_columns.
fully_populated = useable_columns.values
nrel_usable = nrel_preusable.loc[:, fully_populated]
# Positional trim of the surviving columns: skip the first and the last four.
relevant_columns = nrel_usable.columns[1:-4]
nrel_relevant = nrel_usable[relevant_columns]
# Number of columns that remain.
nrel_relevant.shape[1]

12

In [92]:
# Display the 'id     MatDB ID' column for the rows kept in nrel_usable.
# The recorded output reports dtype object for this column.
nrel_usable['id     MatDB ID']

27      289692
33      289828
59      289861
136     289698
142     289684
151     289615
204     289620
214     286928
250      10465
256      10892
267     286933
272     286938
282     286943
294     289632
325     289652
341     289823
346      12401
348     289702
354     289708
360     289723
362      11636
369     290076
378     290291
470     289999
471     290087
486     289815
497     289757
534     289749
543     289780
607     289994
611     290176
720     289958
725     290106
909      12364
915      11769
1268     12181
1272     11559
1319    289890
1356    290067
1375    290046
1403     11295
1404     11392
1510    290197
1538    290226
1586    290229
1590     10842
1635    290213
Name: id     MatDB ID, dtype: object

In [67]:
# Display the trimmed table (formula, space group, band-edge shifts,
# dielectric constants, band gaps, effective masses, net charge, spin count).
nrel_relevant

Unnamed: 0,sorted formula sorted chemical formula,final SG final space group number,"ΔEcbm gwvd: Ecbm - parentEcbm: difference in conduction band minimum energy, eV","ΔEvbm gwvd: Evbm - parentEvbm: difference in valence band maximum energy, eV",εe(IP) static electronic dielectric constant (independent particle),εe(TDDFT) static electronic dielectric constant (TD-DFT),"Eg (eV) bandgap, eV","Eg,d (eV) direct bandgap, eV","me*/m0 DOS effective mass for electrons, T=1000K","mh*/m0 DOS effective mass for holes, T=1000K",netCharge net charge,numSpin number of spin values: 1(non-polarized) or 2(polarized)
27,Al Bi O3,161,0.417,-0.634,5.357,5.101,3.976,4.117,0.929,7.501,0.0,1
33,Al Ce O3,72,0.562,-0.681,3.82,3.738,3.059,3.059,0.386,12.433,0.0,2
59,Al La O3,167,0.631,-1.146,3.746,3.666,5.671,5.698,0.644,7.397,0.0,1
136,Ba3 Bi N,194,0.184,-0.346,10.338,9.54,1.201,1.201,1.234,0.373,0.0,1
142,Ba3 N Sb,194,0.21,-0.264,9.606,8.903,1.143,1.143,0.992,0.387,0.0,1
151,Ba Ce O3,62,2.116,-0.446,4.316,4.169,5.092,5.093,29.93,8.265,0.0,1
204,Ba Mn O3,194,0.996,-0.607,4.889,4.305,4.098,4.098,7.604,6.632,0.0,2
214,Ba Ni O3,194,1.205,-0.227,6.097,5.05,2.906,2.967,7.287,5.543,0.0,1
250,Ba O3 Sn,221,0.917,-1.05,3.917,3.827,2.979,3.485,0.339,4.888,0.0,1
256,Ba O3 Ti,99,0.081,-0.523,5.376,5.212,2.697,2.963,1.503,3.635,0.0,1


In [73]:
# Hydrogen-like estimate assembled from nrel_relevant columns:
#     ebes = Eg - Ry * eps**-2 * mu,    mu = me*mh / (me + mh)
# with eps the TD-DFT dielectric constant and me/mh the DOS effective masses.
# NOTE(review): `direct_band_gap` is read from the 'Eg (eV)     bandgap, eV'
# column, not from 'Eg,d (eV)     direct bandgap, eV' — confirm which gap is
# intended before relying on the name.
# NOTE(review): the recorded output contains a negative value (row 611);
# presumably the estimate is not reliable for that entry — verify.
def _float_column(label):
    """Return the nrel_relevant column `label` cast to float."""
    return nrel_relevant[label].astype(float)


# Rydberg energy in eV; the trailing "(26)" in the original note is presumably
# the uncertainty of the last two digits — TODO confirm the source.
rydberg_const = 13.605693122994

me = _float_column('me*/m0 DOS effective mass for electrons, T=1000K')
mh = _float_column('mh*/m0 DOS effective mass for holes, T=1000K')
reduced_mass = me*mh/(me+mh)

inverse_square_eps = _float_column(
    'εe(TDDFT)     static electronic dielectric constant (TD-DFT)') ** -2
direct_band_gap = _float_column('Eg (eV)     bandgap, eV')

ebes = direct_band_gap - rydberg_const*inverse_square_eps*reduced_mass
ebes

27      3.543768
33      2.694455
59      5.071254
136     1.158181
142     1.095213
151     0.022094
204     1.497403
214     1.226404
250     2.684502
256     2.164425
267     1.937370
272     1.858228
282     4.200903
294     1.734616
325     2.254889
341     1.544480
346     1.175776
348     1.837260
354     0.976705
360     2.959921
362     2.688578
369     1.173696
378     0.720416
470     5.343578
471     3.517129
486     1.508669
497     0.980362
534     4.335836
543     2.753603
607     0.833486
611    -0.107334
720     5.563031
725     3.712834
909     0.991508
915     0.381933
1268    2.192362
1272    1.547854
1319    1.892272
1356    1.193997
1375    0.209838
1403    1.157684
1404    1.355180
1510    2.450560
1538    0.700199
1586    3.731389
1590    2.277210
1635    2.506527
dtype: float64

In [90]:
# Parse each sorted chemical formula (e.g. "Ba3 Bi N") into a list of
# (element symbol, count) pairs; an element without an explicit count gets 1.
#
# The pattern is a raw string so that `\d` reaches the regex engine as a digit
# class instead of being an invalid Python string escape (which warns on
# recent Python versions). `\d*` accepts counts of any length; the previous
# `\d{0,2}` silently truncated counts with three or more digits.
compositions = nrel_relevant[
    'sorted formula     sorted chemical formula'].str.findall(r'((?P<el>[A-Z][a-z]?)(?P<n>\d*))')

# Each findall hit is a (whole match, element, count-text) tuple; only the
# element and the numeric count are kept. Keys are the original row labels.
parsed_comps = pd.Series({
    row_label: [
        (element, int(count_text) if count_text else 1)
        for _whole, element, count_text in matches
    ]
    for row_label, matches in compositions.items()
})
parsed_comps

27       [(Al, 1), (Bi, 1), (O, 3)]
33       [(Al, 1), (Ce, 1), (O, 3)]
59       [(Al, 1), (La, 1), (O, 3)]
136      [(Ba, 3), (Bi, 1), (N, 1)]
142      [(Ba, 3), (N, 1), (Sb, 1)]
151      [(Ba, 1), (Ce, 1), (O, 3)]
204      [(Ba, 1), (Mn, 1), (O, 3)]
214      [(Ba, 1), (Ni, 1), (O, 3)]
250      [(Ba, 1), (O, 3), (Sn, 1)]
256      [(Ba, 1), (O, 3), (Ti, 1)]
267      [(Ba, 1), (O, 3), (Ti, 1)]
272      [(Ba, 1), (O, 3), (Ti, 1)]
282      [(Ba, 1), (O, 3), (Zr, 1)]
294      [(Ba, 1), (S, 3), (Sn, 1)]
325      [(Ba, 1), (S, 3), (Zr, 1)]
341      [(Bi, 1), (Ca, 3), (N, 1)]
346      [(Bi, 1), (Co, 1), (O, 3)]
348      [(Bi, 1), (Cr, 1), (O, 3)]
354      [(Bi, 1), (Fe, 1), (O, 3)]
360      [(Bi, 1), (Ga, 1), (O, 3)]
362      [(Bi, 1), (In, 1), (O, 3)]
369      [(Bi, 1), (Mg, 3), (N, 1)]
378      [(Bi, 1), (N, 1), (Sr, 3)]
470      [(Br, 1), (Li, 3), (O, 1)]
471      [(Br, 1), (Na, 3), (O, 1)]
486      [(Ca, 3), (N, 1), (Sb, 1)]
497      [(Ca, 1), (Ce, 1), (O, 3)]
534      [(Ca, 1), (O, 3), (

In [11]:
from sklearn import linear_model

In [None]:
# Create a LassoCV estimator with its default settings.
reg = linear_model.LassoCV()
# NOTE(review): fit() is called with no arguments. scikit-learn's
# LassoCV.fit requires the training data (X, y), so this call raises
# TypeError as written. TODO: pass the feature matrix and target here.
reg.fit()