# Produce Feature Vectors

### Load Imports and Files

In [2]:
%run imports.py


__init__ is deprecated
MaterialsProjectCompatibility will be updated with new correction classes as well as new values of corrections and uncertainties in 2020



Imports successfully loaded


In [14]:
#%%script false --no-raise-error
# Masks keyed by material id string; entries are read later through
# `.data['total']` and `.structure`.
masks = loadfn('data/masks.json')

In [4]:
# ELFCAR objects keyed by material id string (looked up below as elfcars[str(id)]).
elfcars = loadfn('data/elfcars.json')

In [5]:
# All material id keys present in the ELFCAR collection.
ids = list(elfcars.keys())

In [6]:
# Presumably maps binary-compound formula -> material id (TODO confirm); only the
# values are used as the list of ids to process.
binary_formula_ids = loadfn('data/binary_formulas_ids.json')
binary_ids = list(binary_formula_ids.values())

### Prepare Element Statistics

In [7]:
# Mapping whose keys label the elemental references and whose values are the
# ids used (after str()) to look up the matching ELFCAR.
element_formulas_ids = loadfn('data/element_formulas_ids.json')

In [8]:
# Collect the ELFCAR of every elemental reference, keyed by its formula.
element_elfcars = {}

for f, ID in element_formulas_ids.items():
    element_elfcars[f] = elfcars[str(ID)]  # elfcars is keyed by id strings

In [9]:
# NOTE(review): this loop only rebinds `e` on each pass and stores nothing, so it
# does not affect the results computed below.
for f in element_elfcars.keys():
    e = element_elfcars[f]

In [10]:
# Per-element reference statistics: [mean, std, max] of the flattened 'total'
# grid returned by get_alpha() for each elemental ELFCAR.
element_statistics = {}

for f in element_elfcars.keys():
    spatial_data = element_elfcars[f].get_alpha().data['total'].flatten()
    element_statistics[f] = [np.mean(spatial_data), np.std(spatial_data), np.max(spatial_data)]

In [11]:
# Show the [mean, std, max] triple computed for each element.
pprint(element_statistics)

{'Ac': [2.3952880904166385, 0.9221624536523357, 4.186004617717708],
 'Ag': [4.373444566115248, 9.3601895754998, 179.08046335007322],
 'Al': [1.7857315750123286, 1.303596186758892, 21.19068202065356],
 'Ar': [10.08894643061993, 11.014270879716667, 65.85032868291142],
 'As': [2.927476565281586, 2.4780503446526985, 10.965066987160945],
 'Au': [3.4875908165023946, 3.5191039435277944, 45.239207511055575],
 'B': [2.4804683886419365, 2.2235692605077135, 22.52643126985591],
 'Ba': [2.9255046533821023, 1.5481167338698114, 6.574370690467469],
 'Be': [2.0735385961683694, 1.3075652461540779, 6.386318005187307],
 'Bi': [2.5172121868344575, 1.948098995393369, 11.225414565916763],
 'Br': [4.945517100257024, 4.845327657368386, 19.997851987445422],
 'C': [4.143758773426854, 4.089905490463662, 19.370301237493063],
 'Ca': [2.9976317340345755, 2.0591215822889666, 8.90733654534101],
 'Cd': [3.6707442427955344, 6.161494326827701, 128.98490061679868],
 'Ce': [2.3377263456297652, 0.9978052527804198, 8.2886055

In [13]:
# Persist the element reference statistics to disk.
dumpfn(element_statistics, 'data/element_statistics.json')

### Mask Application & Feature Vector Methods

In [16]:
def create_mask(mask, index):
    """Return a binary mask selecting the voxels labelled ``index``.

    Every value of ``mask`` is truncated toward zero and compared with
    ``index``; matching voxels become 1 and all others 0. The comparison is
    done with vectorised numpy operations instead of a Python loop over every
    voxel.

    Args:
        mask: numpy array of labels (any shape).
        index: integer label to select.

    Returns:
        A new array with the same shape and dtype as ``mask``. The input array
        is not modified.
    """
    # np.trunc reproduces the truncation that int() applied to each element.
    return (np.trunc(mask) == index).astype(mask.dtype)

def apply_mask(elfcar, mask):
    """Multiply an ELF grid by a mask defined on an equal or finer grid.

    The mask is subsampled with an integer stride along each axis so that its
    shape matches ``elfcar`` and is then multiplied element-wise.

    Args:
        elfcar: 3-D numpy array of ELF values.
        mask: 3-D numpy array whose size along every axis is a positive integer
            multiple of the corresponding ``elfcar`` size.

    Returns:
        The element-wise product, with the shape of ``elfcar``.

    Raises:
        ValueError: if either array is not 3-D, or if a mask dimension is not a
            positive integer multiple of the matching ``elfcar`` dimension.
    """
    ex, ey, ez = elfcar.shape
    mx, my, mz = mask.shape
    # Validate up front: a ratio below 1 would give a zero slice step and a
    # non-integer ratio would leave shapes that cannot be multiplied.
    for m_len, e_len in ((mx, ex), (my, ey), (mz, ez)):
        if e_len == 0 or m_len < e_len or m_len % e_len:
            raise ValueError(
                'mask shape {} is not a positive integer multiple of ELFCAR shape {}'.format(
                    mask.shape, elfcar.shape))
    mask_trimmed = mask[::mx // ex, ::my // ey, ::mz // ez]
    return elfcar * mask_trimmed

def strip_zeros(arr):
    """Return the non-zero entries of ``arr`` as a flat 1-D array.

    Uses boolean indexing instead of a Python-level loop over every element.
    Every entry equal to zero is removed, so values that are exactly zero in
    the data are dropped along with masked-out ones.

    Args:
        arr: numpy array of any shape.

    Returns:
        1-D array of the non-zero values in flattened order; empty if there are
        none.
    """
    flat = arr.flatten()
    return flat[flat != 0]

In [17]:
# Reduced formulas of materials for which some well had no non-zero ELF values.
potential_problems = []

def retrieve_objects_methods(i):
    """Look up the ELFCAR and mask for material ``i`` and build a per-well helper.

    Args:
        i: material id used as the key into ``elfcars`` and ``masks``.

    Returns:
        Tuple ``(composition, cation, anion, vectorize_well)``. ``cation`` and
        ``anion`` are the string forms of the first and second entry of
        ``composition.elements``.
        NOTE(review): this assumes exactly two elements and that the first one
        is the cation; confirm the ordering of ``composition.elements``.

    Raises:
        AssertionError: if the mask and ELFCAR structures differ in composition.
    """
    elfcar = elfcars[i].get_alpha()
    mask = masks[i]
    elfcar_data = elfcar.data['total']
    mask_data = mask.data['total']
    struct = mask.structure
    composition = struct.composition
    assert composition == elfcar.structure.composition, "ELFCAR & mask compositions aren't equal"
    cation, anion = tuple(str(element) for element in composition.elements)

    def vectorize_well(index):
        """Return ``([mean, std, max], is_cation)`` for the well of site ``index``.

        ``index`` is 1-based: the species comes from ``struct.sites[index - 1]``
        and the well is made of the mask voxels whose label equals ``index``.
        """
        species = str(struct.sites[index - 1].specie)
        #assert(mask.value_at(*struct.sites[index - 1].frac_coords) != 0.0) #Ensure atom index matches Bader well
        selected = apply_mask(elfcar_data, create_mask(mask_data, index))
        well_values = strip_zeros(selected)
        if not well_values.any():
            # Nothing left in this well: fall back to zeros and record the material.
            well_values = np.array([0.0, 0.0, 0.0])
            potential_problems.append(mask.structure.composition.reduced_formula)
        stats = [np.mean(well_values), np.std(well_values), np.max(well_values)]
        if species == cation:
            return stats, True
        if species == anion:
            return stats, False
        raise AssertionError('Current atom neither a cation nor an anion')

    return composition, cation, anion, vectorize_well

def well_statistics(i):
    """Collect the statistics of every well of material ``i``.

    Args:
        i: material id passed on to ``retrieve_objects_methods``.

    Returns:
        List with one single-entry dict per site, in site order, mapping the
        site's element (cation or anion) to its ``[mean, std, max]`` list.
    """
    composition, cation, anion, vectorize_well = retrieve_objects_methods(i)
    site_count = int(composition.num_atoms)
    well_data = []
    # Wells are numbered from 1 to the number of atoms.
    for site_number in range(1, site_count + 1):
        stats, is_cation = vectorize_well(site_number)
        element = cation if is_cation else anion
        well_data.append({element: stats})
    return well_data

In [20]:
#%%script false --no-raise-error

# Compute the well statistics of every binary compound, keyed by id string.
# The recorded progress output shows this loop running for over an hour.
all_bader_statistics = {}
for i in tqdm(binary_ids):
    i = str(i)  # elfcars and masks are looked up with string ids
    stats = well_statistics(i)
    all_bader_statistics[i] = stats


  0%|          | 0/427 [00:00<?, ?it/s][A
  0%|          | 2/427 [00:08<28:39,  4.05s/it][A
  1%|          | 3/427 [00:56<2:03:03, 17.41s/it][A
  1%|          | 4/427 [00:56<1:26:26, 12.26s/it][A
  1%|          | 5/427 [01:29<2:08:51, 18.32s/it][A
  1%|▏         | 6/427 [01:30<1:31:37, 13.06s/it][A
  2%|▏         | 7/427 [01:37<1:19:19, 11.33s/it][A
  2%|▏         | 8/427 [01:40<1:01:18,  8.78s/it][A
  2%|▏         | 9/427 [01:40<43:32,  6.25s/it]  [A
  2%|▏         | 10/427 [01:41<31:33,  4.54s/it][A
  3%|▎         | 11/427 [01:41<22:34,  3.26s/it][A
  3%|▎         | 12/427 [01:42<16:57,  2.45s/it][A
  3%|▎         | 13/427 [01:46<20:31,  2.97s/it][A
  3%|▎         | 14/427 [01:49<21:11,  3.08s/it][A
  4%|▎         | 15/427 [01:52<21:29,  3.13s/it][A
  4%|▎         | 16/427 [01:53<15:26,  2.25s/it][A
  4%|▍         | 17/427 [01:57<19:28,  2.85s/it][A
  4%|▍         | 18/427 [02:03<26:22,  3.87s/it][A
  4%|▍         | 19/427 [11:55<20:26:16, 180.34s/it][A
  5%|▍    

 37%|███▋      | 158/427 [33:17<29:30,  6.58s/it][A
 37%|███▋      | 159/427 [33:26<32:40,  7.31s/it][A
 37%|███▋      | 160/427 [33:33<32:56,  7.40s/it][A
 38%|███▊      | 161/427 [33:35<25:07,  5.67s/it][A
 38%|███▊      | 162/427 [33:36<18:38,  4.22s/it][A
 38%|███▊      | 163/427 [33:36<13:18,  3.02s/it][A
 38%|███▊      | 164/427 [33:36<09:57,  2.27s/it][A
 39%|███▊      | 165/427 [33:37<07:20,  1.68s/it][A
 39%|███▉      | 166/427 [33:37<05:58,  1.37s/it][A
 39%|███▉      | 167/427 [33:38<04:33,  1.05s/it][A
 39%|███▉      | 168/427 [33:38<03:26,  1.25it/s][A
 40%|███▉      | 169/427 [33:43<09:00,  2.10s/it][A
 40%|███▉      | 170/427 [33:46<10:09,  2.37s/it][A
 40%|████      | 171/427 [33:46<07:15,  1.70s/it][A
 40%|████      | 172/427 [33:52<12:53,  3.03s/it][A
 41%|████      | 173/427 [33:56<13:58,  3.30s/it][A
 41%|████      | 174/427 [34:12<29:40,  7.04s/it][A
 41%|████      | 175/427 [36:08<2:47:19, 39.84s/it][A
 41%|████      | 176/427 [38:47<5:16:08, 75.

 72%|███████▏  | 307/427 [1:22:41<23:59, 11.99s/it][A
 72%|███████▏  | 308/427 [1:22:54<24:38, 12.42s/it][A
 72%|███████▏  | 309/427 [1:22:55<17:31,  8.91s/it][A
 73%|███████▎  | 310/427 [1:22:55<12:25,  6.37s/it][A
 73%|███████▎  | 311/427 [1:23:59<45:41, 23.63s/it][A
 73%|███████▎  | 312/427 [1:24:00<31:57, 16.67s/it][A
 73%|███████▎  | 313/427 [1:24:04<24:33, 12.93s/it][A
 74%|███████▎  | 314/427 [1:24:05<17:36,  9.35s/it][A
 74%|███████▍  | 315/427 [1:24:06<12:51,  6.89s/it][A
 74%|███████▍  | 316/427 [1:24:16<14:43,  7.96s/it][A
 74%|███████▍  | 317/427 [1:24:18<11:06,  6.06s/it][A
 74%|███████▍  | 318/427 [1:24:18<07:48,  4.30s/it][A
 75%|███████▍  | 319/427 [1:24:29<11:21,  6.31s/it][A
 75%|███████▍  | 320/427 [1:24:30<08:06,  4.55s/it][A
 75%|███████▌  | 321/427 [1:24:31<06:14,  3.53s/it][A
 75%|███████▌  | 322/427 [1:24:37<07:49,  4.47s/it][A
 76%|███████▌  | 323/427 [1:24:40<06:48,  3.93s/it][A
 76%|███████▌  | 324/427 [1:24:45<07:04,  4.12s/it][A
 76%|█████

In [21]:
#%%script false --no-raise-error

# Persist the computed well statistics so they can be reloaded without recomputing.
dumpfn(all_bader_statistics, 'data/bader_statistics.json')

In [15]:
# Reload the well statistics from disk; feature_vector reads them by id string.
bader_statistics = loadfn('data/bader_statistics.json')

In [16]:
def zipper(l1, l2):
    """Yield items from ``l1`` and ``l2`` alternately, starting with ``l1``.

    Once the shorter input is exhausted, the remaining items of the longer one
    are yielded in order. Any iterables are accepted, including generators
    that do not support ``len()``.

    Args:
        l1: first iterable.
        l2: second iterable.

    Yields:
        Items in the order l1[0], l2[0], l1[1], l2[1], ...
    """
    from itertools import zip_longest

    # Unique sentinel so that None or other falsy items are still yielded.
    exhausted = object()
    for first, second in zip_longest(l1, l2, fillvalue=exhausted):
        if first is not exhausted:
            yield first
        if second is not exhausted:
            yield second
        
def zip_lists(l1, l2):
    """Return the items produced by ``zipper(l1, l2)`` as a list."""
    return list(zipper(l1, l2))


def flatten_list(l):
    """Concatenate the items of every element of ``l`` into one flat list."""
    return [item for group in l for item in group]

def trim_vector(vector, length, ndescriptors):
    """Return ``vector`` cut or padded to exactly ``length`` entries.

    A longer vector is truncated. A shorter one is padded by cycling through
    its own entries from the start, so a vector made of whole
    ``ndescriptors``-sized groups has those groups repeated in their original
    order. The input is never modified in place, and the result never exceeds
    ``length``.

    Args:
        vector: sequence of values to trim or pad.
        length: required number of entries.
        ndescriptors: number of descriptors per group. It is not needed for the
            cyclic padding and is kept so that existing calls keep working.

    Returns:
        ``vector[:length]`` when the input is too long, ``vector`` itself when
        it already has ``length`` entries, otherwise a new list of ``length``
        entries.

    Raises:
        ValueError: if ``vector`` is empty but needs padding, since there is
            nothing to repeat.
    """
    from itertools import cycle, islice

    current = len(vector)
    if current > length:
        return vector[:length]
    if current == length:
        return vector
    if current == 0:
        raise ValueError('Cannot pad an empty vector to length {}'.format(length))
    return list(islice(cycle(vector), length))

#### Feature Vector Specifications:
- 12 Bader wells are used to construct each vector
- Zipper fashion: cation, anion, cation, anion, ...
- Well statistics are offset by element statistics
    - v_i = Bader_stat - elem_stat
- For each well: \[mean - e_mean, std - e_std, max - e_max\]
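- Well block length: 36 = 12 * 3
- If there are more than 12 Bader wells, only the first 36 entries are kept
- If there are fewer than 12 Bader wells, the existing well data is repeated in order until the well block has 36 entries
- Three summary features are appended after the well block, one per descriptor: (mean cation offset)^2 - (mean anion offset)^2, giving a total length of 39 = 36 + 3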

In [17]:
# Material ids (as strings) for which feature_vector returns None.
skips = ['7356']

def add_general_features(vector, cation, anion, nbader_wells):
    """Placeholder with no implementation; currently does nothing and returns None."""
    pass

def feature_vector(i, num_wells=12):
    """Build the feature vector of the material with id ``i``.

    Each well's ``[mean, std, max]`` statistics are offset by the statistics
    of the corresponding element, the cation and anion wells are interleaved
    (cation first), and the result is trimmed or padded to ``num_wells`` wells.
    One summary value per descriptor is appended at the end.

    Args:
        i: material id; converted to ``str`` for all lookups.
        num_wells: number of wells represented in the vector.

    Returns:
        1-D numpy array holding ``num_wells`` wells followed by one summary
        value per descriptor, or ``None`` when ``i`` is listed in ``skips``.

    Raises:
        ValueError: if a well belongs to an element that is neither the cation
            nor the anion.
    """
    i = str(i)
    if i in skips:
        return None
    # NOTE(review): assumes a binary compound whose first element is the cation.
    cation, anion = [str(e) for e in elfcars[i].structure.composition.elements]
    cation_vectors, anion_vectors = [], []
    for well in bader_statistics[i]:
        # Each well is a single-entry dict {element: [mean, std, max]}.
        f = next(iter(well))
        vi = np.array(well[f]) - np.array(element_statistics[f])
        if f == cation:
            cation_vectors.append(vi)
        elif f == anion:
            anion_vectors.append(vi)
        else:
            raise ValueError('Atom does not match either cation or anion')
    ndescriptors = len(cation_vectors[0])
    length = num_wells * ndescriptors
    interleaved = flatten_list(zip_lists(cation_vectors, anion_vectors))
    vector = np.array(trim_vector(interleaved, length, ndescriptors))
    # NOTE(review): despite the name, this is a difference of squared means,
    # not a squared difference. Left as is so previously saved features stay
    # comparable; confirm which one is intended.
    diffsquared_cat_an = np.mean(cation_vectors, axis=0)**2 - np.mean(anion_vectors, axis=0)**2
    return np.append(vector, diffsquared_cat_an)

In [18]:
# Inspect one example as rows of three values; the last row holds the appended summary features.
feature_vector(6405).reshape((-1,3))

array([[ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [-2.80127873e+04, -1.07204677e+05, -9.23136772e+06]])

### Produce Feature Vectors & Stack Into Matrix

In [19]:
# Build one feature vector per binary id; ids listed in `skips` yield None and are left out.
vector_list = []
for i in tqdm(binary_ids):
    new_vec = feature_vector(i)
    if new_vec is not None:
        vector_list.append(new_vec)

100%|██████████| 427/427 [00:00<00:00, 3347.45it/s]


In [20]:
# Stack the vectors into the feature matrix, one row per material.
A = np.vstack(vector_list)
print(A.shape)

(426, 39)


In [27]:
# Fit the scaler on A and apply it to the same matrix.
# NOTE(review): StandardScaler comes from imports.py, presumably scikit-learn's; confirm.
scaler = StandardScaler()
scaler.fit(A)
A_scaled = scaler.transform(A)

In [32]:
# Save both the raw and the scaled feature matrices.
np.save('data/A.npy', A)
np.save('data/A_scaled.npy', A_scaled)

### Load Error Data and Make Target Vector

In [83]:
# Absolute errors, looked up below by reduced formula.
abs_error_data = loadfn('data/absolute_errors.json')

In [84]:
# Remove the material that feature_vector skips so that the targets line up
# with the rows of A. This id must stay in sync with `skips`.
try:
    binary_ids.remove(7356)
except ValueError:
    pass  # id not in the list, e.g. because this cell was already run

# Target vector: one absolute error per remaining binary id, in the same order.
b, drops = [], []
for x, i in enumerate(binary_ids):
    i = str(i)
    f = elfcars[i].structure.composition.reduced_formula
    try:
        b.append(abs_error_data[f])
    except KeyError:
        # Only a missing formula counts as a drop; any other error should surface.
        drops.append((x, f))
assert not drops, 'No absolute error found for: {}'.format(drops)
print(len(b))

426


In [86]:
# Save the target vector.
np.save('data/b.npy', b)