# Produce Feature Vectors

### Load Imports and Files

In [25]:
%run imports.py

Imports successfully loaded


In [2]:
%%script false --no-raise-error
# NOTE: the `%%script false` magic on the line above disables this cell, so `masks`
# is not loaded during a normal run. In the visible notebook it is only read by
# retrieve_objects_methods(), whose driver cell is disabled in the same way.
masks = loadfn('data/masks.json')

In [3]:
# Mapping of id string -> ELFCAR object; entries are used below through
# .get_alpha() and .structure (presumably pymatgen Elfcar objects -- TODO confirm).
elfcars = loadfn('data/elfcars.json')

In [4]:
# All ids for which an ELFCAR is available, in the mapping's iteration order.
ids = list(elfcars.keys())

In [5]:
# Only the values (ids) of the mapping are kept; the keys are presumably the
# formulas of the binary compounds -- verify against the JSON file.
binary_formula_ids = loadfn('data/binary_formulas_ids.json')
binary_ids = list(binary_formula_ids.values())

### Prepare Element Statistics

In [6]:
# Mapping of element formula -> id of that element's entry in `elfcars`
# (the ids are converted with str() before the lookup below).
element_formulas_ids = loadfn('data/element_formulas_ids.json')

In [7]:
# ELFCAR of each pure element, keyed by the element's formula. The ids are
# converted to str because `elfcars` is keyed by strings.
element_elfcars = {
    formula: elfcars[str(entry_id)]
    for formula, entry_id in element_formulas_ids.items()
}

In [8]:
# NOTE(review): this loop only rebinds `e` to each element ELFCAR in turn and has
# no other effect; nothing else in the visible notebook reads `e`, so it looks
# like leftover exploration code.
for f in element_elfcars.keys():
    e = element_elfcars[f]

In [9]:
# Reference statistics [mean, std, max] of the flattened 'total' alpha grid of
# each pure element; feature_vector() subtracts these from the well statistics.
element_statistics = {}

for f, element_elfcar in element_elfcars.items():
    spatial_data = element_elfcar.get_alpha().data['total'].flatten()
    element_statistics[f] = [
        statistic(spatial_data) for statistic in (np.mean, np.std, np.max)
    ]

In [10]:
pprint(element_statistics)

{'Ac': [2.3952880904166385, 0.9221624536523357, 4.186004617717708],
 'Ag': [4.373444566115248, 9.3601895754998, 179.08046335007322],
 'Al': [1.7857315750123286, 1.303596186758892, 21.19068202065356],
 'Ar': [10.08894643061993, 11.014270879716667, 65.85032868291142],
 'As': [2.927476565281586, 2.4780503446526985, 10.965066987160945],
 'Au': [3.4875908165023946, 3.5191039435277944, 45.239207511055575],
 'B': [2.4804683886419365, 2.2235692605077135, 22.52643126985591],
 'Ba': [2.9255046533821023, 1.5481167338698114, 6.574370690467469],
 'Be': [2.0735385961683694, 1.3075652461540779, 6.386318005187307],
 'Bi': [2.5172121868344575, 1.948098995393369, 11.225414565916763],
 'Br': [4.945517100257024, 4.845327657368386, 19.997851987445422],
 'C': [4.143758773426854, 4.089905490463662, 19.370301237493063],
 'Ca': [2.9976317340345755, 2.0591215822889666, 8.90733654534101],
 'Cd': [3.6707442427955344, 6.161494326827701, 128.98490061679868],
 'Ce': [2.3377263456297652, 0.9978052527804198, 8.2886055

### Mask Application & Feature Vector Methods

In [11]:
def create_mask(mask, index):
    """Return a 0/1 mask selecting the grid points labelled ``index``.

    Every value is truncated toward zero before the comparison, which matches
    the ``int(value) == index`` test of the earlier element-by-element loop
    while running as a single vectorised operation.

    Args:
        mask: NumPy array of labels, of any shape.
        index: Label to select.

    Returns:
        A new array with the shape and dtype of ``mask`` that holds 1 where
        the truncated label equals ``index`` and 0 elsewhere. ``mask`` itself
        is not modified. Non-finite labels never match (the loop version
        raised on them).
    """
    selected = np.trunc(mask) == index
    # Cast back so the result keeps the input dtype, as the loop version did.
    return selected.astype(mask.dtype)

def apply_mask(elfcar, mask):
    """Multiply a 3-D grid by a mask subsampled onto the same grid.

    Along each axis the mask is sampled with a stride of
    ``int(mask_size / elfcar_size)`` so that a finer mask can be laid over the
    coarser ``elfcar`` grid.

    Args:
        elfcar: 3-D NumPy array of grid values.
        mask: 3-D NumPy array; the subsampled mask must have a shape that
            multiplies with ``elfcar`` (e.g. each mask axis an integer
            multiple of the matching ``elfcar`` axis).

    Returns:
        The element-wise product of ``elfcar`` and the subsampled mask.
    """
    ex, ey, ez = elfcar.shape
    mx, my, mz = mask.shape
    axis_sizes = ((mx, ex), (my, ey), (mz, ez))
    strides = tuple(
        slice(None, None, int(mask_size / grid_size))
        for mask_size, grid_size in axis_sizes
    )
    return elfcar * mask[strides]

def strip_zeros(arr):
    """Return the non-zero entries of ``arr`` as a flat 1-D array.

    Args:
        arr: NumPy array of any shape.

    Returns:
        A new 1-D array with every element of ``arr`` that is not equal to
        zero, in flattened (row-major) order. The result is empty when
        ``arr`` contains only zeros.
    """
    flat = arr.flatten()
    # Boolean indexing filters in one vectorised pass instead of a Python loop.
    return flat[flat != 0]

In [12]:
# Reduced formulas of materials for which at least one well contained no
# non-zero values after masking; appended to by vectorize_well().
potential_problems = []

def retrieve_objects_methods(i):
    """Collect the data needed to compute per-well statistics for material ``i``.

    Args:
        i: Key into the global ``elfcars`` and ``masks`` mappings.

    Returns:
        A tuple ``(composition, cation, anion, vectorize_well)`` where
        ``composition`` is the composition of the mask's structure,
        ``cation`` and ``anion`` are the string forms of its first and second
        element, and ``vectorize_well`` is a closure described below.

    Raises:
        AssertionError: If the ELFCAR and mask compositions differ.
        ValueError: If the composition does not have exactly two elements
            (raised by the tuple unpacking).
    """
    elfcar, mask = elfcars[i].get_alpha(), masks[i]
    elfcar_data, mask_data = elfcar.data['total'], mask.data['total']
    struct = mask.structure
    composition = struct.composition
    # Raised explicitly instead of using `assert`, which is stripped under -O.
    if not composition == elfcar.structure.composition:
        raise AssertionError('ELFCAR & mask compositions aren\'t equal')
    # Which element is called "cation" depends purely on the order of
    # composition.elements -- presumably cation first; verify against the data.
    cation, anion = [str(e) for e in composition.elements]

    def vectorize_well(index):
        """Return ``([mean, std, max], is_cation)`` for the well labelled ``index``.

        ``index`` is 1-based: the species is read from ``struct.sites[index - 1]``
        and the well is taken to be the grid points labelled ``index`` in the
        mask. When the well has no non-zero values the statistics are all 0.0
        and the material's reduced formula is recorded in
        ``potential_problems``.

        Raises:
            AssertionError: If the site's species is neither element of the
                composition.
        """
        f = str(struct.sites[index - 1].specie)
        # NOTE: the original author disabled a sanity check here that asserted
        # mask.value_at(*site.frac_coords) != 0.0, i.e. that the atom index
        # matches its well.
        well_values = strip_zeros(apply_mask(elfcar_data, create_mask(mask_data, index)))
        # strip_zeros() leaves only non-zero entries, so an empty array is the
        # only way a well can carry no signal.
        if well_values.size == 0:
            well_values = np.array([0.0, 0.0, 0.0])
            potential_problems.append(composition.reduced_formula)
        vi = [np.mean(well_values), np.std(well_values), np.max(well_values)]
        if f == cation:
            return vi, True
        if f == anion:
            return vi, False
        raise AssertionError('Current atom neither a cation nor an anion')

    return composition, cation, anion, vectorize_well

def well_statistics(i):
    """Compute the statistics of every well of material ``i``.

    Args:
        i: Material key, passed on to ``retrieve_objects_methods``.

    Returns:
        A list with one single-entry dict per atom, in site order (indices 1
        to ``composition.num_atoms``). Each dict maps the atom's element
        string to that well's ``[mean, std, max]``.
    """
    composition, cation, anion, vectorize_well = retrieve_objects_methods(i)
    atom_count = int(composition.num_atoms)
    records = []
    for site_number in range(1, atom_count + 1):
        stats, is_cation = vectorize_well(site_number)
        element = cation if is_cation else anion
        records.append({element: stats})
    return records

In [13]:
%%script false --no-raise-error

# Per-well statistics of every material, keyed by id (tqdm shows progress).
all_bader_statistics = {i: well_statistics(i) for i in tqdm(ids)}

In [14]:
%%script false --no-raise-error

# Cache the computed statistics on disk so later runs can load them directly.
dumpfn(all_bader_statistics, 'data/bader_statistics.json')

In [15]:
# Load the cached per-well statistics; feature_vector() indexes this by the
# string id and expects a list of single-entry {element: [mean, std, max]} dicts.
bader_statistics = loadfn('data/bader_statistics.json')

In [16]:
def zipper(l1, l2):
    """Yield items alternately from ``l1`` and ``l2``.

    Once the shorter input is exhausted the remaining items of the longer one
    are yielded in order, e.g. ``[1, 2, 3]`` and ``['a']`` give
    ``1, 'a', 2, 3``.

    Args:
        l1: Sized iterable whose items come first in every round.
        l2: Sized iterable whose items come second in every round.
    """
    sources = (iter(l1), iter(l2))
    exhausted = object()  # unique marker, so any real item (even None) passes
    for _ in range(max(len(l1), len(l2))):
        for source in sources:
            item = next(source, exhausted)
            if item is not exhausted:
                yield item
        
def zip_lists(l1, l2):
    """Return the items of ``l1`` and ``l2`` interleaved by ``zipper`` as a list."""
    return list(zipper(l1, l2))

def flatten_list(l):
    """Return a flat list with the items of every iterable in ``l``, in order."""
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat

def trim_vector(vector, length, ndescriptors):
    """Return ``vector`` cut or cyclically padded to exactly ``length`` items.

    A longer vector is truncated. A shorter one is extended by repeating its
    own items from the start, which gives the same values as the earlier
    chunk-by-chunk padding whenever that padding ended exactly at ``length``.

    Unlike the earlier in-place ``+=`` padding, this version never modifies
    the caller's list, never returns more than ``length`` items, and raises
    instead of looping forever when there is nothing to repeat.

    Args:
        vector: Sequence of values (a list in the visible caller).
        length: Required number of items.
        ndescriptors: Number of values per well. Kept for interface
            compatibility; cyclic padding does not depend on the chunk size.

    Returns:
        ``vector`` itself if it already has ``length`` items, a slice of it if
        it is longer, otherwise a new list of ``length`` items.

    Raises:
        ValueError: If ``vector`` is empty and padding is required.
    """
    size = len(vector)
    if size > length:
        return vector[:length]
    if size == length:
        return vector
    if size == 0:
        raise ValueError('Cannot pad an empty vector to length {}'.format(length))
    # Periodic extension: position k holds the item at k modulo the input size.
    return [vector[k % size] for k in range(length)]

#### Feature Vector Specifications:
- 12 Bader wells are used to construct each vector
- Zipper fashion: cation, anion, cation, anion, ...
- Well statistics are offset by element statistics
    - v_i = Bader_stat - elem_stat
- For each well: \[mean - e_mean, std - e_std, max - e_max\]
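- Well block length: 36 = 12 * 3; three cation/anion summary features (difference of the squared mean offsets, one per statistic) are then appended, giving 39 values per vector
- If there are fewer than 12 Bader wells, repeat the existing well data cyclically until the well block has length 36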

In [17]:
# Ids (as strings) for which feature_vector() returns None.
skips = ['7356']

def add_general_features(vector, cation, anion, nbader_wells):
    """Placeholder that is not implemented yet; accepts its arguments and returns None."""
    return None

def feature_vector(i, num_wells=12):
    """Build the feature vector of material ``i`` from its cached well statistics.

    Each well's ``[mean, std, max]`` is offset by the statistics of the pure
    element. The cation and anion wells are interleaved (cation first), cut or
    cyclically padded to ``num_wells`` wells, and followed by one summary
    value per statistic.

    Args:
        i: Material id (int or str); converted to ``str`` for all lookups.
        num_wells: Number of wells represented in the vector.

    Returns:
        A 1-D NumPy array of ``num_wells * ndescriptors + ndescriptors``
        values (39 with the defaults), or ``None`` if ``i`` is in ``skips``.

    Raises:
        ValueError: If a well belongs to neither element of the composition,
            or the composition does not have exactly two elements.
        IndexError: If the material has no cation wells.
    """
    i = str(i)
    if i in skips:
        return None
    # Which element is called "cation" depends purely on the order of
    # composition.elements -- presumably cation first; verify against the data.
    cation, anion = [str(e) for e in elfcars[i].structure.composition.elements]
    cation_vectors, anion_vectors = [], []
    for well in bader_statistics[i]:
        # Every well is a single-entry dict {element: [mean, std, max]}.
        f = next(iter(well))
        vi = np.array(well[f]) - np.array(element_statistics[f])
        if f == cation:
            cation_vectors.append(vi)
        elif f == anion:
            anion_vectors.append(vi)
        else:
            raise ValueError('Atom does not match either cation or anion')
    ndescriptors = len(cation_vectors[0])
    length = num_wells * ndescriptors
    interleaved = flatten_list(zip_lists(cation_vectors, anion_vectors))
    vector = np.array(trim_vector(interleaved, length, ndescriptors))
    # NOTE(review): despite its name this is the difference of the squared
    # means, not the squared difference of the means. Left unchanged because
    # the saved matrices were produced with this definition -- confirm intent.
    diffsquared_cat_an = np.mean(cation_vectors, axis=0)**2 - np.mean(anion_vectors, axis=0)**2
    return np.append(vector, diffsquared_cat_an)

In [18]:
# Show one vector with one well per row ([mean, std, max] offsets); the last
# row is the appended cation/anion summary.
feature_vector(6405).reshape((-1,3))

array([[ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [ 4.50969334e+00,  4.92817031e-01, -7.41933043e-01],
       [-1.67430955e+02, -3.27421624e+02, -3.03831668e+03],
       [-2.80127873e+04, -1.07204677e+05, -9.23136772e+06]])

### Produce Feature Vectors & Stack Into Matrix

In [19]:
# Feature vectors of all binary compounds; skipped ids (feature_vector()
# returns None for them) are left out.
candidate_vectors = (feature_vector(i) for i in tqdm(binary_ids))
vector_list = [vec for vec in candidate_vectors if vec is not None]

100%|██████████| 427/427 [00:00<00:00, 3347.45it/s]


In [20]:
# Stack the vectors into a matrix with one row per material.
A = np.vstack(vector_list)
print(A.shape)

(426, 39)


In [27]:
# Fit the scaler on A and apply the fitted scaling to the same matrix.
scaler = StandardScaler().fit(A)
A_scaled = scaler.transform(A)

In [32]:
# Save both the raw and the scaled feature matrix.
np.save('data/A.npy', A)
np.save('data/A_scaled.npy', A_scaled)

### Load Error Data and Make Target Vector

In [83]:
# Absolute errors, looked up below by reduced formula.
abs_error_data = loadfn('data/absolute_errors.json')

In [84]:
# Drop every id that feature_vector() skips so that b stays row-aligned with A.
# Comparing as strings works whether the ids are stored as ints or as strings;
# the slice assignment keeps the same list object, like the in-place remove()
# it replaces.
binary_ids[:] = [ID for ID in binary_ids if str(ID) not in skips]

b, drops = [], []
for x, ID in enumerate(binary_ids):
    i = str(ID)
    f = elfcars[i].structure.composition.reduced_formula
    try:
        b.append(abs_error_data[f])
    except KeyError:
        # Record the position and formula of every material without a target.
        drops.append((x, f))
# A missing target would shift every later row of b against A, so stop here.
# Raised explicitly instead of using `assert`, which is stripped under -O.
if drops:
    raise AssertionError('No absolute error available for: {}'.format(drops))
print(len(b))

426


In [86]:
# Save the target vector; np.save converts the list to an array.
np.save('data/b.npy', b)