# Produce Feature Vectors

### Load Imports and Files

In [1]:
%run imports.py


__init__ is deprecated
MaterialsProjectCompatibility will be updated with new correction classes as well as new values of corrections and uncertainties in 2020



Imports successfully loaded


In [2]:
# Load the mask and ELFCAR collections. Both are looked up with the same id
# strings further down (masks[i] / elfcars[i]).
masks = loadfn('data/masks.json')
elfcars = loadfn('data/elfcars.json')

In [3]:
# Every key of the mask collection, in the collection's own order.
ids = list(masks.keys())
print(ids)

['6405', '3890', '7373', '11133', '3914', '5804', '10090', '3957', '3996', '6135', '3955', '4648', '4827', '3901', '2779', '6017', '4392', '3992', '4837', '6454', '4601', '4826', '3929', '3926', '3930', '4019', '6412', '5874', '6240', '5312', '2936', '10792', '4617', '4102', '4782', '8414', '5364', '4870', '7394', '10278', '10118', '4406', '6359', '4026', '10408', '9789', '9416', '2180', '4999', '4510', '3947', '4072', '2072', '9887', '10201', '2597', '3882', '4212', '4226', '4682', '9493', '7850', '9879', '8450', '8661', '4331', '4687', '5723', '10375', '8706', '10389', '5431', '7076', '7562', '10285', '7722', '5662', '4089', '9731', '4656', '9068', '8428', '9914', '6190', '7962', '10642', '5462', '4611', '4035', '7921', '7740', '6344', '5557', '5570', '6853', '6417', '4283', '4311', '5692', '4252', '3942', '3989', '9927', '8986', '4758', '2167', '4787', '2820', '7468', '8989', '10948', '6283', '7408', '11087', '6862', '4757', '6094', '2085', '4079', '4346', '7239', '5863', '6872', '4

### Prepare Element Statistics

In [4]:
# Mapping of element formula -> id; the next cell uses str(id) as the key into
# `elfcars` to pick the entry belonging to each formula.
element_formulas_ids = loadfn('data/element_formulas_ids.json')

In [5]:
# Pick, for each element formula, the ELFCAR entry stored under its id
# (the ids are converted to strings because `elfcars` is keyed by string).
element_elfcars = {
    formula: elfcars[str(entry_id)]
    for formula, entry_id in element_formulas_ids.items()
}

In [6]:
# NOTE(review): this loop only rebinds `e` on every pass and does nothing else
# with it here; it looks like leftover exploration code - confirm before
# relying on `e` anywhere.
for f in element_elfcars.keys():
    e = element_elfcars[f]

In [7]:
# Per-formula reference statistics: [mean, std, max] over all grid values of
# get_alpha().data['total'] for that formula's entry.
element_statistics = {}

for formula, reference in element_elfcars.items():
    grid_values = reference.get_alpha().data['total'].flatten()
    element_statistics[formula] = [
        np.mean(grid_values),
        np.std(grid_values),
        np.max(grid_values),
    ]

In [8]:
pprint(element_statistics)

{'Ac': [2.3952880904166385, 0.9221624536523357, 4.186004617717708],
 'Ag': [4.373444566115248, 9.3601895754998, 179.08046335007322],
 'Al': [1.7857315750123286, 1.303596186758892, 21.19068202065356],
 'Ar': [10.08894643061993, 11.014270879716667, 65.85032868291142],
 'As': [2.927476565281586, 2.4780503446526985, 10.965066987160945],
 'Au': [3.4875908165023946, 3.5191039435277944, 45.239207511055575],
 'B': [2.4804683886419365, 2.2235692605077135, 22.52643126985591],
 'Ba': [2.9255046533821023, 1.5481167338698114, 6.574370690467469],
 'Be': [2.0735385961683694, 1.3075652461540779, 6.386318005187307],
 'Bi': [2.5172121868344575, 1.948098995393369, 11.225414565916763],
 'Br': [4.945517100257024, 4.845327657368386, 19.997851987445422],
 'C': [4.143758773426854, 4.089905490463662, 19.370301237493063],
 'Ca': [2.9976317340345755, 2.0591215822889666, 8.90733654534101],
 'Cd': [3.6707442427955344, 6.161494326827701, 128.98490061679868],
 'Ce': [2.3377263456297652, 0.9978052527804198, 8.2886055

### Mask Application & Feature Vector Methods

In [10]:
m = masks['6405']

In [17]:
m.structure.composition.get_el_amt_dict()

defaultdict(float, {'Li': 1.0, 'H': 1.0})

In [33]:
c = m.structure.sites[0].frac_coords
m.value_at(*c)

1.0

In [41]:
def create_mask(mask, index):
    """Return a binary copy of ``mask`` that selects a single label.

    Each entry is truncated toward zero (the same rounding ``int()`` applies)
    and compared with ``index``; matching entries become 1, all others 0.

    Args:
        mask: Array-like of numeric labels, any shape.
        index: Label value to keep.

    Returns:
        A new ``numpy.ndarray`` with the shape and dtype of ``mask`` holding 1
        where the truncated label equals ``index`` and 0 elsewhere. The input
        is not modified.
    """
    labels = np.asarray(mask)
    # Only floating-point labels need truncating; integer/bool labels compare
    # directly. One vectorized comparison replaces a Python-level loop over
    # every grid entry.
    truncated = np.trunc(labels) if labels.dtype.kind == 'f' else labels
    return (truncated == index).astype(labels.dtype)

def apply_mask(elfcar, mask):
    """Multiply a 3-D grid by a 3-D mask sampled at an integer stride.

    For every grid point ``(i1, i2, i3)`` of ``elfcar`` the value is multiplied
    by ``mask[i1 * xfac, i2 * yfac, i3 * zfac]``, where each factor is the
    integer part of the mask-to-grid size ratio along that axis. Mask points
    between the sampled ones are ignored.

    Args:
        elfcar: 3-D ``numpy.ndarray`` of grid values.
        mask: 3-D ``numpy.ndarray``; each axis is sampled with the stride
            described above.

    Returns:
        1-D ``numpy.ndarray`` of the products in row-major (C) order, one
        entry per grid point of ``elfcar``.
    """
    ex, ey, ez = elfcar.shape
    mx, my, mz = mask.shape
    if elfcar.size == 0:
        # Nothing to multiply; also avoids dividing by a zero-length axis.
        return np.array([])
    # Stride factors are loop-invariant, so compute them once up front.
    xfac, yfac, zfac = int(mx / ex), int(my / ey), int(mz / ez)
    # np.ix_ builds the open mesh of sampled mask indices, so the whole
    # product is a single vectorized operation instead of three nested loops.
    sampled = mask[np.ix_(np.arange(ex) * xfac,
                          np.arange(ey) * yfac,
                          np.arange(ez) * zfac)]
    return (elfcar * sampled).ravel()

def strip_zeros(arr):
    """Return the entries of ``arr`` that are not equal to zero.

    Args:
        arr: 1-D array-like of numbers.

    Returns:
        ``numpy.ndarray`` containing the non-zero entries in their original
        order (empty if every entry is zero).
    """
    values = np.asarray(arr)
    # Boolean indexing filters in one pass instead of a Python-level loop.
    return values[values != 0]

#### Feature Vector Specifications:
- The 15 densest Bader wells are used
- For each well: \[mean, standard deviation, max / mean ^ 2\]
- Total length of 45
- If there are fewer than 15 Bader wells, repeat the existing descriptors until len(v) = 45

In [59]:
def feature_vector(i, num_wells=15):
    """Build per-site descriptors for the structure stored under id ``i``.

    For every site, the grid ``elfcars[i].get_alpha().data['total']`` is
    restricted to the mask region labelled with that site's 1-based position,
    zeros are dropped, and the mean, standard deviation and maximum of what
    remains are computed. The matching ``element_statistics`` entry for the
    site's element, scaled by that element's amount in the composition, is
    then subtracted from each of the three values.

    Reads the module-level ``elfcars``, ``masks`` and ``element_statistics``
    and prints intermediate values while running.

    Args:
        i: Key into ``elfcars`` and ``masks``.
        num_wells: Only used for the target length that is printed; the
            flatten/pad step that would apply it is currently disabled.

    Returns:
        A list with one ``[mean, std, max]`` difference triple per site, in
        site order.
    """
    elfcar_data = elfcars[i].get_alpha().data['total']
    mask_entry = masks[i]
    mask_data = mask_entry.data['total']
    struct = mask_entry.structure
    composition = struct.composition
    amts = composition.get_el_amt_dict()
    print(composition, amts)
    assert composition == elfcars[i].structure.composition

    vector = []
    site_count = int(composition.num_atoms)
    for label in range(1, site_count + 1):
        site = struct.sites[label - 1]
        element = str(site.specie)
        # The mask must be non-zero at the site's own position.
        assert mask_entry.value_at(*site.frac_coords) != 0.0

        region = create_mask(mask_data, label)
        clean_mask = strip_zeros(apply_mask(elfcar_data, region))
        mean = np.mean(clean_mask)
        std = np.std(clean_mask)
        peak = np.max(clean_mask)

        reference = element_statistics[element]
        elem_mean, elem_std, elem_max = reference
        print(element)
        print([mean, std, peak])
        print(reference)

        amount = amts[element]
        vector.append([
            mean - amount * elem_mean,
            std - amount * elem_std,
            peak - amount * elem_max,
        ])

    ndescriptors = len(vector[0])
    length = num_wells * ndescriptors
    print('ndescriptors', ndescriptors, 'length', length)
    # Disabled post-processing, kept for reference: sort the triples, flatten
    # them, then truncate or cyclically repeat up to `length` entries.
#     vector = sorted(vector, key=lambda e: e[0])
#     vector = [y for x in vector for y in x]
#     if len(vector) > length:
#         vector = vector[:length]
#     elif len(vector) < length:
#         idx = 0
#         while len(vector) < length:
#             vector += vector[idx:idx+ndescriptors]
#             idx += ndescriptors
#     return np.array(vector)
    return vector

In [60]:
pprint(feature_vector('6405', num_wells=1))

Li1 H1 defaultdict(<class 'float'>, {'Li': 1.0, 'H': 1.0})
Li
[7.864990980618099, 4.1663760702205845, 14.589026895463757]
[3.3552976432507675, 3.673559039520009, 15.330959938151894]
H
[2.43177302535177, 2.16549515706697, 9.670468365704853]
[169.86272797512584, 329.5871192818629, 3047.9871524550476]
ndescriptors 3 length 3
[[4.509693337367332, 0.4928170307005755, -0.741933042688137],
 [-167.43095494977408, -327.4216241247959, -3038.3166840893427]]


### Produce Feature Vectors & Stack Into Matrix

In [None]:
# One feature_vector result per id, with a tqdm progress bar over the ids.
vector_list = [feature_vector(i) for i in tqdm(ids)]

In [None]:
# Stack the per-structure results row-wise into one matrix.
# NOTE(review): feature_vector currently returns one [mean, std, max] triple
# per site, so this produces one row per site rather than one row per id.
# The rows removed later via `drops` are indexed by position in `ids` -
# confirm the two line up before using A for regression.
A = np.vstack(vector_list)
A.shape

### Load Error Data and Filter A

In [None]:
# Load the error table; `exp` is the list of formulas it contains.
# NOTE(review): loadfn's result is handed straight to pd.read_json, so the
# file presumably stores a JSON-encoded string - confirm.
error_data = pd.read_json(loadfn('data/error_data.json'), orient='columns')
exp = error_data.Formula.tolist()

In [None]:
# Relative error (Error / Experimental) per formula. Only the first row for
# each formula is used, so every id contributes at most one target value and
# b keeps one entry per id that is not dropped.
first_rows = error_data.drop_duplicates(subset='Formula', keep='first')
relative_error = dict(zip(
    first_rows['Formula'],
    first_rows['Error'] / first_rows['Experimental'],
))

# b collects the target for every id whose reduced formula is in the table;
# drops collects the positions (within `ids`) of the ids that are not.
b, drops = [], []
for x, i in enumerate(ids):
    f = elfcars[i].structure.composition.reduced_formula
    if f in relative_error:
        b.append(relative_error[f])
    else:
        drops.append(x)

In [None]:
A = np.delete(A, drops, axis=0)
A.shape

### Save A and b Locally

In [None]:
np.save('data/A.npy', A)
np.save('data/b.npy', b)

### Preliminary Least Squares Regression Testing

In [None]:
A, b = np.load('data/A.npy'), np.load('data/b.npy')

In [None]:
# The first 130 rows are used for training, the remainder for testing.
split_at = 130
A_train, b_train = A[:split_at], b[:split_at]
A_test, b_test = A[split_at:], b[split_at:]

In [None]:
# Solve the least-squares problem directly instead of forming inv(A^T A):
# lstsq stays well-defined when A_train is rank-deficient and avoids squaring
# the condition number of the problem.
x_hat, *_ = np.linalg.lstsq(A_train, b_train, rcond=None)

In [None]:
plt.figure(figsize=(16,10))
# Take the number of bars from the coefficient vector itself; a hard-coded
# count fails as soon as the number of feature columns changes.
plt.bar(x=list(range(len(x_hat))), height=x_hat)
plt.show()

In [None]:
# Predictions of the closed-form fit, printed next to the held-out targets.
result = A_test @ x_hat
for row, predicted in enumerate(result):
    print(predicted, '\t', b_test[row])

In [None]:
# Fit scikit-learn's LinearRegression on the same training split so its
# coefficients can be compared with x_hat in the plot below.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
reg = model.fit(A_train, b_train)

In [None]:
# Predictions of the scikit-learn model, printed next to the held-out targets.
result2 = model.predict(A_test)
for row, predicted in enumerate(result2):
    print(predicted, '\t', b_test[row])

In [None]:
plt.figure(figsize=(16,10))
# Use an ndarray sized from x_hat: subtracting the bar width from a plain
# Python list raises TypeError, and a hard-coded count breaks whenever the
# number of coefficients changes.
indices = np.arange(len(x_hat))
# The positions are spaced one unit apart, so each bar gets a third of a unit.
width = 1 / 3

# Side-by-side bars: closed-form coefficients on the left, sklearn's on the right.
plt.bar(indices - width, x_hat, width)
plt.bar(indices, reg.coef_, width)
plt.show()