## CONTINUOUS_EMBEDDINGS

In [20]:
import os
import pickle as pkl
import numpy as np
import sys
import torch
import torch.nn.functional as F
import pandas as pd

sys.path.append(os.path.expanduser('../../ocpmodels/datasets/embeddings'))

In [2]:
def isNaN(num):
    """
    check if NuN
    """
    return num!= num


def convert(group):
    """
    conbert atom Mendeleev long table groups number to short.
    arg: scalar number in range [1, 18]
    """
    if not isNaN(group):
        if group > 10: group -= 10
        elif group > 7: group = 8
        
    return group


def fenc(cemb_v, prop_index_k, enc='enc', num_classes=0):
    """
    convert value of atom_prop from list of values in cont embeddings
    to torch.tensor(-1) with F.one_hot encoding if specified
    
    args:: 
    cemb_v : list of values,
    prop_index_k: index of property,
    enc: use F.one_hot only if 'enc'
    num_classes: number of classes for F.one_hot
    
    """
    
    vec = torch.tensor(cemb_v[prop_index_k])
    if enc=='enc':
        if isNaN(vec):
            res = torch.zeros(num_classes)
        else:
            res = F.one_hot(vec.long(), num_classes=num_classes)
    else:
            res = vec

    return res.reshape(-1)

In [3]:
from continuous_embeddings import CONTINUOUS_EMBEDDINGS as cemb
for cemb_v in cemb.values():
    cemb_v.append(convert(cemb_v[0]))

In [4]:
prop_index = {
    0: ['group_l',  'enc', 19],
    1: ['period', 'enc', 8],
    7: ['block',  'enc', 5],
    2: ['electroneg', 'float', 0], 
    3: ['radius', 'float', 0],
    4: ['valence', 'float', 0], 
    5: ['ionization', 'float', 0],  
    6: ['affinity', 'float', 0],
    8: ['volume', 'float', 0],
    9: ['group_s', 'enc', 9]
}      

cemb_v_ts_all = torch.tensor([])

for cemb_k, cemb_v in cemb.items():
    
    cemb_v_ts = torch.Tensor()
    
    for prop_index_k, prop_index_v in prop_index.items():
        
        emb = fenc(cemb_v, prop_index_k, prop_index_v[1], prop_index_v[2])
        cemb_v_ts = torch.cat((cemb_v_ts, emb), dim=0)
        
    cemb_v_ts_all = torch.cat((cemb_v_ts_all, cemb_v_ts.reshape(1,-1)), dim=0)

print(cemb_v_ts_all.size())

torch.Size([101, 47])


In [5]:
cemb_v_ts_all[1]

tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         2.1878, 31.0000,  1.0000, 13.5984,  0.7542, 14.1000,  0.0000,  1.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000])

In [6]:
cemb_v_ts_all[1].shape

torch.Size([47])

In [7]:
with open('custom_embedding_v1.pickle', 'wb') as f:
    f.write(pkl.dumps(cemb_v_ts_all))

In [8]:
atom_prop_embed = pd.DataFrame.from_dict(cemb, orient='index', columns=[v[0] for v in prop_index.values()])
atom_prop_embed.index.names=['No']

In [9]:
at_prop = pd.read_csv('atomic_properties.csv', header=0, index_col='No')
atom_prop_embed_join = atom_prop_embed.join(at_prop[['Name', 'Symbol']])

columns = list(atom_prop_embed_join.columns)
columns[0:0] = columns[-2:]

atom_prop_embed_join = atom_prop_embed_join.reindex(columns=columns[:-2])
atom_prop_embed_join.to_csv('atomic_properties_embed.csv')
atom_prop_embed_join.head(20)

Unnamed: 0_level_0,Name,Symbol,group_l,period,block,electroneg,radius,valence,ionization,affinity,volume,group_s
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,,,,,,,,,,,,
1,Hydrogen,H,1.0,1.0,2.187771,31.0,1.0,13.598434,0.754195,1.0,14.1,1.0
2,Helium,He,18.0,1.0,1.0,28.0,2.0,24.587387,-19.700001,1.0,31.799999,8.0
3,Lithium,Li,1.0,2.0,0.048868,128.0,1.0,5.391715,0.618049,1.0,13.1,1.0
4,Beryllium,Be,2.0,2.0,0.126847,96.0,2.0,9.322699,-2.4,1.0,5.0,2.0
5,Boron,B,13.0,2.0,0.254627,84.0,3.0,8.298019,0.279723,2.0,4.6,3.0
6,Carbon,C,14.0,2.0,0.427525,73.0,4.0,11.260296,1.262119,2.0,5.3,4.0
7,Nitrogen,N,15.0,2.0,0.577482,71.0,5.0,14.53413,-1.4,2.0,17.299999,5.0
8,Oxygen,O,16.0,2.0,0.941649,66.0,6.0,13.618054,1.461113,2.0,14.0,6.0
9,Fluorine,F,17.0,2.0,1.017681,57.0,7.0,17.422819,3.40119,2.0,17.1,7.0


In [28]:
atom_prop_embed_values = atom_prop_embed_join.values[:,2:].astype('float32')

In [35]:
atom_prop_embed_values[5]

array([13.        ,  2.        ,  0.25462738, 84.        ,  3.        ,
        8.298019  ,  0.279723  ,  2.        ,  4.6       ,  3.        ],
      dtype=float32)

In [31]:
atom_prop_embed_values_ts = torch.from_numpy(atom_prop_embed_values)

In [38]:
group_F = atom_prop_embed_values_ts[:,1]

In [39]:
group_F

tensor([nan, 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3.,
        3., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
        4., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
        5., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
        6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., nan, 6., nan, 7., 7.,
        7., 7., 7., 7., 7., 7., 7., nan, nan, nan, nan])

In [43]:
group_F = group_F.apply_(lambda x: F.one_hot(torch.Tensor(x), num_classes=18))

TypeError: new(): data must be a sequence (got float)