In [2]:
import numpy as np


In [20]:
def periodicfunc(element):
    """
    A function to output atomic number for each element in the periodic table
    """
    f = open("pt.txt")
    atomicnum = [line.split()[1] for line in f if line.split()[0] == element]
    f.close()
    return int(atomicnum[0])

def coulombmat(file,dim=29):
    """
    This function takes in an xyz input file for a molecule, number of atoms in the biggest molecule  to computes the corresponding coulomb Matrix 
    """
    xyzfile=open(file)
    xyzheader = int(xyzfile.readline())
    xyzfile.close()
    i=0 ; j=0    
    cij=np.zeros((dim,dim))
    chargearray = np.zeros((xyzheader,1))
    xyzmatrix = np.loadtxt(file,skiprows=2,usecols=[1,2,3])
    atominfoarray = np.loadtxt(file,skiprows=2,dtype=np.str,usecols=[0])
    chargearray = [periodicfunc(symbol)  for symbol in atominfoarray]
    
    for i in range(xyzheader):
        for j in range(xyzheader):
            if i == j:
                cij[i,j]=0.5*chargearray[i]**2.4   # Diagonal term described by Potential energy of isolated atom
            else:
                dist= np.linalg.norm(xyzmatrix[i,:] - xyzmatrix[j,:])              
                cij[i,j]=chargearray[i]*chargearray[j]/dist   #Pair-wise repulsion 
    return  cij

In [9]:
coulombmat('xyz/dsgdb9nsd_000004.xyz', 29)

array([[36.8581052 , 30.0230431 ,  2.65348407,  5.64919039,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [30.0230431 , 36.8581052 ,  5.64919039,  2.65348407,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 2.65348407,  5.64919039,  0.5       ,  0.30090779,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  

In [32]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

train_fname = Path('train.npz')
npzfile = np.load(train_fname)
m = npzfile['m']

t = pd.read_csv('../train.csv')
m_index_map = pd.factorize(t['molecule_name'])[1]
# - t['id'].min() for test

df = pd.DataFrame({'m': m, 'molecule_name': m_index_map[m]})
#df = df[0:2]

def apply_coulombmat(row):
    return coulombmat('xyz/%s.xyz' % row['molecule_name'])

df['coulombmat'] = df.progress_apply(apply_coulombmat, axis=1)

100%|██████████| 785836/785836 [56:58<00:00, 229.86it/s]  


In [28]:
df['coulombmat']

array([[36.8581052 ,  5.49474169,  5.49474894,  5.49477531,  5.49476946,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 5.49474169,  0.5       ,  0.56081483,  0.5608061 ,  0.56080321,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 5.49474894,  0.56081483,  0.5       ,  0.56080291,  0.56080582,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  

In [33]:
np.save('x_coulombmat.npy', df['coulombmat'].values)

In [34]:
!ls -altrh x_coulombmat.npy

-rw-rw-r-- 1 pavel pavel 5.9G Jun 15 11:38 x_coulombmat.npy


In [3]:
x_coulombmat = np.load('x_coulombmat.npy', allow_pickle=True)
x_coulombmat = np.array(x_coulombmat.tolist())
x_coulombmat.shape

(785836, 29, 29)