In [1]:
import numpy as np

In [4]:
def periodicfunc(element):
    """
    A function to output atomic number for each element in the periodic table
    """
    f = open("pt.txt")
    atomicnum = [line.split()[1] for line in f if line.split()[0] == element]
    f.close()
    return int(atomicnum[0])

def coulombmat(file,dim=29):
    """
    This function takes in an xyz input file for a molecule, number of atoms in the biggest molecule  to computes the corresponding coulomb Matrix 
    """
    xyzfile=open(file)
    xyzheader = int(xyzfile.readline())
    xyzfile.close()
    i=0 ; j=0    
    cij=np.zeros((dim,dim))
    chargearray = np.zeros((xyzheader,1))
    xyzmatrix = np.loadtxt(file,skiprows=2,usecols=[1,2,3])
    atominfoarray = np.loadtxt(file,skiprows=2,dtype=np.str,usecols=[0])
    chargearray = [periodicfunc(symbol)  for symbol in atominfoarray]
    
    for i in range(xyzheader):
        for j in range(xyzheader):
            if i == j:
                cij[i,j]=0.5*chargearray[i]**2.4   # Diagonal term described by Potential energy of isolated atom
            else:
                dist= np.linalg.norm(xyzmatrix[i,:] - xyzmatrix[j,:])              
                cij[i,j]=chargearray[i]*chargearray[j]/dist   #Pair-wise repulsion 
    return  cij

In [5]:
coulombmat('xyz/dsgdb9nsd_000004.xyz', 29)

array([[36.8581052 , 30.0230431 ,  2.65348407,  5.64919039,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [30.0230431 , 36.8581052 ,  5.64919039,  2.65348407,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 2.65348407,  5.64919039,  0.5       ,  0.30090779,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  

In [13]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

ext = '_ext' # or '' 
file,tp = 'test', 't' # 'train', ''

train_fname = Path(f'{file}{ext}.npz') #  Path(f'train{ext}.npz')
npzfile = np.load(train_fname)
m = npzfile['m']

t = pd.read_csv(f'{file}{ext}.csv')
m_index_map = pd.factorize(t['molecule_name'])[1]
# - t['id'].min() for test

df = pd.DataFrame({'m': m, 'molecule_name': m_index_map[m]})
#df = df[0:2]

def apply_coulombmat(row):
    return coulombmat('xyz/%s.xyz' % row['molecule_name'])

df['coulombmat'] = df.progress_apply(apply_coulombmat, axis=1)

100%|██████████| 756113/756113 [43:27<00:00, 290.03it/s]


In [9]:
df['coulombmat']

0          [[36.85810519942594, 5.494741690848265, 5.4947...
1          [[36.85810519942594, 5.494741690848265, 5.4947...
2          [[36.85810519942594, 5.494741690848265, 5.4947...
3          [[36.85810519942594, 5.494741690848265, 5.4947...
4          [[36.85810519942594, 5.494741690848265, 5.4947...
5          [[53.3587073998281, 6.881703335693628, 6.88172...
6          [[53.3587073998281, 6.881703335693628, 6.88172...
7          [[53.3587073998281, 6.881703335693628, 6.88172...
8          [[53.3587073998281, 6.881703335693628, 6.88172...
9          [[73.51669471981023, 8.31508507867743, 8.31508...
10         [[73.51669471981023, 8.31508507867743, 8.31508...
11         [[36.85810519942594, 36.46631299597994, 5.6253...
12         [[36.85810519942594, 36.46631299597994, 5.6253...
13         [[36.85810519942594, 36.46631299597994, 5.6253...
14         [[36.85810519942594, 23.53512310187778, 5.4796...
15         [[36.85810519942594, 23.53512310187778, 5.4796...
16         [[36.85810519

In [17]:
np.save(f'x{tp}_coulombmat{ext}.npy', df['coulombmat'].values)

In [18]:
!ls -altrh x*_coulombmat{ext}.npy

-rw-rw-r-- 1 antor antor  11G jul  1 20:28 x_coulombmat_ext.npy
-rw-rw-r-- 1 antor antor 5,7G jul  2 22:11 xt_coulombmat_ext.npy


In [19]:
x_coulombmat = np.load(f'x{tp}_coulombmat{ext}.npy', allow_pickle=True)
x_coulombmat = np.array(x_coulombmat.tolist())
x_coulombmat.shape

(756113, 29, 29)