In [1]:
!pip install ase qml

Collecting qml
[?25l  Downloading https://files.pythonhosted.org/packages/7b/f3/c08d18659054cdf5e5dc531ee32ab20e6aa1d3772545837557828a03b1f5/qml-0.4.0.27.tar.gz (41kB)
[K     |████████████████████████████████| 51kB 22.5MB/s eta 0:00:01
Building wheels for collected packages: qml
  Building wheel for qml (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/ubuntu/.cache/pip/wheels/ed/12/cf/9f6f875260ccc47dbdbd131631c416e24c933c84a8a20e2bc8
Successfully built qml
Installing collected packages: qml
Successfully installed qml-0.4.0.27


In [2]:
import ase
from ase import Atoms
import qml
import ase.visualize
def view(molecule):
    """Render one molecule from the global ``structures`` table as an x3d view.

    Parameters
    ----------
    molecule : str
        Value matched against the ``molecule_name`` column of ``structures``.

    Returns
    -------
    Whatever ``ase.visualize.view(system, viewer="x3d")`` returns.
    """
    # Rows of the structures table that belong to the requested molecule
    mol = structures[structures['molecule_name'] == molecule]

    # Select by column name rather than by position, so the lookup keeps
    # working if the table gains or reorders columns (e.g. a saved index column).
    xcart = mol[['x', 'y', 'z']].values
    symbols = mol['atom'].values

    # Display molecule
    system = Atoms(positions=xcart, symbols=symbols)
    print('Molecule Name: %s.' %molecule)
    return ase.visualize.view(system, viewer="x3d")

In [3]:
import pandas as pd
import numpy as np
from numpy.linalg import eig, svd
from sklearn.decomposition import PCA
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [4]:
# Directory with the per-molecule .xyz files. It is joined to a file name by plain
# string concatenation further down, so the trailing slash is required.
folder = '../../data/input/structures/'

In [5]:
# Atom-level table: one row per atom, with molecule_name, atom_index, atom and x/y/z columns.
structures = pd.read_csv('../../data/input/structures.csv')

In [6]:
# Distinct molecule names; turned into .xyz file names for the full run further down.
all_molecule_names = structures['molecule_name'].unique()

In [7]:
# Start with a single molecule file to inspect the representations before the full run.
filenames = ['dsgdb9nsd_133883.xyz']

In [8]:
stats = []

for filename in tqdm(filenames):
    entrystats = {}

    # Load the molecule described by this .xyz file
    mol = qml.Compound(xyz=folder+filename)
    entrystats['molecule_name'] = filename[:-4]  # drop the '.xyz' extension
    # Whole-molecule Coulomb matrix; qml stores it flattened in mol.representation
    mol.generate_coulomb_matrix(size=mol.natoms, sorting="unsorted")
    a = mol.representation
    # One entry per element of the flattened representation: c_0, c_1, ...
    for i, v in enumerate(a):
        entrystats['c_'+str(i)] = v
    stats.append(entrystats)

100%|██████████| 1/1 [00:00<00:00, 42.42it/s]


In [9]:
# Rows of the structures table for the molecule just processed (file name minus '.xyz')
structures.loc[structures.molecule_name==filenames[0][:-4]]

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
2358606,dsgdb9nsd_133883,0,C,-1.181902,1.415048,0.876272
2358607,dsgdb9nsd_133883,1,N,-0.652572,0.206319,1.67655
2358608,dsgdb9nsd_133883,2,C,-0.898215,-0.616367,0.37127
2358609,dsgdb9nsd_133883,3,C,0.224088,-1.56243,0.015561
2358610,dsgdb9nsd_133883,4,C,1.39672,-0.764049,0.445825
2358611,dsgdb9nsd_133883,5,C,0.709493,0.478235,0.961172
2358612,dsgdb9nsd_133883,6,C,-0.302582,0.69361,-0.181481
2358613,dsgdb9nsd_133883,7,C,0.402259,0.359544,-1.482579
2358614,dsgdb9nsd_133883,8,N,1.087067,-0.89825,-1.015033
2358615,dsgdb9nsd_133883,9,H,-0.837467,2.377278,1.262714


In [10]:
# Dump the parsed geometry and metadata of the loaded compound. Each field is
# followed by ' \n' plus one more newline, i.e. a blank separator line.
print(mol.coordinates,
      mol.atomtypes,
      mol.nuclear_charges,
      mol.name,
      mol.natoms,
      sep=' \n\n', end=' \n\n')

[[-1.18190188  1.4150478   0.87627152]
 [-0.65257208  0.20631894  1.6765499 ]
 [-0.89821456 -0.61636689  0.3712696 ]
 [ 0.22408829 -1.5624298   0.01556072]
 [ 1.39671975 -0.76404888  0.44582465]
 [ 0.70949329  0.47823514  0.96117151]
 [-0.30258242  0.6936104  -0.18148114]
 [ 0.40225925  0.35954397 -1.48257948]
 [ 1.08706749 -0.89825044 -1.01503296]
 [-0.8374668   2.37727777  1.26271408]
 [-2.26578713  1.40479229  0.73862194]
 [-1.93721676 -0.89219665  0.19316622]
 [ 0.16715745 -2.64234637  0.00354626]
 [ 2.33666798 -1.16524685  0.79957916]
 [ 1.28751667  1.30334436  1.37639599]
 [ 1.1605985   1.07877279 -1.80164725]
 [-0.25470874  0.11517923 -2.32094133]] 

['C', 'N', 'C', 'C', 'C', 'C', 'C', 'C', 'N', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'] 

[6 7 6 6 6 6 6 6 7 1 1 1 1 1 1 1 1] 

../../data/input/structures/dsgdb9nsd_133883.xyz 

17 



In [11]:
# Unpack the flattened representation `a` into a square matrix
c_mat1 = qml.representations.vector_to_matrix(a)

In [12]:
# The flattened vector unpacks to a 17x17 matrix; the last value printed confirms it is symmetric
print(a.shape, c_mat1.shape, np.allclose(c_mat1, c_mat1.T))

(153,) (17, 17) True


In [13]:
# PCA over the rows of the Coulomb matrix, keeping at most 7 components
# (fewer when the molecule has fewer than 7 atoms).
pca = PCA(n_components = min(7,mol.natoms))
pca.fit(c_mat1) 

PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [14]:
# Per-column means of the fitted matrix, then the singular values of the kept components
print(pca.mean_, '\n\n', pca.singular_values_)

[11.02412991 13.85862457 12.5416481  12.06477409 12.06477043 12.54163298
 12.9922889  11.37038871 13.89426614  1.43727153  1.43727343  1.56678944
  1.47599233  1.47599223  1.56678631  1.4831532   1.48315499] 

 [118.67364643  57.18663419  32.86433332  25.37769419  20.4224077
  17.38712976   9.17310632]


In [15]:
for filename in tqdm(filenames):
    # Load the molecule described by this .xyz file
    mol = qml.Compound(xyz=folder+filename)
    # Atom-centred Coulomb matrices: mol.representation then holds one flattened
    # matrix per atom (one row each).
    mol.generate_atomic_coulomb_matrix(size=mol.natoms,
                                       sorting="distance",
                                       central_cutoff=6.0,
                                       central_decay=3,
                                       interaction_cutoff=3.0,
                                       interaction_decay=1)
    a = mol.representation

100%|██████████| 1/1 [00:00<00:00, 54.22it/s]


In [16]:
# `a` holds one flattened matrix per row; unpack the first row into a square matrix
c_mat2 = qml.representations.vector_to_matrix(a[0])

In [17]:
# Each row of `a` unpacks to a 17x17 matrix; the last value printed confirms it is symmetric
print(a.shape, c_mat2.shape, np.allclose(c_mat2, c_mat2.T))

(17, 153) (17, 17) True


In [18]:
# Same PCA as for the whole-molecule matrix, now on the first atom-centred matrix
pca = PCA(n_components=min(7,mol.natoms))
pca.fit(c_mat2)  

PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
# Per-column means of the fitted matrix, then the singular values of the kept components
print(pca.mean_, '\n\n', pca.singular_values_)

[ 7.8518432   0.77155052  0.77154468 10.49835052 11.78464339 10.63809441
 10.63805685  0.83693432  0.83692507  7.47546663  8.53494309  8.53493585
  0.62689842  0.62689342  8.61837598  0.40578976  0.40578611] 

 [103.16711555  79.32198801  42.37043412  28.29614027  18.17746279
  16.37652283   4.50816225]


In [21]:
# Switch from the single sample file to one .xyz file name per molecule in the table
filenames = [name + '.xyz' for name in all_molecule_names]

In [22]:
def get_laplacian(A, tol=1e-10):
    '''
    Build a graph Laplacian from a square Coulomb matrix.

    Edge weights are exp(-1/(A + tol)) with the diagonal set to zero; weights
    that do not exceed `tol` are treated as "no edge". The same thresholded
    weights are used for both the degree (row sum) and the off-diagonal part,
    so every row of the result sums to zero.

    input: square Coulomb matrix `A` (not modified), threshold `tol`
    output: matrix of the same shape, diag(row sums of weights) - weights
    '''
    # `A + tol` makes a new array and keeps 1/A finite where A is exactly zero
    A = A + tol
    W = np.exp(-1/A)
    W[np.diag_indices_from(W)] = 0  # no self-loops
    # Drop negligible weights once, so degree and adjacency agree
    W = W*(W>tol)
    G = np.diag(W.sum(axis=1)) - W

    return G

In [23]:
stats = []
NUM_SINGULAR_VALUES = 5
NUM_PCA_COMPONENTS = 7
CUTOFF = 1e-8 # cut-off for zero interaction
TOL = 1e-8 # tol for zero eigenvalue


def _smallest_above(values, tol):
    """Return the smallest entry of `values` strictly greater than `tol`, or NaN if there is none."""
    kept = values[values > tol]
    return kept.min() if kept.size else np.nan


for filename in tqdm(filenames):

    # Load the molecule described by this .xyz file
    mol = qml.Compound(xyz=folder+filename)
    natoms = mol.natoms
    mol.generate_atomic_coulomb_matrix(size=natoms,
                                       sorting='distance',
                                       central_cutoff=6.0,
                                       central_decay=3,
                                       interaction_cutoff=3.0,
                                       interaction_decay=1)
    ac_c_mat = mol.representation # atom-centered Coulomb matrices collection
    for i in range(natoms): # one feature row per atom of this molecule
        atomstats = {}
        atomstats['molecule_name'] = filename[:-4]
        atomstats['atom_index'] = i

        a = qml.representations.vector_to_matrix(ac_c_mat[i])
        # Only the singular values are used, so skip computing U and V.
        # The 'eigv_*' keys are historical names: the values are singular values.
        svals = svd(a, compute_uv=False)
        atomstats['eigv_min'] = _smallest_above(svals, TOL)
        atomstats['eigv_max'] = svals.max()
        atomstats['eigv_gap'] = atomstats['eigv_max'] - atomstats['eigv_min']

        L = get_laplacian(a, tol=CUTOFF)
        lap_svals = svd(L, compute_uv=False)

        # Smallest Laplacian singular value above TOL, and how many fall below it
        atomstats['fiedler_eig'] = _smallest_above(lap_svals, TOL)
        atomstats['connectedness'] = (lap_svals<TOL).sum()

        pca = PCA(n_components = min(NUM_PCA_COMPONENTS, natoms))
        pca.fit(a)
        sv = pca.singular_values_
        atomstats['sv_min'] = _smallest_above(sv, TOL)
        # NOTE(review): this is the mean of column 0 only, not of the whole
        # matrix - confirm that is what 'coulomb_mean' is meant to be.
        atomstats['coulomb_mean'] = pca.mean_[0]

        # Zero-pad so that sv_0 .. sv_{NUM_SINGULAR_VALUES-1} always exist,
        # whatever number of components the PCA actually returned.
        if sv.size < NUM_SINGULAR_VALUES:
            sv = np.r_[sv, np.zeros(NUM_SINGULAR_VALUES-sv.size)]

        for k in range(NUM_SINGULAR_VALUES):
            atomstats['sv_'+str(k)] = sv[k]
        stats.append(atomstats)

100%|██████████| 130775/130775 [1:28:55<00:00, 24.51it/s]


In [24]:
# One row per (molecule, atom) feature dict collected in `stats`; show the last 10 rows
struct_eig = pd.DataFrame(stats)
struct_eig.tail(10)

Unnamed: 0,molecule_name,atom_index,eigv_min,eigv_max,eigv_gap,fiedler_eig,connectedness,sv_min,coulomb_mean,sv_0,sv_1,sv_2,sv_3,sv_4
2358647,dsgdb9nsd_133885,6,0.062409,170.390785,170.328376,1.874715,1,5.057414,13.367334,108.125725,90.31092,56.789167,30.288431,19.145452
2358648,dsgdb9nsd_133885,7,0.061998,170.387117,170.325118,1.873655,1,5.074715,11.751573,108.140285,90.291825,56.803031,30.302178,19.142275
2358649,dsgdb9nsd_133885,8,0.051722,169.569743,169.518022,1.711193,1,5.017732,10.040741,108.469025,88.831708,56.434587,30.26024,17.929273
2358650,dsgdb9nsd_133885,9,0.010952,148.466793,148.455841,0.157116,1,2.738255,0.816018,101.22808,76.172794,36.908859,24.422232,17.920462
2358651,dsgdb9nsd_133885,10,0.010952,148.466807,148.455854,0.157118,1,2.738271,0.816021,101.228065,76.172779,36.908916,24.422304,17.920445
2358652,dsgdb9nsd_133885,11,0.043296,168.700482,168.657187,1.732641,1,4.72166,0.897861,108.155112,89.805847,55.538129,29.439831,19.03876
2358653,dsgdb9nsd_133885,12,0.005134,158.464326,158.459193,0.191387,1,4.790562,0.766879,104.610942,75.365213,52.624588,29.451303,13.188512
2358654,dsgdb9nsd_133885,13,0.002619,154.748392,154.745773,0.083729,1,4.617521,0.860495,108.799073,59.540607,46.703356,28.390455,8.732286
2358655,dsgdb9nsd_133885,14,0.043295,168.700477,168.657182,1.732643,1,4.721656,0.897866,108.155106,89.805834,55.53813,29.439858,19.038758
2358656,dsgdb9nsd_133885,15,0.005134,158.464426,158.459292,0.191386,1,4.790553,0.76688,104.611077,75.365173,52.624663,29.451286,13.188488


In [25]:
# Last 10 rows of the raw structures table, to compare against the feature rows
structures.tail(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
2358647,dsgdb9nsd_133885,6,C,-0.0574,0.61121,0.541102
2358648,dsgdb9nsd_133885,7,C,-0.095929,0.380424,-0.972098
2358649,dsgdb9nsd_133885,8,C,0.816694,-0.813067,-1.02236
2358650,dsgdb9nsd_133885,9,H,-2.090436,1.327066,1.263661
2358651,dsgdb9nsd_133885,10,H,-1.440042,2.287218,-0.127543
2358652,dsgdb9nsd_133885,11,H,-1.454004,-0.967309,1.459246
2358653,dsgdb9nsd_133885,12,H,0.277779,-2.697872,0.19577
2358654,dsgdb9nsd_133885,13,H,2.515854,-1.151784,0.527369
2358655,dsgdb9nsd_133885,14,H,0.013699,1.199431,-1.680192
2358656,dsgdb9nsd_133885,15,H,1.260745,-1.246754,-1.906767


In [26]:
# Correlation of every numeric feature with eigv_min. The string column
# molecule_name is excluded explicitly instead of relying on pandas dropping
# non-numeric columns silently, which newer pandas versions no longer do.
struct_eig.select_dtypes(include='number').corrwith(struct_eig['eigv_min'])

atom_index      -0.195376
eigv_min         1.000000
eigv_max         0.258600
eigv_gap         0.256786
fiedler_eig      0.486149
connectedness   -0.144648
sv_min           0.122363
coulomb_mean     0.329267
sv_0             0.107630
sv_1             0.189728
sv_2             0.228337
sv_3             0.231325
sv_4             0.209893
dtype: float64

In [32]:
# Persist the per-atom features as a gzip-compressed pickle.
# NOTE(review): the '.pkl' name has no compression suffix, so reading it back
# presumably needs compression='gzip' passed to pd.read_pickle - confirm.
struct_eig.to_pickle('struct_eigen.pkl', compression='gzip')