In [1]:
import os
import torch
import pandas as pd

# Directory holding the competition CSV files (relative to the notebook).
PATH = "./dataset/"

# Load both splits in one pass; the generator is unpacked in file order,
# so train_df comes from train_set.csv and test_df from test_set.csv.
train_df, test_df = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ("train_set.csv", "test_set.csv")
)

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors

# Pick one example SMILES string (row position 128 of the training set)
# to inspect by hand in the next cell.
text = train_df['SMILES'].iloc[128]

In [3]:
# Exploratory dump of every bond as seen from each atom of one molecule.
smiles = text
mol = Chem.MolFromSmiles(smiles)
adj = Chem.GetAdjacencyMatrix(mol)
dist = Chem.GetDistanceMatrix(mol)

# Walking bonds atom-by-atom means each bond is reported twice: once from
# its begin atom and once from its end atom (idx_i tells which one).
for idx_i, atom in enumerate(mol.GetAtoms()):
    for bond in atom.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # Distance-matrix entry between the two endpoints of this bond.
        d = dist[start, end]
        bond_type = bond.GetBondType().name
        print("d : {}, bond_type : {}".format(d, bond_type))
        print("idx_i : {}, start : {}, end : {}".format(idx_i, start, end))


d : 1.0, bond_type : SINGLE
idx_i : 0, start : 0, end : 1
d : 1.0, bond_type : SINGLE
idx_i : 1, start : 0, end : 1
d : 1.0, bond_type : AROMATIC
idx_i : 1, start : 1, end : 2
d : 1.0, bond_type : AROMATIC
idx_i : 1, start : 31, end : 1
d : 1.0, bond_type : AROMATIC
idx_i : 2, start : 1, end : 2
d : 1.0, bond_type : AROMATIC
idx_i : 2, start : 2, end : 3
d : 1.0, bond_type : AROMATIC
idx_i : 3, start : 2, end : 3
d : 1.0, bond_type : SINGLE
idx_i : 3, start : 3, end : 4
d : 1.0, bond_type : AROMATIC
idx_i : 3, start : 3, end : 30
d : 1.0, bond_type : SINGLE
idx_i : 4, start : 3, end : 4
d : 1.0, bond_type : AROMATIC
idx_i : 4, start : 4, end : 5
d : 1.0, bond_type : AROMATIC
idx_i : 4, start : 28, end : 4
d : 1.0, bond_type : AROMATIC
idx_i : 5, start : 4, end : 5
d : 1.0, bond_type : SINGLE
idx_i : 5, start : 5, end : 6
d : 1.0, bond_type : AROMATIC
idx_i : 5, start : 5, end : 7
d : 1.0, bond_type : SINGLE
idx_i : 6, start : 5, end : 6
d : 1.0, bond_type : AROMATIC
idx_i : 7, start : 

In [7]:
# Stack the train and test SMILES columns into one Series with a fresh
# 0..n-1 index so the feature survey below covers both splits.
total_smiles = pd.concat(
    [train_df['SMILES'], test_df['SMILES']],
    axis=0,
    ignore_index=True,
)

In [10]:
from tqdm.auto import tqdm
import numpy as np

# Survey the distinct values taken by each atom-level and bond-level RDKit
# feature over every molecule in total_smiles (train + test). The resulting
# *_list arrays are the sorted unique values per feature.
#
# Only the set of distinct values matters here, so observations are gathered
# in sets rather than in ever-growing lists (memory stays bounded by the
# vocabulary size, not by the total number of atoms/bonds).
atom_degree_seen = set()
atom_numhs_seen = set()
atom_valence_seen = set()
atom_aromatic_seen = set()
atom_ring_seen = set()
atom_charge_seen = set()
atom_hybrid_seen = set()

bond_type_seen = set()
bond_aromatic_seen = set()
bond_conjugated_seen = set()
bond_ring_seen = set()
bond_stereo_seen = set()
bond_dist_seen = set()

# Entries that could not be turned into a molecule; kept for inspection
# instead of aborting the whole scan.
unparsed_smiles = []

for smiles in tqdm(total_smiles, desc="scanning molecules"):

    # Chem.MolFromSmiles signals a parse failure by returning None rather than
    # raising; non-string entries (e.g. NaN from the CSV) are treated the same.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        unparsed_smiles.append(smiles)
        continue

    dist = Chem.GetDistanceMatrix(mol)

    for atom in mol.GetAtoms():
        atom_degree_seen.add(atom.GetDegree())
        atom_numhs_seen.add(atom.GetTotalNumHs())
        atom_valence_seen.add(atom.GetImplicitValence())
        atom_aromatic_seen.add(atom.GetIsAromatic())
        atom_ring_seen.add(atom.IsInRing())
        atom_charge_seen.add(atom.GetFormalCharge())
        atom_hybrid_seen.add(atom.GetHybridization())

    # Iterate the molecule's bonds directly: going through atom.GetBonds()
    # visits every bond twice (once per endpoint) without adding any new value.
    for bond in mol.GetBonds():
        start = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        d = dist[start, end]
        bond_type = bond.GetBondType().name

        bond_type_seen.add(bond_type)
        bond_aromatic_seen.add(bond.GetIsAromatic())
        bond_conjugated_seen.add(bond.GetIsConjugated())
        bond_ring_seen.add(bond.IsInRing())
        bond_stereo_seen.add(bond.GetStereo())
        bond_dist_seen.add(d)

if unparsed_smiles:
    print("skipped {} unparsable SMILES entries".format(len(unparsed_smiles)))


def _sorted_unique(values):
    """Return the distinct items of `values` as a sorted 1-D numpy array."""
    return np.unique(np.array(list(values)))


atom_degree_list = _sorted_unique(atom_degree_seen)
atom_numhs_list = _sorted_unique(atom_numhs_seen)
atom_valence_list = _sorted_unique(atom_valence_seen)
atom_aromatic_list = _sorted_unique(atom_aromatic_seen)
atom_ring_list = _sorted_unique(atom_ring_seen)
atom_charge_list = _sorted_unique(atom_charge_seen)
atom_hybrid_list = _sorted_unique(atom_hybrid_seen)

bond_type_list = _sorted_unique(bond_type_seen)
bond_aromatic_list = _sorted_unique(bond_aromatic_seen)
bond_conjugated_list = _sorted_unique(bond_conjugated_seen)
bond_ring_list = _sorted_unique(bond_ring_seen)
bond_stereo_list = _sorted_unique(bond_stereo_seen)
bond_dist_list = _sorted_unique(bond_dist_seen)

In [12]:
# Show the vocabulary found for each feature: atom-level first, then bond-level.
# Each row is (label, values); print(label, values) inserts the separating space.
feature_report = [
    ("atom degree : ", atom_degree_list),
    ("atom numhs : ", atom_numhs_list),
    ("atom valence : ", atom_valence_list),
    ("atom aromatic : ", atom_aromatic_list),
    ("atom ring : ", atom_ring_list),
    ("atom charge : ", atom_charge_list),
    ("atom hybrid : ", atom_hybrid_list),
    ("bond type : ", bond_type_list),
    ("bond aromatic : ", bond_aromatic_list),
    ("bond conjugated : ", bond_conjugated_list),
    ("bond ring : ", bond_ring_list),
    ("bond stereo : ", bond_stereo_list),
    ("bond dist : ", bond_dist_list),
]

for label, values in feature_report:
    print(label, values)

atom degree :  [0 1 2 3 4 5 6]
atom numhs :  [0 1 2 3]
atom valence :  [0 1 2 3]
atom aromatic :  [False  True]
atom ring :  [False  True]
atom charge :  [0]
atom hybrid :  [2 3 4 5 6]
bond type :  ['AROMATIC' 'DOUBLE' 'SINGLE' 'TRIPLE']
bond aromatic :  [False  True]
bond conjugated :  [False  True]
bond ring :  [False  True]
bond stereo :  [0 2 3]
bond dist :  [1.]
