In [50]:
import numpy as np
from pandas import pandas as pd

In [53]:
def readSmilesToFingerprints(smstr):
    """Convert a restricted SMILES-like string into a 24-element count vector.

    Recognised tokens (everything else is skipped and breaks the chain):
      * bare atoms ``C`` and ``O``;
      * bracketed atoms ``[X-n]`` with a one-character symbol ``X`` and a
        one-digit ``n`` ("negval"), which uses up ``n`` of the atom's bonding
        positions;
      * ``(=O)`` directly after an atom (may be repeated);
      * ``(O=)`` directly before an atom.

    Bonding positions that are not used by ``n``, by double-bonded oxygens or
    by chain neighbours are counted as hydrogens (C has 4 positions, O has 2,
    H has 1).

    Layout of the returned vector:
      0 H, 1 C, 2 O, 3 O0, 4 O1, 5 C0, 6 C1, 7 C2, 8 C3,
      9 C-H, 10 C-O0, 11 C-O1, 12 C=O, 13 O-H,
      14..23 C0-C0, C0-C1, C0-C2, C0-C3, C1-C1, C1-C2, C1-C3, C2-C2, C2-C3, C3-C3

    Parameters
    ----------
    smstr : str
        The SMILES-like string to parse.

    Returns
    -------
    numpy.ndarray
        Float vector of length 24 laid out as described above.

    Raises
    ------
    IndexError, ValueError
        If a bracketed atom is truncated or its fourth character is not a digit.
    """
    n_chars = len(smstr)
    vec = np.zeros(24)
    bondmatrix = np.zeros((6, 6))  # pairwise chain bonds among O0,O1,C0,C1,C2,C3
    state_base = {'O': 0, 'C': 2}   # first row/column of each element in bondmatrix
    state_count = {'O': 2, 'C': 4}  # number of tracked states per element
    valence = {'C': 4, 'O': 2, 'H': 1}
    first_state_pos_at_vec = 3      # vec[3:9] mirrors the six bondmatrix states

    def state_index(symbol, nv):
        """Return the bondmatrix row/column for an atom state, or None if untracked."""
        if symbol in state_base and 0 <= nv < state_count[symbol]:
            return state_base[symbol] + nv
        return None

    atom = None
    negval = 0
    i = 0
    while i < n_chars:
        # A leading "(O=)" is consumed as one token; its oxygen is credited to
        # the atom that follows. Walking it character by character would count
        # the oxygen a second time as a chain atom carrying hydrogens.
        if smstr.startswith('(O=)', i):
            i += 4
            continue

        prevatom, prevnegval = atom, negval
        atom, negval = None, 0
        leftBranch = i >= 4 and smstr[i-4:i] == '(O=)'

        if smstr[i] == '[':
            atom = smstr[i+1]
            negval = int(smstr[i+3])
            i += 4  # now on the closing ']'
        elif smstr[i] in ('C', 'O'):
            atom = smstr[i]
        else:
            # Unsupported character: contributes nothing. `atom` stays None so
            # the next atom is not treated as bonded to the previous one.
            i += 1
            continue

        # Double-bonded oxygens attached to this atom (before and/or after it).
        carbonyls = 1 if leftBranch else 0
        while smstr.startswith('(=O)', i + 1):
            carbonyls += 1
            i += 4
        vec[2] += carbonyls                       # number of O
        vec[first_state_pos_at_vec] += carbonyls  # they are always O0

        elemToRight = i < n_chars - 1 and smstr[i+1] in ('[', 'C', 'O')

        rem_positions = valence.get(atom, 0) - negval - 2 * carbonyls
        if elemToRight:
            rem_positions -= 1
        if prevatom is not None:
            rem_positions -= 1
        # Over-saturated input must not produce a negative hydrogen count.
        rem_positions = max(rem_positions, 0)

        slot = state_index(atom, negval)
        if slot is not None:
            # Only states that own a slot are counted; e.g. [C-4] or [O-2]
            # must not spill into the neighbouring counters.
            vec[first_state_pos_at_vec + slot] += 1
        if atom == 'C':
            vec[1] += 1               # number of C
            vec[9] += rem_positions   # number of C-H
            vec[12] += carbonyls      # number of C=O
        elif atom == 'O':
            vec[2] += 1               # number of O
            vec[13] += rem_positions  # number of O-H
        elif atom == 'H':
            vec[0] += 1               # the explicit hydrogen atom itself
        vec[0] += rem_positions       # implicit hydrogens

        prev_slot = state_index(prevatom, prevnegval)
        if prev_slot is not None and slot is not None:
            bondmatrix[prev_slot, slot] += 1
        i += 1

    # C-C bonds are undirected: fold the strict lower triangle of the carbon
    # block onto the upper one, then flatten the upper triangle row by row.
    cc = bondmatrix[2:, 2:]
    cc_folded = np.triu(cc) + np.tril(cc, -1).T
    vec[14:] = cc_folded[np.triu_indices(4)]
    # C-O bonds in either chain direction.
    vec[10] += np.sum(bondmatrix[2:, 0]) + np.sum(bondmatrix[0, 2:])
    vec[11] += np.sum(bondmatrix[2:, 1]) + np.sum(bondmatrix[1, 2:])
    return vec
        

In [44]:
#readSmilesToFingerprints('OC(=O)[C-1]CC(=O)O')
#readSmilesToFingerprints('[C-1](=O)[C-2][C-2][C-1](=O)')
#readSmilesToFingerprints('CCC(=O)[O-1]')
#readSmilesToFingerprints('C[C-3]')

array([3., 2., 0., 0., 0., 1., 0., 0., 1., 3., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0.])

In [68]:
def convertSMILES_file_to_csv(filename, input_cols, num_rows):
    """Convert a whitespace-delimited text file into ``<filename>.csv``.

    The first line of the file is treated as a header and discarded. Every
    following non-blank line is split on whitespace and stored as one row.

    Parameters
    ----------
    filename : str
        Path of the input file; the CSV is written to ``filename + '.csv'``.
    input_cols : list of str
        Column names; each data line must split into ``len(input_cols)`` fields.
    num_rows : int
        Number of rows to allocate. Rows beyond the last data line stay empty
        (NaN); a file with more data lines than ``num_rows`` raises IndexError.
    """
    df = pd.DataFrame(index=range(num_rows), columns=input_cols)
    # The context manager closes the handle even if a row assignment fails.
    with open(filename, 'r') as f:
        next(f, None)  # skip the header; tolerate a completely empty file
        row = 0
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no record.
                continue
            df.iloc[row, :] = fields
            row += 1
    df.to_csv(filename + '.csv')


def MakeFingerprintsOutofSmiles(input_smiles_filename, input_cols, num_rows):
    """Write ``<input_smiles_filename>_Fingerprints.csv`` for a SMILES list file.

    The input file is first converted to ``<input_smiles_filename>.csv`` and
    read back. For every row the original columns are copied and the
    24 fingerprint values of the SMILES string are appended. The SMILES string
    is taken from the last column named in ``input_cols`` and is printed as
    each row is processed.

    Parameters
    ----------
    input_smiles_filename : str
        Path of the whitespace-delimited input file.
    input_cols : list of str
        Names of the input columns; the last one must hold the SMILES string.
    num_rows : int
        Number of rows passed on to ``convertSMILES_file_to_csv``.
    """
    fingerprint_cols = [
        'H', 'C', 'O', 'O0', 'O1',
        'C0', 'C1', 'C2', 'C3', 'C-H', 'C-O0', 'C-O1', 'C=O', 'O-H',
        'C0-C0', 'C0-C1', 'C0-C2', 'C0-C3', 'C1-C1', 'C1-C2', 'C1-C3',
        'C2-C2', 'C2-C3', 'C3-C3',
    ]
    n_meta = len(input_cols)
    smiles_pos = n_meta - 1  # SMILES string lives in the last input column

    convertSMILES_file_to_csv(input_smiles_filename, input_cols, num_rows)
    smilesDF = pd.read_csv(input_smiles_filename + '.csv', skiprows=0, index_col=0)

    n_molecules = smilesDF.shape[0]
    fpDF = pd.DataFrame(index=range(n_molecules), columns=input_cols + fingerprint_cols)
    for row in range(n_molecules):
        smiles = smilesDF.iloc[row, smiles_pos]
        print(smiles)
        fpDF.iloc[row, :n_meta] = smilesDF.iloc[row, :n_meta]
        fpDF.iloc[row, n_meta:] = readSmilesToFingerprints(smiles)
    fpDF.to_csv(input_smiles_filename + '_Fingerprints.csv')

In [70]:
# Build the fingerprint table for one input file. The call writes
# '<name>.csv' and '<name>_Fingerprints.csv' next to the input and prints each
# SMILES string as it is processed. The commented-out line is the equivalent
# call for the three-column SUCC_SMILES file.
#MakeFingerprintsOutofSmiles('SUCC_SMILES', ['species','formula','smiles'], 187)
MakeFingerprintsOutofSmiles('PAC_SMILES', ['formula','smiles'], 29)

[C-2][C-2]
CCC(=O)[O-1]
[C-1][C-2]
C[C-2]C(=O)[O-1]
[C-1][C-1][C-1](=O)
CC[C-1](=O)
[C-1][C-1]C(=O)O
[C-2](=O)
CC
C[C-1]C(=O)[O-1]
[O-2]
C[C-2]
C[C-2]C(=O)O
C[C-3]
C[C-1][C-1](=O)
[C-1](=O)O
[C-4]
[O-1]
[C-1][C-1]
C[C-1]
CCC(=O)O
C[C-2][C-1](=O)
C[C-1]C(=O)O
[C-2][C-1]C(=O)O
[C-1][C-3]
[C-2][C-1][C-1](=O)
[H-1]
(O=)C(=O)
O


In [59]:
# Scratch cell: quick checks of the language/numpy behaviour used above.
ss = 'abcdefgh'
print(ss[3:6])  # slice end is exclusive -> 'def'
vc = np.zeros(5)
print(vc)
for i in range(3,6):  # range end is exclusive -> 3, 4, 5
    print(i)
col1 = ['a','b','c']
col2 = col1 + ['e','d','f']  # list concatenation builds a new list
col2

def
[0. 0. 0. 0. 0.]
3
4
5


['a', 'b', 'c', 'e', 'd', 'f']