In [10]:
import numpy as np
from pandas import pandas as pd

In [11]:
def isLeftDoubleBondedBranch(smstr, curridx):
    """Report whether the text ending just before ``curridx`` is ``'(O=)'``.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Index of the first character after the candidate branch.

    Returns:
        True when at least four characters precede ``curridx`` and those four
        characters are exactly ``'(O=)'``; False otherwise.
    """
    # A four-character window cannot exist before index 4.
    if curridx <= 3:
        return False
    return smstr[curridx - 4:curridx] == '(O=)'

def isRightDoubleBondedBranch(smstr, curridx):
    """Report whether ``'(=O)'`` starts immediately after ``curridx``.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Index of the last character of the current atom token.

    Returns:
        True when the four characters following ``curridx`` exist and spell
        ``'(=O)'``; False otherwise.
    """
    branch = '(=O)'
    start = curridx + 1
    # The whole branch must fit between ``start`` and the end of the string.
    fits = curridx < len(smstr) - len(branch)
    return True if fits and smstr[start:start + len(branch)] == branch else False

def isRightSingleBondedSaturatedOxygen(smstr, curridx):
    """Report whether the branch ``'(O)'`` starts immediately after ``curridx``.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Index of the last character of the current atom token.

    Returns:
        True when the three characters following ``curridx`` exist and spell
        ``'(O)'``; False otherwise.
    """
    # Not enough characters left for a three-character branch.
    if not curridx < len(smstr) - 3:
        return False
    following = smstr[curridx + 1:curridx + 4]
    return following == '(O)'

def isRightSingleBondedUnsaturatedOxygen(smstr, curridx):
    """Report whether ``'([O])'`` or ``'([O-1])'`` starts right after ``curridx``.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Index of the last character of the current atom token.

    Returns:
        True when either bracketed-oxygen branch spelling fits in the string
        and begins at ``curridx + 1``; False otherwise.
    """
    total = len(smstr)
    start = curridx + 1
    for pattern in ('([O])', '([O-1])'):
        width = len(pattern)
        # Same bound as a manual "enough characters left" check per pattern.
        if curridx < total - width and smstr[start:start + width] == pattern:
            return True
    return False

def isElemToRight(smstr, curridx):
    """Tell whether another main-chain element follows the token ending at ``curridx``.

    If the character right after ``curridx`` opens a parenthesised branch,
    that one branch is skipped up to and including its first ``')'``.  The
    character found at the resulting position must be ``'C'``, ``'O'`` or
    ``'['`` for the answer to be True.

    An unterminated branch (``'('`` with no later ``')'``) yields False
    instead of running past the end of the string.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Index of the last character of the current atom token.

    Returns:
        True if an element (or bracketed element) starts at the inspected
        position, False otherwise.
    """
    nxt = curridx + 1
    if nxt >= len(smstr):
        return False
    if smstr[nxt] == '(':
        closing = smstr.find(')', nxt)
        if closing == -1:
            # Malformed input: the branch never closes, so nothing follows it.
            return False
        nxt = closing + 1
    return nxt < len(smstr) and smstr[nxt] in ('C', 'O', '[')


#'H', 'C', 'O', 'O0', 'O1', 'C0', 'C1', 'C2', 'C3', ###Idx 0-8
#'C-H', 'C-O0', 'C-O1', 'C=O', 'O-H','CbranchO0','CbranchO1', ###Idx 9-15
#'C0-C0', 'C0-C1', 'C0-C2', 'C0-C3', 'C1-C1', 'C1-C2', 'C1-C3', 'C2-C2', 'C2-C3', 'C3-C3' ###Idx 16-25
###NEW FP:
#'H', 'C', 'O', 'O0', 'O1', 'C0', 'C1', 'C2', 'C3', ###Idx 0-8
#'C0-H','C1-H','C2-H','C0=O','C1=O','O-H', ###Idx 9-14
#'C0-O0', 'C0-O1', 'C1-O0', 'C1-O1', 'C2-O0', 'C2-O1', ###Idx 15-20
#'C0branch-O0','C1branch-O0','C2branch-O0','C0branch-O1','C1branch-O1','C2branch-O1', ###Idx 21-26
#'C0-C0', 'C0-C1', 'C0-C2', 'C0-C3', 'C1-C1', 'C1-C2', 'C1-C3', 'C2-C2', 'C2-C3', 'C3-C3' ###Idx 27-36
def readSmilesToFingerprints(smstr, isRingStruct = False):
    """Count atoms and bonds of an old-style SMILES string into a 37-entry vector.

    The string is walked left to right.  A parenthesised group that follows a
    main-chain atom is skipped by the walk and credited to that atom through
    the branch helper predicates instead.  A bracketed atom such as ``[C-2]``
    carries ``negval`` (here 2) free valences; a bare ``C``/``O`` has
    ``negval`` 0.  Hydrogens are implicit: whatever remains of the valence
    (4 for C, 2 for O) after subtracting ``negval``, branch bonds and chain
    neighbours is counted as H.

    Vector layout:
        0-8    H, C, O, O0, O1, C0, C1, C2, C3
        9-14   C0-H, C1-H, C2-H, C0=O, C1=O, O-H
        15-20  C0-O0, C0-O1, C1-O0, C1-O1, C2-O0, C2-O1
        21-26  C0branch-O0, C1branch-O0, C2branch-O0,
               C0branch-O1, C1branch-O1, C2branch-O1
        27-36  C0-C0, C0-C1, C0-C2, C0-C3, C1-C1, C1-C2, C1-C3,
               C2-C2, C2-C3, C3-C3

    Args:
        smstr: Old-style SMILES string (bracket atoms written ``[X-n]``).
        isRingStruct: When True, the first atom is treated as preceded by an
            ``O`` with ``negval`` 0 and every atom is treated as having a
            right-hand neighbour (presumably the ring-closing oxygen --
            confirm against the data).

    Returns:
        numpy.ndarray of shape (37,) holding the counts described above.
    """
    prevatom = None
    prevnegval = None
    atom = None
    if isRingStruct:
        # Seed the "previous atom" so the first atom gets a left neighbour.
        atom = 'O'
    negval = 0
    vec = np.zeros(37)
    bondmatrix = np.zeros((6,6)) # matrix for pairwise bonds among O0,O1,C0,C1,C2,C3
    o_zero_pos_at_vec = 3
    c_zero_pos_at_vec = 5
    oh_pos_at_vec = 14 # 'O-H' column

    i = 0
    while i < len(smstr):
        if smstr[i] == '(': #skip the parenthesised part, these will be calculated as branch of the main-chain-atom
            while smstr[i] != ')':
                i += 1
            i += 1
        if i >= len(smstr):
            break
        if smstr[i] != 'C' and smstr[i] != 'O':
            i += 1
            continue
        prevatom = atom
        prevnegval = negval
        negval = 0
        atom = smstr[i]
        if i > 0 and smstr[i-1] == '[':
            # Bracket atom '[X-n]': the digit n sits two characters after X.
            negval = int(smstr[i+2])
        # leftidx: first character of the token that contains this atom.
        j = i - 1
        while j >= 0 and smstr[j] != 'C' and smstr[j] != 'O' and smstr[j] != ')' and smstr[j] != ']':
            j -= 1
        leftidx = j + 1
        # rightidx: last character of the token that contains this atom.
        j = i + 1
        while j < len(smstr) and smstr[j] != 'C' and smstr[j] != 'O' and smstr[j] != '(' and smstr[j] != '[':
            j += 1
        rightidx = j - 1

        rem_positions = 0
        if atom == 'C':
            rem_positions = 4
            vec[1] += 1 #number of C
            vec[c_zero_pos_at_vec + negval] += 1 #number of C0 or C1 or C2 or C3; starting at vector index 5
        elif atom == 'O':
            rem_positions = 2
            vec[2] += 1 #number of O
            vec[o_zero_pos_at_vec + negval] += 1 #number of O0 or O1; starting at vector index 3
        rem_positions -= negval

        if isLeftDoubleBondedBranch(smstr, leftidx):
            vec[o_zero_pos_at_vec] += 1
            vec[2] += 1 #number of O
            rem_positions -= 2
            vec[12 + negval] += 1 #number of C0=O,C1=O from index 12 to 13
        if isRightDoubleBondedBranch(smstr, rightidx):
            vec[o_zero_pos_at_vec] += 1
            vec[2] += 1 #number of O
            rem_positions -= 2
            vec[12 + negval] += 1 #number of C0=O,C1=O from index 12 to 13
        if isRightSingleBondedSaturatedOxygen(smstr, rightidx):
            vec[o_zero_pos_at_vec] += 1
            vec[2] += 1 #number of O
            vec[21 + negval] += 1 #C(O); C0 or C1 or C2 has a branch containing saturated O; from index 21 to 23
            vec[0] += 1 #Saturated O connected to an H, so inc H count
            # O-H lives at index 14 in this layout; index 13 is 'C1=O'.
            vec[oh_pos_at_vec] += 1 #Saturated O connected to an H, so inc O-H count
            rem_positions -= 1
        if isRightSingleBondedUnsaturatedOxygen(smstr, rightidx):
            vec[o_zero_pos_at_vec+1] += 1
            vec[2] += 1 #number of O
            vec[24 + negval] += 1 #C([O]); C0 or C1 or C2 has a branch containing unsaturated O; from index 24 to 26
            rem_positions -= 1
        if isRingStruct or isElemToRight(smstr, rightidx):
            rem_positions -= 1 # bond to the right-hand neighbour
        if prevatom is not None:
            rem_positions -= 1 # bond to the left-hand neighbour

        vec[0] += rem_positions #number of H
        if atom == 'C':
            vec[9 + negval] += rem_positions #number of C0-H,C1-H,C2-H from index 9 to 11
        if atom == 'O':
            vec[oh_pos_at_vec] += rem_positions #number of O-H

        if prevatom is not None:
            # Rows/columns 0-1 are O0,O1; rows/columns 2-5 are C0..C3.
            rowstartidx = 0
            colstartidx = 0
            if prevatom == 'C':
                rowstartidx = 2
            if atom == 'C':
                colstartidx = 2
            bondmatrix[rowstartidx + prevnegval, colstartidx + negval] += 1

        i += 1


    # Fold the lower triangle of the C-C sub-matrix into the upper triangle so
    # that Cx-Cy and Cy-Cx are counted together.
    for j in range(3,6):
        for i in range(2,j):
            bondmatrix[i,j] += bondmatrix[j,i]

    vec[27:] = (bondmatrix[2:,2:])[np.triu_indices(4)] # last 10 indices for all combinations of connections between C0,C1,C2,C3
    # from index 15 to 20: C0-O0, C0-O1, C1-O0, C1-O1, C2-O0, C2-O1 :
    for cneg in range(3):
        for oneg in range(2):
            i = 2 + cneg
            j = oneg
            #let, bondmatrix denoted by 'bm'. We have, for 6 values of combinations of cneg and oneg:
            # bm[2,0]+bm[0,2]; bm[2,1]+bm[1,2]; bm[3,0]+bm[0,3]; bm[3,1]+bm[1,3]; bm[4,0]+bm[0,4]; bm[4,1]+bm[1,4]
            vec[15 + (cneg*2) + oneg] = bondmatrix[i,j] + bondmatrix[j,i]
    return vec
        

In [12]:
def isElementNextToCurr(smstr, curridx, seekright = 1, norecurse = 0):
    """Check for a neighbouring element next to ``curridx``, hopping over branches.

    The string end (seeking right) or start (seeking left) counts as having a
    neighbour, because the molecule is treated as a ring.  A parenthesised
    group next to the position is hopped over in steps of 3 characters, or 4
    when one of the two characters inside it is ``'='``, and the check is
    repeated from the new position.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Position to look from.
        seekright: 1 to look to the right, anything else to look to the left.
        norecurse: Accepted for signature compatibility; not consulted here.

    Returns:
        True when an element, a bracket, or the string boundary is adjacent;
        None when the adjacent character is none of the recognised ones.
    """
    pos = curridx
    if seekright == 1:
        last = len(smstr) - 1
        while True:
            if pos == last:
                return True # because it is a ring
            ahead = smstr[pos + 1]
            if ahead in ('C', 'O', '['):
                return True
            if ahead != '(':
                return None
            # Hop over the branch: '(X)' is 3 wide, '(=X)' / '(X=)' is 4 wide.
            hop = 4 if (smstr[pos + 2] == '=' or smstr[pos + 3] == '=') else 3
            pos += hop
    else: # seek left
        while True:
            if pos == 0:
                return True # because it is a ring
            behind = smstr[pos - 1]
            if behind in ('C', 'O', ']'):
                return True
            if behind != ')':
                return None
            hop = 4 if (smstr[pos - 2] == '=' or smstr[pos - 3] == '=') else 3
            pos -= hop
        
def BranchBondCountNextToCurr(smstr, curridx, seekright = 1):
    """Return the bond order contributed by a branch adjacent to ``curridx``.

    Seeking right, the branch must open directly after ``curridx``:
    ``(C)``/``(O)`` and ``([O])`` count 1, ``(=C)``/``(=O)`` count 2.
    Seeking left, the branch must close directly before ``curridx``: only the
    ``=`` immediately before the closing parenthesis (as in ``(O=)``) counts,
    with value 2; a plain ``(C)``/``(O)`` counts 0.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Position to look from.
        seekright: 1 to inspect the right side, anything else for the left.

    Returns:
        0, 1 or 2.
    """
    atoms = ('C', 'O')
    if seekright == 1:
        if curridx == len(smstr) - 1 or smstr[curridx + 1] != '(':
            return 0
        inner = smstr[curridx + 2]
        if inner in atoms and smstr[curridx + 3] == ')': #(O)
            return 1
        if inner == '=' and smstr[curridx + 3] in atoms: #(=O)
            return 2
        if (inner == '[' and smstr[curridx + 3] == 'O'
                and smstr[curridx + 4] == ']' and smstr[curridx + 5] == ')'): #([O])
            return 1
        return 0

    # seek left
    if curridx == 0 or smstr[curridx - 1] != ')':
        return 0
    inner = smstr[curridx - 2]
    if inner in atoms and smstr[curridx - 3] == '(':
        return 0
    if inner == '=' and smstr[curridx - 3] in atoms:
        return 2
    return 0
    

def convertNewSmilesToOldSmiles(smstr):
    """Rewrite a "new" SMILES string into the old ``[C-n]`` / ``[O-1]`` notation.

    Every ``'1'`` character is removed first.  Outside brackets, characters
    are copied through unchanged except that ``(H)`` groups are dropped.  Each
    bracketed atom is replaced: carbon becomes ``[C-n]`` where ``n`` is 4
    minus the valence used by explicit hydrogens, chain neighbours and
    adjacent branches; anything else becomes ``[O-1]``.

    Args:
        smstr: SMILES string in the new notation.

    Returns:
        The converted string.
    """
    smstr = smstr.replace('1','')
    pieces = []
    total = len(smstr)
    pos = 0
    while pos < total:
        ch = smstr[pos]
        if ch != '[':
            if ch == '(' and pos < total - 2 and smstr[pos:pos + 3] == '(H)':
                pos += 3 # explicit hydrogen branch is dropped
                continue
            pieces.append(ch)
            pos += 1
            continue

        # Bracket atom: work out how many explicit H it carries and how far
        # the token extends (span is the offset of its last character).
        hcount = 0
        span = 2
        if smstr[pos + 2] == 'H':
            if smstr[pos + 3] == ']':
                hcount = 1
                span += 1
            else:
                hcount = int(smstr[pos + 3])
                span += 2
        if pos < total - 5 and smstr[pos + 3:pos + 6] == '(H)':
            hcount = 1
            span += 3

        token_end = pos + span
        used = hcount
        if isElementNextToCurr(smstr, pos, 0):
            used += 1
        used += BranchBondCountNextToCurr(smstr, pos, 0)
        if isElementNextToCurr(smstr, token_end, 1):
            used += 1
        used += BranchBondCountNextToCurr(smstr, token_end, 1)

        if smstr[pos + 1] == 'C':
            pieces.append('[C-' + str(4 - used) + ']')
        else:
            # Oxygen free valence is fixed at 1 rather than computed.
            pieces.append('[O-1]')
        pos = token_end + 1
    return ''.join(pieces)


In [5]:
### USE THIS CELL OR THE PREVIOUS CELL ##########

def isElementNextToCurr(smstr, curridx, seekright = 1, norecurse = 0):
    """Check for a neighbouring element next to ``curridx`` (non-ring variant).

    Unlike the ring variant, the string boundary counts as "no neighbour":
    the function returns False at the last index (seeking right) or at index
    0 (seeking left).

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Position to look from.
        seekright: 1 to look to the right, anything else to look to the left.
        norecurse: 0 on the initial call; the function passes 1 on its own
            re-entrant calls, which disables the parenthesis pre-checks on
            the character at ``curridx`` itself.

    Returns:
        True or False as described in the inline comments; None when the
        adjacent character matches none of the tested cases (the function
        falls off the end without an explicit return).
    """
    if seekright == 1:
        if curridx == len(smstr)-1:
            return False
        # Pre-checks for the case where curridx itself sits on a ')'.
        # They only run on the initial call (norecurse == 0).
        if smstr[curridx] == ')' and smstr[curridx-2] == '(' and norecurse == 0:
            return False #isElementNextToCurr(smstr, curridx, seekright, 1)
        if smstr[curridx] == ')' and smstr[curridx-1] != '=' and norecurse == 0:
            return False
        if smstr[curridx] == ')' and smstr[curridx-1] == '=' and norecurse == 0:
            # ')' directly preceded by '=': redo the lookup from the same
            # position with the pre-checks disabled.
            return isElementNextToCurr(smstr, curridx, seekright, 1)
        if smstr[curridx+1] == 'C' or smstr[curridx+1] == 'O':
            return True
        if smstr[curridx+1] == '[':
            return True
        if smstr[curridx+1] == '(':
            # Hop over the branch: 3 characters, or 4 when a '=' is among the
            # two characters after the '('.
            idx_plus = 3
            if smstr[curridx+2] == '=' or smstr[curridx+3] == '=':
                idx_plus += 1
            return isElementNextToCurr(smstr, curridx+idx_plus, seekright, 1)
    else: # seek left
        if curridx == 0:
            return False
        # Pre-checks for the case where curridx itself sits on a '('.
        # NOTE(review): for '(' followed two characters later by ')', this
        # side re-runs the lookup, whereas the seek-right side returns False
        # for the mirrored case -- confirm the asymmetry is intended.
        if smstr[curridx] == '(' and smstr[curridx+2] == ')' and norecurse == 0:
            return isElementNextToCurr(smstr, curridx, seekright, 1)
        if smstr[curridx] == '(' and smstr[curridx+1] != '=' and norecurse == 0:
            return False
        if smstr[curridx] == '(' and smstr[curridx+1] == '=' and norecurse == 0:
            return isElementNextToCurr(smstr, curridx, seekright, 1)
        if smstr[curridx-1] == 'C' or smstr[curridx-1] == 'O':
            return True
        if smstr[curridx-1] == ']':
            return True
        if smstr[curridx-1] == ')':
            # Hop back over the branch: 3 characters, or 4 when a '=' is
            # among the two characters before the ')'.
            idx_minus = 3
            if smstr[curridx-2] == '=' or smstr[curridx-3] == '=':
                idx_minus += 1
            return isElementNextToCurr(smstr, curridx-idx_minus, seekright, 1)
        
def BranchBondCountNextToCurr(smstr, curridx, seekright = 1):
    """Return the bond order contributed by a branch adjacent to ``curridx``.

    Seeking right, a branch opening directly after ``curridx`` counts 1 for
    ``(C)``/``(O)`` and 2 for ``(=C)``/``(=O)``.  Seeking left, a branch
    closing directly before ``curridx`` counts 2 only when a ``=`` sits
    immediately before the closing parenthesis (as in ``(O=)``); a plain
    ``(C)``/``(O)`` counts 0.

    Args:
        smstr: SMILES-like string being scanned.
        curridx: Position to look from.
        seekright: 1 to inspect the right side, anything else for the left.

    Returns:
        0, 1 or 2.
    """
    def is_atom(symbol):
        return symbol == 'C' or symbol == 'O'

    if seekright == 1:
        if curridx != len(smstr) - 1 and smstr[curridx + 1] == '(':
            head = smstr[curridx + 2]
            if is_atom(head) and smstr[curridx + 3] == ')':
                return 1
            if head == '=' and is_atom(smstr[curridx + 3]):
                return 2
        return 0

    # seek left
    if curridx != 0 and smstr[curridx - 1] == ')':
        tail = smstr[curridx - 2]
        if is_atom(tail) and smstr[curridx - 3] == '(':
            return 0
        if tail == '=' and is_atom(smstr[curridx - 3]):
            return 2
    return 0
    
def convertNewSmilesToOldSmiles(smstr):
    """Rewrite bracketed atoms of a SMILES string as ``[C-n]`` / ``[O-n]``.

    Characters outside brackets are copied through unchanged.  For every
    bracketed atom, ``n`` is the element's valence (4 for ``C``, 2 otherwise)
    minus the valence used by explicit hydrogens, chain neighbours and
    adjacent branches.

    Args:
        smstr: SMILES string in the new notation.

    Returns:
        The converted string.
    """
    pieces = []
    pos = 0
    while pos < len(smstr):
        if smstr[pos] != '[':
            pieces.append(smstr[pos])
            pos += 1
            continue

        # Explicit hydrogens inside the bracket: '[XH]' -> 1, '[XHk]' -> k.
        hcount = 0
        span = 2
        if smstr[pos + 2] == 'H':
            if smstr[pos + 3] == ']':
                hcount = 1
                span += 1
            else:
                hcount = int(smstr[pos + 3])
                span += 2

        token_end = pos + span
        used = hcount
        if isElementNextToCurr(smstr, pos, 0):
            used += 1
        used += BranchBondCountNextToCurr(smstr, pos, 0)
        if isElementNextToCurr(smstr, token_end, 1):
            used += 1
        used += BranchBondCountNextToCurr(smstr, token_end, 1)

        if smstr[pos + 1] == 'C':
            pieces.append('[C-' + str(4 - used) + ']')
        else:
            pieces.append('[O-' + str(2 - used) + ']')
        pos = token_end + 1
    return ''.join(pieces)


In [13]:
def sortRowsOfOneFileByAnother(masterFile, fileToSort, masterColToMatch, destColToMatch):
    """Reorder the rows of one CSV so they follow the row order of another.

    For each row of ``masterFile + '.csv'`` (looked up by index labels
    0..n-1), the first row of ``fileToSort + '.csv'`` whose
    ``destColToMatch`` equals the master's ``masterColToMatch`` is copied to
    the same position of the output, which is written to
    ``fileToSort + '_sorted.csv'``.

    Args:
        masterFile: Path of the reference CSV, without the ``.csv`` suffix.
        fileToSort: Path of the CSV to reorder, without the ``.csv`` suffix.
        masterColToMatch: Key column name in the reference CSV.
        destColToMatch: Key column name in the CSV being reordered.

    Raises:
        IndexError: If a master key has no matching row in ``fileToSort``.
    """
    reference = pd.read_csv(masterFile + '.csv', index_col=0, skiprows=0)
    unsorted = pd.read_csv(fileToSort + '.csv', index_col=0, skiprows=0)
    row_count = reference.shape[0]
    reordered = pd.DataFrame(index=range(row_count), columns=unsorted.columns.values)
    for row in range(row_count):
        wanted = reference.loc[row, masterColToMatch]
        candidates = unsorted[unsorted[destColToMatch] == wanted]
        # Only the first match is used when the key is duplicated.
        reordered.iloc[row, :] = candidates.values.tolist()[0]
    reordered.to_csv(fileToSort + '_sorted.csv')

def prepareFingerprintFile(inputcsv, leadingcols, smilesIdx, outputcsv, collist, isRingStruct = True):
    """Compute fingerprints for every SMILES in a CSV and write them to a new CSV.

    Reads ``inputcsv + '.csv'``, keeps only ``leadingcols``, converts each
    row's SMILES with ``convertNewSmilesToOldSmiles``, fingerprints it with
    ``readSmilesToFingerprints`` and writes the leading columns followed by
    the fingerprint columns to ``outputcsv + '.csv'``.

    Args:
        inputcsv: Input path without the ``.csv`` suffix.
        leadingcols: List of column names copied through unchanged.
        smilesIdx: Position, within ``leadingcols``, of the SMILES column.
        outputcsv: Output path without the ``.csv`` suffix.
        collist: List of fingerprint column names; its length must match the
            length of the vector returned by ``readSmilesToFingerprints``.
        isRingStruct: Forwarded to ``readSmilesToFingerprints``.  Defaults to
            True, which is the value that used to be hard-coded here.
    """
    dfin = pd.read_csv(inputcsv + '.csv', index_col = 0, skiprows = 0)[leadingcols]
    n_lead = len(leadingcols)
    dffp = pd.DataFrame(index = range(dfin.shape[0]), columns = leadingcols + collist)

    for i in range(dfin.shape[0]):
        dffp.iloc[i, :n_lead] = dfin.iloc[i, :n_lead]
        oldsmiles = convertNewSmilesToOldSmiles(dfin.iloc[i, smilesIdx])
        dffp.iloc[i, n_lead:] = readSmilesToFingerprints(oldsmiles, isRingStruct)
    dffp.to_csv(outputcsv + '.csv')

In [29]:
# Scratch check: convert one ring SMILES to the old notation, print it, and
# fingerprint it as a ring structure.  Alternative inputs are kept in the
# trailing comments.
# NOTE(review): the recorded output below has 26 entries while
# readSmilesToFingerprints builds a 37-entry vector, so the output was
# presumably produced by an earlier version -- re-run to confirm.
convstr = convertNewSmilesToOldSmiles('C1([O])CCC([O])O1')#('C1(=O)CCC(=O)O1')#('[C]1(H)[C][CH]C([O])O1')#('C(=O)CC[C](O)')
print(convstr)
#convstr = 'C(=O)CC[C-2](O)'
#convstr = '[C-1][C-2][C-1]C([O-1])O'
readSmilesToFingerprints(convstr, True)

C([O-1])CCC([O-1])O


array([6., 4., 3., 1., 2., 4., 0., 0., 0., 6., 2., 0., 0., 0., 0., 2., 3.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [15]:
# Scratch check: show what a SMILES string looks like once every '1'
# character has been removed.
cstr = '[C]1(H)[C][CH]C([O])O1'
cstr = cstr.replace('1','')
print(cstr)

[C](H)[C][CH]C([O])O


In [21]:
def convertSMILES_file_to_csv(filename, input_cols, num_rows):
    """Convert a whitespace-separated text table into ``filename + '.csv'``.

    The first line of the input is treated as a header and skipped.  Every
    following non-blank line is split on whitespace and stored as one row.
    The input file is closed before the CSV is written; an empty input file
    and blank lines are tolerated.

    Args:
        filename: Path of the text file to read; the CSV is written next to
            it with ``.csv`` appended.
        input_cols: Column names for the output; each data line must split
            into exactly this many fields.
        num_rows: Number of rows to allocate in the output table.  Rows that
            are never filled stay empty.

    Raises:
        IndexError: If the file holds more data lines than ``num_rows``.
        ValueError: If a data line does not have ``len(input_cols)`` fields.
    """
    df = pd.DataFrame(index=range(num_rows), columns = input_cols)
    with open(filename, 'r') as f:
        next(f, None) # skip the header line; no error on an empty file
        i = 0
        for line in f:
            fields = line.split()
            if not fields:
                continue # blank line, e.g. trailing newline at end of file
            df.iloc[i,:] = fields
            i += 1
    df.to_csv(filename + '.csv')
    
def collapseOrMergeUnwantedColumns(filepath, outfile):
    """Fold branch-oxygen counts into the C-O columns and drop unused columns.

    For C0 and C1, each ``Cx-Oy`` column becomes the sum of itself and the
    matching ``Cxbranch-Oy`` column.  The element totals, all C2-related
    columns listed below and every branch column are then removed, and the
    result is written to ``outfile``.

    Args:
        filepath: Path of the expanded fingerprint CSV (first column is the
            index).
        outfile: Path of the CSV to write.
    """
    table = pd.read_csv(filepath, index_col=0, skiprows=0)

    merge_pairs = (
        ('C0-O0', 'C0branch-O0'),
        ('C0-O1', 'C0branch-O1'),
        ('C1-O0', 'C1branch-O0'),
        ('C1-O1', 'C1branch-O1'),
    )
    for chain_col, branch_col in merge_pairs:
        table[chain_col] = table[chain_col] + table[branch_col]

    unwanted = [
        'H', 'C', 'O', 'C2-H', 'C2-O0', 'C2-O1',
        'C0branch-O0', 'C1branch-O0', 'C2branch-O0',
        'C0branch-O1', 'C1branch-O1', 'C2branch-O1',
    ]
    table = table.drop(columns=unwanted)
    table.to_csv(outfile)


# Column names of the 37-entry fingerprint vector, in vector order.
collist = ['H', 'C', 'O', 'O0', 'O1', 'C0', 'C1', 'C2', 'C3', ###Idx 0-8
           'C0-H','C1-H','C2-H','C0=O','C1=O','O-H', ###Idx 9-14
           'C0-O0', 'C0-O1', 'C1-O0', 'C1-O1', 'C2-O0', 'C2-O1', ###Idx 15-20
           'C0branch-O0','C1branch-O0','C2branch-O0','C0branch-O1','C1branch-O1','C2branch-O1', ###Idx 21-26
           'C0-C0', 'C0-C1', 'C0-C2', 'C0-C3', 'C1-C1', 'C1-C2', 'C1-C3', 'C2-C2', 'C2-C3', 'C3-C3'] ###Idx 27-36
    
# SUCC ring pipeline, three steps that each consume the previous step's file:
#   1. fingerprint every SMILES in SUCC_RING_SMILES.csv,
#   2. reorder the fingerprint rows to follow the 'species' order of the
#      energies file (writes ..._Expanded_sorted.csv),
#   3. merge/drop columns (writes ..._Expanded_CUT_sorted.csv).
print('starting flat fingerprint generation for SUCC Ring ...')
prepareFingerprintFile('SUCC_RING_SMILES', ['species', 'smiles'], 1,
                       'SUCC_Ring_Flat_Fingerprints_Expanded', collist)
sortRowsOfOneFileByAnother('SUCC_Ring_min_energies_new_090319', 'SUCC_Ring_Flat_Fingerprints_Expanded', 
                           'species', 'species')
collapseOrMergeUnwantedColumns('SUCC_Ring_Flat_Fingerprints_Expanded_sorted.csv',
                              'SUCC_Ring_Flat_Fingerprints_Expanded_CUT_sorted.csv')
#collapseOrMergeUnwantedColumns('SUCC_Ring_Flat_Fingerprints_NEW_sorted.csv',
                              #'SUCC_Ring_Flat_Fingerprints_NEW_CUT_sorted.csv')

starting flat fingerprint generation for SUCC Ring ...


In [29]:
def collapseOrMergeUnwantedColumns(filepath, outfile):
    """Fold branch-oxygen counts into the C-O columns (26-column layout).

    ``C-O0`` and ``C-O1`` each become the sum of themselves and the matching
    ``CbranchO0`` / ``CbranchO1`` column.  The element totals and the branch
    columns are then removed and the result is written to ``outfile``.

    Args:
        filepath: Path of the all-columns fingerprint CSV (first column is
            the index).
        outfile: Path of the CSV to write.
    """
    table = pd.read_csv(filepath, index_col=0, skiprows=0)
    for chain_col, branch_col in (('C-O0', 'CbranchO0'), ('C-O1', 'CbranchO1')):
        table[chain_col] = table[chain_col] + table[branch_col]
    unwanted = ['H', 'C', 'O', 'CbranchO0', 'CbranchO1']
    table = table.drop(columns=unwanted)
    table.to_csv(outfile)

# Column names of the 26-entry fingerprint layout.
# NOTE(review): readSmilesToFingerprints allocates a 37-entry vector, so this
# 26-name list presumably belongs to an earlier version of that function --
# confirm before re-running this cell.
collist = ['H', 'C', 'O', 'O0', 'O1', 'C0', 'C1', 'C2', 'C3', 'C-H', 'C-O0', 'C-O1', 'C=O', 'O-H','CbranchO0','CbranchO1',
             'C0-C0', 'C0-C1', 'C0-C2', 'C0-C3', 'C1-C1', 'C1-C2', 'C1-C3', 'C2-C2', 'C2-C3', 'C3-C3']

# Each data set below goes through the same three steps, every step reading
# the file written by the previous one: fingerprint the SMILES, reorder the
# rows to follow the energies file, then merge/drop columns.
print('starting flat fingerprint generation for PAC ...')
prepareFingerprintFile('Extrapolation_paper_data/PAC_SMILES', ['formula', 'smiles'], 1,
                       'Extrapolation_paper_data/PAC_Flat_Fingerprints_allcols', collist)
sortRowsOfOneFileByAnother('Extrapolation_paper_data/PAC_Energies', 'Extrapolation_paper_data/PAC_Flat_Fingerprints_allcols', 
                           'species', 'formula')
collapseOrMergeUnwantedColumns('Extrapolation_paper_data/PAC_Flat_Fingerprints_allcols_sorted.csv',
                              'Extrapolation_paper_data/PAC_Flat_Fingerprints.csv')

print('starting flat fingerprint generation for PAC alcohol ...')
prepareFingerprintFile('Extrapolation_paper_data/PAC_alcohol_SMILES', ['formula', 'smiles'], 1,
                       'Extrapolation_paper_data/PAC_alcohol_Flat_Fingerprints_allcols', collist)
sortRowsOfOneFileByAnother('Extrapolation_paper_data/PAC_alcohol_2001_Energies', 
                           'Extrapolation_paper_data/PAC_alcohol_Flat_Fingerprints_allcols', 
                           'species', 'formula')
collapseOrMergeUnwantedColumns('Extrapolation_paper_data/PAC_alcohol_Flat_Fingerprints_allcols_sorted.csv',
                              'Extrapolation_paper_data/PAC_alcohol_Flat_Fingerprints.csv')

# SUCC input has three leading columns, so the SMILES column is at position 2.
print('starting flat fingerprint generation for SUCC ...')
prepareFingerprintFile('Extrapolation_paper_data/SUCC_SMILES', ['species','formula', 'smiles'], 2,
                       'Extrapolation_paper_data/SUCC_Flat_Fingerprints_allcols', collist)
sortRowsOfOneFileByAnother('Extrapolation_paper_data/SUCC_Energies', 
                           'Extrapolation_paper_data/SUCC_Flat_Fingerprints_allcols', 
                           'species', 'species')
collapseOrMergeUnwantedColumns('Extrapolation_paper_data/SUCC_Flat_Fingerprints_allcols_sorted.csv',
                              'Extrapolation_paper_data/SUCC_Flat_Fingerprints.csv')

print('starting flat fingerprint generation for SUCC DCX ...')
prepareFingerprintFile('Extrapolation_paper_data/SUCC_DCX_SMILES', ['species', 'smiles'], 1,
                       'Extrapolation_paper_data/SUCC_DCX_Flat_Fingerprints_allcols', collist)
sortRowsOfOneFileByAnother('Extrapolation_paper_data/SUCC_DCX_Energies', 
                           'Extrapolation_paper_data/SUCC_DCX_Flat_Fingerprints_allcols', 
                           'species', 'species')
collapseOrMergeUnwantedColumns('Extrapolation_paper_data/SUCC_DCX_Flat_Fingerprints_allcols_sorted.csv',
                              'Extrapolation_paper_data/SUCC_DCX_Flat_Fingerprints.csv')

starting flat fingerprint generation for PAC ...
starting flat fingerprint generation for PAC alcohol ...
starting flat fingerprint generation for SUCC ...
starting flat fingerprint generation for SUCC DCX ...
