In [2]:
import pandas as pd

train_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\from papers\scaffold_train_three_all_new_norm.csv'
test_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\from papers\scaffold_test_three_all_new_norm.csv'
valid_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\from papers\scaffold_valid_three_all_new_norm.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_valid = pd.read_csv(valid_path)

In [11]:
df = pd.concat([df_train, df_test, df_valid], axis=0)[['smiles', 'logp']].dropna()

In [44]:
df

Unnamed: 0,smiles,logp
0,CCN(CCO)C(=O)COC(=O)c1ccccc1,0.93
2,Brc1ccccc1,2.99
5,COc1cc(C=CC(=O)O)ccc1O,1.51
8,C#CCN(C)C(C)Cc1ccccc1,2.90
14,O=[N+]([O-])c1cc([N+](=O)[O-])c(O)c([N+](=O)[O...,1.33
...,...,...
3316,COc1ccc2sc(C(=O)N=c3nn[nH][nH]3)c(OC(C)C)c2c1,1.78
3317,CCCCOc1ccnc(C=CC2=CC([N+](=O)[O-])O2)n1,2.04
3318,O=C(CO)CN=c1[nH]cc[nH]c1=[N+]([O-])O,-0.49
3319,COc1ccc2c(C=Nc3c(C)n(C)n(-c4ccccc4)c3=O)c(=O)n...,4.66


In [20]:
fluor_df = df[df['smiles'].str.contains('F')]

In [21]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdForceFieldHelpers

def calculate_matches(smiles):
    
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)
    rdForceFieldHelpers.MMFFSanitizeMolecule(mol)

    AllChem.EmbedMolecule(mol, randomSeed=42)

    carboxile_submol = Chem.MolFromSmiles('CC=O')
    nitro_amine_submol = Chem.MolFromSmiles('CN')

    carboxile_matches = mol.GetSubstructMatches(carboxile_submol)
    nitro_amine_matches = mol.GetSubstructMatches(nitro_amine_submol)

    return carboxile_matches, nitro_amine_matches

In [42]:
fluor_df

Unnamed: 0,smiles,logp
204,CC(C)C(=O)Nc1ccc([N+](=O)[O-])c(C(F)(F)F)c1,3.35
208,NC(Cc1ccc(F)cc1)C(=O)O,-1.89
900,O=C(O)c1ccccc1Nc1cccc(C(F)(F)F)c1,5.25
925,O=C(O)Cc1ccccc1Nc1c(F)cccc1Cl,3.80
926,O=C(O)Cc1ccccc1Nc1c(F)cccc1F,3.57
...,...,...
3307,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21,0.27
3308,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNC(C)C3)c(F)c21,-0.30
3309,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNC(C)C3)cc21,0.34
3310,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)c(F)c21,-0.57


In [34]:
all_smiles = []

for _, row in fluor_df.iterrows():
    SMILES = row['smiles']

    all_smiles.append(SMILES.lower())
    carboxile_matches, nitro_amine_matches = calculate_matches(smiles=SMILES)

    if len(carboxile_matches) + len(nitro_amine_matches) == 1:
        print(f"SMILES: {SMILES}, carboxic matches: {len(carboxile_matches)}, nitro matches: {len(nitro_amine_matches)}")


SMILES: C=C(C)C(=O)OCC(F)(F)F, carboxic matches: 1, nitro matches: 0
SMILES: COC(=O)C(F)(F)F, carboxic matches: 1, nitro matches: 0
SMILES: NCC(F)(F)F, carboxic matches: 0, nitro matches: 1
SMILES: CCOC(=O)C(F)(F)C(F)(F)F, carboxic matches: 1, nitro matches: 0
SMILES: CCOC(=O)C(F)(F)F, carboxic matches: 1, nitro matches: 0
SMILES: CCOC(=O)C(F)F, carboxic matches: 1, nitro matches: 0
SMILES: O=C(O[Na])C(F)(F)C(F)F, carboxic matches: 1, nitro matches: 0
SMILES: Cc1c(F)c(F)c(C(=O)O)c(F)c1F, carboxic matches: 1, nitro matches: 0
SMILES: Cc1ccc(NS(=O)(=O)C(F)(F)F)cc1, carboxic matches: 0, nitro matches: 1
SMILES: CC(=O)Oc1cccc(C(F)(F)F)c1, carboxic matches: 1, nitro matches: 0
SMILES: CC(=O)Oc1cccc(F)c1, carboxic matches: 1, nitro matches: 0
SMILES: CC(=O)Oc1ccccc1C(F)(F)F, carboxic matches: 1, nitro matches: 0
SMILES: CC(=O)Oc1ccccc1F, carboxic matches: 1, nitro matches: 0
SMILES: CC(C)(C)S(=O)(=O)C(C#N)C=Nc1ccc(OC(F)(F)F)cc1, carboxic matches: 0, nitro matches: 1
SMILES: CC(C)(C)[N+]([O-]

In [40]:
our_dataset = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\init_data\pKa_Prediction_Starting data_2024.01.25.csv'

our_df = pd.read_csv(our_dataset)

our_smiles = our_df['Smiles'].to_list()[1:]

In [43]:
all_smiles

['cc(c)c(=o)nc1ccc([n+](=o)[o-])c(c(f)(f)f)c1',
 'nc(cc1ccc(f)cc1)c(=o)o',
 'o=c(o)c1ccccc1nc1cccc(c(f)(f)f)c1',
 'o=c(o)cc1ccccc1nc1c(f)cccc1cl',
 'o=c(o)cc1ccccc1nc1c(f)cccc1f',
 'fc(f)(f)c1nc2ccccc2[nh]1',
 'cnccc(oc1ccc(c(f)(f)f)cc1)c1ccccc1',
 'cc1cn(c2cc3c(cc2f)c(=o)c(c(=o)o)cn3-c2ccc(f)cc2f)ccn1',
 'coc1ccc(oc)c(cn(c(c)=o)c2cc(f)ccc2oc2ccccc2)c1',
 'coc1ccc(occf)c(cn(c(c)=o)c2cc(f)ccc2oc2ccccc2)c1',
 'coc1ccc(oci)c(cn(c(c)=o)c2cc(f)ccc2oc2ccccc2)c1',
 'coc1ccc(ocf)c(cn(c(c)=o)c2cc(f)ccc2oc2ccccc2)c1',
 'ccoc(=o)c1ncn2c1cn(c)c(=o)c1cc(f)ccc1-2',
 'cc(c)c(nc(=o)cn1c(-c2ccccc2)ccc(n)c1=o)c(=o)c(f)(f)f',
 'occn1ccn(cccn2c3ccccc3sc3ccc(c(f)(f)f)cc32)cc1',
 'cn1ccn(cccn2c3ccccc3sc3ccc(c(f)(f)f)cc32)cc1',
 'cn1c(=o)cn=c(c2ccccc2f)c2cc([n+](=o)[o-])ccc21',
 'cn(c)cccn1c2ccccc2sc2ccc(c(f)(f)f)cc21',
 'o=c(o)c1cn(c2cc2)c2cc(n3ccncc3)c(f)cc2c1=o',
 'o=c1[nh]c2c(o)ccc(ccncccs(=o)(=o)nccoccc3cccc(c(f)(f)f)c3)c2s1',
 'o=c1[nh]c2c(o)ccc(ccncccs(=o)(=o)nccoccc3ccc(f)cc3)c2s1',
 'ccn1cc(c(=o)o)c

In [41]:
for our_smile in our_smiles:
    if our_smile.lower() in all_smiles:
        print(our_smile)