In [173]:
from rdkit import Chem

def convert_to_canon(smi):
    """Return the RDKit canonical form of ``smi``, or ``smi`` itself on failure.

    Args:
        smi: Value to canonicalise, normally a SMILES string.

    Returns:
        ``Chem.CanonSmiles(smi)`` when that call succeeds; otherwise the
        input value, unchanged.
    """
    try:
        return Chem.CanonSmiles(smi)
    except Exception:
        # Deliberate best-effort: keep the original value instead of failing
        # the whole column. ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt / SystemExit still propagate.
        return smi

In [174]:
import pandas as pd

# Load the logP train and test splits; the first CSV column becomes the index.
train_df = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv\train.csv', index_col=0)
test_df = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv\test.csv', index_col=0)

# Stack both splits row-wise into one frame, then canonicalise the SMILES
# column (convert_to_canon returns the value unchanged if RDKit fails on it).
df = pd.concat([train_df, test_df], axis=0)
df['Smiles'] = df['Smiles'].apply(convert_to_canon)

In [175]:
# Starting-data table; it provides the 'F group' label for each molecule.
df_init = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\init_data\pKa_Prediction_Starting data_2024.01.25.csv', index_col=0)

# Canonicalise the amide SMILES so they can be used as lookup keys for the
# canonicalised df['Smiles'] values; unparsable/missing values pass through.
df_init['Amides for LogP'] = df_init['Amides for LogP'].apply(convert_to_canon)

In [176]:
# Lookup table: 'Amides for LogP' value -> 'F group' label, built from df_init.
# Rows whose 'Smiles' cell is null are left out; on repeated keys the last
# row wins.
# NOTE(review): the null check is on 'Smiles' while the key comes from
# 'Amides for LogP' - confirm that this is intended.
SMILES_to_functional_group = {
    amide_smiles: f_group
    for parent_smiles, amide_smiles, f_group in zip(
        df_init['Smiles'], df_init['Amides for LogP'], df_init['F group']
    )
    if not pd.isnull(parent_smiles)
}

In [177]:
# 'F group' label -> SMILES fragment. The fragment is parsed and used as the
# substructure query whose matching atoms remove_functional_group_from_smiles
# deletes. "gem-CF2" and "CHF" both map to a single fluorine atom; "non-F"
# maps to an empty fragment.
functional_group_to_smiles = {
    "CF3": "C(F)(F)F", 
    "CH2F": "CF", 
    "gem-CF2": "F", 
    "CHF2": "C(F)(F)",
    "CHF": "F",
    "non-F": ""
}

In [178]:
from rdkit import Chem

def remove_functional_group_from_smiles(smiles, f_group):
    """Delete a fluorinated group's atoms from a molecule.

    Args:
        smiles: SMILES string of the molecule.
        f_group: Key of ``functional_group_to_smiles`` naming the group whose
            fragment is used as the substructure query.

    Returns:
        Canonical SMILES (``Chem.CanonSmiles``) of the molecule after the
        atoms matching the fragment have been deleted. If the fragment does
        not match, the canonical SMILES of the unchanged molecule.

    Raises:
        KeyError: ``f_group`` is not a key of ``functional_group_to_smiles``.
        ValueError: RDKit could not parse ``smiles``.
    """
    f_group_smiles = functional_group_to_smiles[f_group]

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an AttributeError further down.
        raise ValueError(f"Cannot parse SMILES: {smiles!r}")
    mol_f_group = Chem.MolFromSmiles(f_group_smiles)

    if mol.HasSubstructMatch(mol_f_group):
        mol = Chem.DeleteSubstructs(mol, mol_f_group)

    # Round-trip through CanonSmiles so the result is comparable with the
    # canonicalised 'Smiles' column.
    return Chem.CanonSmiles(Chem.MolToSmiles(mol))

# Per-row values, all in the row order of df.
# Group label looked up by the SMILES currently stored in df (KeyError if absent).
functional_groups = [SMILES_to_functional_group[smi] for smi in df['Smiles']]
# RDKit canonical SMILES of every molecule.
canon_smiles = [Chem.CanonSmiles(smi) for smi in df['Smiles']]
# Canonical SMILES with the atoms of the row's fluorinated group deleted.
smiles_with_removed_flurine = [
    remove_functional_group_from_smiles(smiles=smi, f_group=group)
    for smi, group in zip(canon_smiles, functional_groups)
]

df['Smiles'] = canon_smiles
df['F group'] = functional_groups
df['non-F Smiles'] = smiles_with_removed_flurine

In [179]:
# Independent copy of the annotated frame, so later changes to df do not affect it.
main_df_for_train = df.copy()

In [180]:
main_df_for_train

Unnamed: 0,fold_id,logP,Smiles,F group,non-F Smiles
1,0.0,2.88,O=C(Nc1ccccc1)C1CCCCC1,non-F,O=C(Nc1ccccc1)C1CCCCC1
2,1.0,2.87,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
3,1.0,2.92,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
4,0.0,2.37,O=C(Nc1ccccc1)[C@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
5,0.0,2.88,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
...,...,...,...,...,...
126,,1.07,O=C(NCCF)c1ccccc1,CH2F,CNC(=O)c1ccccc1
103,,2.48,O=C(NC1(C(F)F)CCC1)c1ccccc1,CHF2,O=C(NC1CCC1)c1ccccc1
49,,2.42,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1,non-F,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1
22,,2.72,O=C(Nc1ccccc1)C1CCCC(F)(F)C1,gem-CF2,O=C(Nc1ccccc1)C1CCCCC1


In [186]:
# All SMILES present in the dataset, collected once so that each lookup below
# is O(1) instead of a full re-scan of the frame for every row.
smiles_in_dataset = set(main_df_for_train['Smiles'])

index_to_remain = []
for index_non_F, row_non_F in main_df_for_train.iterrows():
    SMILES_to_find = row_non_F['non-F Smiles']
    # True when the fluorine-stripped counterpart is itself a row of the
    # dataset. Diagnostic only: it does not influence which rows are kept.
    is_on_train = SMILES_to_find in smiles_in_dataset

    # Every row is kept. Restricting this to fluorinated rows
    # (row_non_F['F group'] != 'non-F') is currently disabled.
    index_to_remain.append(index_non_F)


In [188]:
# For every retained molecule, build the split in which that molecule and its
# fluorine-stripped counterpart are held out of df, and print the held-out
# index labels together with the F-group labels they cover.
df_for_train = main_df_for_train.copy().loc[index_to_remain]

for index, row in df_for_train.iterrows():
    SMILES = row['Smiles']
    SMILES_non_F = row['non-F Smiles']

    # Rows of df that are this molecule or its non-fluorinated counterpart.
    # One vectorised comparison replaces a per-row iterrows scan of df; the
    # labels come out in df row order, exactly as before.
    smiles_column = df['Smiles']
    is_held_out = (smiles_column == SMILES) | (smiles_column == SMILES_non_F)
    indexes_to_remove = df.index[is_held_out].tolist()

    print(indexes_to_remove)
    # df_train / df_test are overwritten on every iteration; only the last
    # split survives the loop.
    df_train = df.drop(indexes_to_remove, axis=0, inplace=False)
    df_test = df.loc[indexes_to_remove]

    print(df_test['F group'].unique())


[1]
['non-F']
[2]
['non-F']
[3]
['non-F']
[3, 4]
['non-F' 'gem-CF2']
[2, 5]
['non-F' 'gem-CF2']
[6, 30]
['non-F']
[6, 7, 30]
['non-F' 'CH2F']
[6, 8, 30]
['non-F' 'CHF2']
[6, 9, 30]
['non-F' 'CF3']
[6, 10, 30]
['non-F' 'CF3']
[6, 11, 30]
['non-F' 'CF3']
[6, 12, 30]
['non-F' 'CHF2']
[6, 13, 30]
['non-F' 'CHF2']
[6, 14, 30]
['non-F' 'CH2F']
[6, 15, 30]
['non-F' 'CH2F']
[6, 18, 30]
['non-F' 'gem-CF2']
[19, 31]
['gem-CF2' 'non-F']
[20, 31]
['gem-CF2' 'non-F']
[1, 21]
['non-F' 'gem-CF2']
[23, 32]
['gem-CF2' 'non-F']
[24, 32]
['gem-CF2' 'non-F']
[25, 32]
['gem-CF2' 'non-F']
[26, 33]
['gem-CF2' 'non-F']
[28]
['CHF2']
[29]
['non-F']
[6, 30]
['non-F']
[31]
['non-F']
[32]
['non-F']
[33]
['non-F']
[37, 39]
['CF3' 'non-F']
[39]
['non-F']
[40]
['non-F']
[6, 30, 41]
['non-F' 'CF3']
[6, 30, 42]
['non-F' 'CHF2']
[6, 30, 43]
['non-F' 'CH2F']
[29, 44]
['non-F' 'CF3']
[29, 45]
['non-F' 'CF3']
[29, 46]
['non-F' 'CHF2']
[29, 47]
['non-F' 'CHF2']
[29, 48]
['non-F' 'CH2F']
[52]
['non-F']
[52, 53]
['non-F' 'ge

In [189]:
df_for_train

Unnamed: 0,fold_id,logP,Smiles,F group,non-F Smiles
1,0.0,2.88,O=C(Nc1ccccc1)C1CCCCC1,non-F,O=C(Nc1ccccc1)C1CCCCC1
2,1.0,2.87,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
3,1.0,2.92,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
4,0.0,2.37,O=C(Nc1ccccc1)[C@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
5,0.0,2.88,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
...,...,...,...,...,...
126,,1.07,O=C(NCCF)c1ccccc1,CH2F,CNC(=O)c1ccccc1
103,,2.48,O=C(NC1(C(F)F)CCC1)c1ccccc1,CHF2,O=C(NC1CCC1)c1ccccc1
49,,2.42,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1,non-F,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1
22,,2.72,O=C(Nc1ccccc1)C1CCCC(F)(F)C1,gem-CF2,O=C(Nc1ccccc1)C1CCCCC1


In [190]:
# Persist the annotated table (index included) for the per-SMILES model runs.
df_for_train.to_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv\train_test_for_SMILES_per_model.csv')

In [192]:
# For every retained molecule, build the split in which that molecule and its
# fluorine-stripped counterpart are held out of df, and print the held-out
# index labels together with the F-group labels they cover.
df_for_train = main_df_for_train.copy().loc[index_to_remain]

for index, row in df_for_train.iterrows():
    SMILES = row['Smiles']
    SMILES_non_F = row['non-F Smiles']

    # Rows of df that are this molecule or its non-fluorinated counterpart.
    # One vectorised comparison replaces a per-row iterrows scan of df; the
    # labels come out in df row order, exactly as before.
    smiles_column = df['Smiles']
    is_held_out = (smiles_column == SMILES) | (smiles_column == SMILES_non_F)
    indexes_to_remove = df.index[is_held_out].tolist()

    print(indexes_to_remove)
    # df_train / df_test are overwritten on every iteration; only the last
    # split survives the loop.
    df_train = df.drop(indexes_to_remove, axis=0, inplace=False)
    df_test = df.loc[indexes_to_remove]

    print(df_test['F group'].unique())


[1]
['non-F']
[2]
['non-F']
[3]
['non-F']
[3, 4]
['non-F' 'gem-CF2']
[2, 5]
['non-F' 'gem-CF2']
[6, 30]
['non-F']
[6, 7, 30]
['non-F' 'CH2F']
[6, 8, 30]
['non-F' 'CHF2']
[6, 9, 30]
['non-F' 'CF3']
[6, 10, 30]
['non-F' 'CF3']
[6, 11, 30]
['non-F' 'CF3']
[6, 12, 30]
['non-F' 'CHF2']
[6, 13, 30]
['non-F' 'CHF2']
[6, 14, 30]
['non-F' 'CH2F']
[6, 15, 30]
['non-F' 'CH2F']
[6, 18, 30]
['non-F' 'gem-CF2']
[19, 31]
['gem-CF2' 'non-F']
[20, 31]
['gem-CF2' 'non-F']
[1, 21]
['non-F' 'gem-CF2']
[23, 32]
['gem-CF2' 'non-F']
[24, 32]
['gem-CF2' 'non-F']
[25, 32]
['gem-CF2' 'non-F']
[26, 33]
['gem-CF2' 'non-F']
[28]
['CHF2']
[29]
['non-F']
[6, 30]
['non-F']
[31]
['non-F']
[32]
['non-F']
[33]
['non-F']
[37, 39]
['CF3' 'non-F']
[39]
['non-F']
[40]
['non-F']
[6, 30, 41]
['non-F' 'CF3']
[6, 30, 42]
['non-F' 'CHF2']
[6, 30, 43]
['non-F' 'CH2F']
[29, 44]
['non-F' 'CF3']
[29, 45]
['non-F' 'CF3']
[29, 46]
['non-F' 'CHF2']
[29, 47]
['non-F' 'CHF2']
[29, 48]
['non-F' 'CH2F']
[52]
['non-F']
[52, 53]
['non-F' 'ge

In [193]:
# Reload the saved table from disk; the first CSV column is restored as the index.
df_for_train = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv\train_test_for_SMILES_per_model.csv', index_col=0)

In [194]:
df_for_train

Unnamed: 0,fold_id,logP,Smiles,F group,non-F Smiles
1,0.0,2.88,O=C(Nc1ccccc1)C1CCCCC1,non-F,O=C(Nc1ccccc1)C1CCCCC1
2,1.0,2.87,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
3,1.0,2.92,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
4,0.0,2.37,O=C(Nc1ccccc1)[C@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
5,0.0,2.88,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
...,...,...,...,...,...
126,,1.07,O=C(NCCF)c1ccccc1,CH2F,CNC(=O)c1ccccc1
103,,2.48,O=C(NC1(C(F)F)CCC1)c1ccccc1,CHF2,O=C(NC1CCC1)c1ccccc1
49,,2.42,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1,non-F,C[C@@H]1C[C@H]1C(=O)Nc1ccccc1
22,,2.72,O=C(Nc1ccccc1)C1CCCC(F)(F)C1,gem-CF2,O=C(Nc1ccccc1)C1CCCCC1


In [195]:
# For every retained molecule, build the split in which that molecule and its
# fluorine-stripped counterpart are held out of df, and print the held-out
# index labels together with the F-group labels they cover.
# Note: this rebinds df_for_train from main_df_for_train, replacing whatever
# the name held before this cell ran.
df_for_train = main_df_for_train.copy().loc[index_to_remain]

for index, row in df_for_train.iterrows():
    SMILES = row['Smiles']
    SMILES_non_F = row['non-F Smiles']

    # Rows of df that are this molecule or its non-fluorinated counterpart.
    # One vectorised comparison replaces a per-row iterrows scan of df; the
    # labels come out in df row order, exactly as before.
    smiles_column = df['Smiles']
    is_held_out = (smiles_column == SMILES) | (smiles_column == SMILES_non_F)
    indexes_to_remove = df.index[is_held_out].tolist()

    print(indexes_to_remove)
    # df_train / df_test are overwritten on every iteration; only the last
    # split survives the loop.
    df_train = df.drop(indexes_to_remove, axis=0, inplace=False)
    df_test = df.loc[indexes_to_remove]

    print(df_test['F group'].unique())


[1]
['non-F']
[2]
['non-F']
[3]
['non-F']
[3, 4]
['non-F' 'gem-CF2']
[2, 5]
['non-F' 'gem-CF2']
[6, 30]
['non-F']
[6, 7, 30]
['non-F' 'CH2F']
[6, 8, 30]
['non-F' 'CHF2']
[6, 9, 30]
['non-F' 'CF3']
[6, 10, 30]
['non-F' 'CF3']
[6, 11, 30]
['non-F' 'CF3']
[6, 12, 30]
['non-F' 'CHF2']
[6, 13, 30]
['non-F' 'CHF2']
[6, 14, 30]
['non-F' 'CH2F']
[6, 15, 30]
['non-F' 'CH2F']
[6, 18, 30]
['non-F' 'gem-CF2']
[19, 31]
['gem-CF2' 'non-F']
[20, 31]
['gem-CF2' 'non-F']
[1, 21]
['non-F' 'gem-CF2']
[23, 32]
['gem-CF2' 'non-F']
[24, 32]
['gem-CF2' 'non-F']
[25, 32]
['gem-CF2' 'non-F']
[26, 33]
['gem-CF2' 'non-F']
[28]
['CHF2']
[29]
['non-F']
[6, 30]
['non-F']
[31]
['non-F']
[32]
['non-F']
[33]
['non-F']
[37, 39]
['CF3' 'non-F']
[39]
['non-F']
[40]
['non-F']
[6, 30, 41]
['non-F' 'CF3']
[6, 30, 42]
['non-F' 'CHF2']
[6, 30, 43]
['non-F' 'CH2F']
[29, 44]
['non-F' 'CF3']
[29, 45]
['non-F' 'CF3']
[29, 46]
['non-F' 'CHF2']
[29, 47]
['non-F' 'CHF2']
[29, 48]
['non-F' 'CH2F']
[52]
['non-F']
[52, 53]
['non-F' 'ge

In [161]:
# Rebinds df to the saved per-SMILES table (first CSV column as index).
# NOTE(review): this cell's execution count (161) is lower than the cells
# above it, so the displayed output may come from an earlier version of the file.
df = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\logP_lipophilicity_data\gnn_cv\train_test_for_SMILES_per_model.csv', index_col=0)

In [162]:
df

Unnamed: 0,fold_id,logP,Smiles,F group,non-F Smiles
4,0.0,2.37,O=C(Nc1ccccc1)[C@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2C[C@@H]2C1
5,0.0,2.88,O=C(Nc1ccccc1)[C@@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,O=C(Nc1ccccc1)[C@H]1C[C@@H]2C[C@@H]2C1
7,0.0,1.88,O=C(Nc1ccccc1)C1(CF)CCC1,CH2F,O=C(Nc1ccccc1)C1CCC1
8,0.0,2.19,O=C(Nc1ccccc1)C1(C(F)F)CCC1,CHF2,O=C(Nc1ccccc1)C1CCC1
9,0.0,2.51,O=C(Nc1ccccc1)C1(C(F)(F)F)CCC1,CF3,O=C(Nc1ccccc1)C1CCC1
...,...,...,...,...,...
55,,2.23,O=C(N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F)c1ccccc1,gem-CF2,O=C(N[C@H]1C[C@@H]2C[C@@H]2C1)c1ccccc1
126,,1.07,O=C(NCCF)c1ccccc1,CH2F,CNC(=O)c1ccccc1
103,,2.48,O=C(NC1(C(F)F)CCC1)c1ccccc1,CHF2,O=C(NC1CCC1)c1ccccc1
22,,2.72,O=C(Nc1ccccc1)C1CCCC(F)(F)C1,gem-CF2,O=C(Nc1ccccc1)C1CCCCC1


-------

-------

-------

-------

-------

pKa

In [266]:
import pandas as pd

# Load the basic and acid pKa splits; the first CSV column becomes the index.
train_set_basic = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_basic.csv', index_col=0)
test_set_basic = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\test_basic.csv', index_col=0)

train_set_acid = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_acid.csv', index_col=0)
test_set_acid = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\test_acid.csv', index_col=0)

# Basic rows first, then acid rows, within each split.
train_df = pd.concat([train_set_basic, train_set_acid], axis=0)
test_df = pd.concat([test_set_basic, test_set_acid], axis=0)

# Single frame with all train rows followed by all test rows. This rebinds
# the df name used by the logP section above.
df = pd.concat([train_df, test_df], axis=0)
# Canonicalisation is switched off here.
# NOTE(review): presumably the files are already canonical (directory is
# named gnn_cv_canon_smiles) - confirm.
# df['Smiles'] = df['Smiles'].apply(convert_to_canon)

In [267]:
# Starting-data table (2024.05.07 version); it provides the 'F group' label
# for each 'Smiles' value. Rebinds the df_init name used in the logP section.
df_init = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\init_data\pKa_Prediction_Starting data_2024.05.07.csv', index_col=0)
# Canonicalisation of df_init['Smiles'] is switched off here.
# df_init['Smiles'] = df_init['Smiles'].apply(convert_to_canon)

In [268]:
# 'F group' label -> SMILES fragment. The fragment is parsed and used as the
# substructure query whose matching atoms remove_functional_group_from_smiles
# deletes. "gem-CF2" and "CHF" both map to a single fluorine atom; "non-F"
# maps to an empty fragment. Same contents as the logP-section definition.
functional_group_to_smiles = {
    "CF3": "C(F)(F)F", 
    "CH2F": "CF", 
    "gem-CF2": "F", 
    "CHF2": "C(F)(F)",
    "CHF": "F",
    "non-F": ""
}

In [269]:
# Lookup table: 'Smiles' value -> 'F group' label, built from df_init.
# Rows whose 'Smiles' cell is null are left out; on repeated keys the last
# row wins.
SMILES_to_functional_group = {
    smiles: f_group
    for smiles, f_group in zip(df_init['Smiles'], df_init['F group'])
    if not pd.isnull(smiles)
}

In [270]:
from rdkit import Chem

def remove_functional_group_from_smiles(smiles, f_group):
    """Delete a fluorinated group's atoms from a molecule.

    This re-definition shadows the earlier function of the same name in the
    notebook. Unlike that one, the result of ``Chem.MolToSmiles`` is returned
    directly, without an additional ``Chem.CanonSmiles`` round trip.

    Args:
        smiles: SMILES string of the molecule.
        f_group: Key of ``functional_group_to_smiles`` naming the group whose
            fragment is used as the substructure query.

    Returns:
        ``Chem.MolToSmiles`` output for the molecule after the atoms matching
        the fragment have been deleted. If the fragment does not match, the
        output for the unchanged molecule.

    Raises:
        KeyError: ``f_group`` is not a key of ``functional_group_to_smiles``.
        ValueError: RDKit could not parse ``smiles``.
    """
    f_group_smiles = functional_group_to_smiles[f_group]

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an AttributeError further down.
        raise ValueError(f"Cannot parse SMILES: {smiles!r}")
    mol_f_group = Chem.MolFromSmiles(f_group_smiles)

    if mol.HasSubstructMatch(mol_f_group):
        mol = Chem.DeleteSubstructs(mol, mol_f_group)

    return Chem.MolToSmiles(mol)

# Stays empty in this section: the SMILES column is not re-canonicalised here.
canon_smiles = []
# Per-row group label, looked up by the SMILES stored in df (KeyError if absent).
functional_groups = [SMILES_to_functional_group[smi] for smi in df['Smiles']]
# Per-row SMILES with the atoms of the row's fluorinated group deleted.
smiles_with_removed_flurine = [
    remove_functional_group_from_smiles(smiles=smi, f_group=group)
    for smi, group in zip(df['Smiles'], functional_groups)
]

df['F group'] = functional_groups
df['non-F Smiles'] = smiles_with_removed_flurine

In [271]:
# Show each molecule next to its fluorine-stripped counterpart for a visual check.
for smiles, smiles_non_f in zip(df['Smiles'], df['non-F Smiles']):
    print(smiles, smiles_non_f)

N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@H]1C[C@@H]2C[C@@H]2C1
N[C@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@@H]1C[C@@H]2C[C@@H]2C1
NC1CCC(F)(F)CC1 NC1CCCCC1
NC1CCCCC1 NC1CCCCC1
N[C@@H]1C[C@@H]2C[C@@H]2C1 N[C@@H]1C[C@@H]2C[C@@H]2C1
N[C@H]1C[C@@H]2C[C@@H]2C1 N[C@H]1C[C@@H]2C[C@@H]2C1
FC(F)(F)C12CC(CN1)C2 C1NC2CC1C2
C1NC2CC1C2 C1NC2CC1C2
C1CCNC1 C1CCNC1
CC12CC(CN1)C2 CC12CC(CN1)C2
FC(F)(F)C12CCC(CC1)CN2 C1CC2CCC1CN2
C1CC2CC1CN2 C1CC2CC1CN2
C1CC2CCC1CN2 C1CC2CCC1CN2
C1CCNCC1 C1CCNCC1
FC(F)(F)C12CCCC(CN1)C2 C1CC2CNC(C1)C2
C1CC2CNC(C1)C2 C1CC2CNC(C1)C2
FC1(F)CC2CNCC(C2)C1 C1CC2CNCC(C1)C2
FC1(F)CC2CNCC(C1)C2(F)F C1CC2CNCC(C1)C2
FC1(F)C2CCCC1CNC2 C1CC2CNCC(C1)C2
C1CC2CNCC(C1)C2 C1CC2CNCC(C1)C2
FC1(F)C2CNCC1COC2 C1NCC2COCC1C2
C1NCC2COCC1C2 C1NCC2COCC1C2
C1CC2CNCC1C2 C1CC2CNCC1C2
C1CNC1 C1CNC1
FC1(F)CNC1 C1CNC1
FC1CCNC1 C1CCNC1
FC1(F)CCNC1 C1CCNC1
F[C@@H]1CNC[C@@H]1F C1CCNC1
F[C@@H]1CNC[C@H]1F C1CCNC1
FC1CCCNC1 C1CCNCC1
FC1(F)CCCNC1 C1CCNCC1
F[C@@H]1CNC[C@H](F)C1 C1CCNCC1
F[C@@H]1CNC[C@@H](F)C1 C1CCNCC1
FC1CCN

In [272]:
# Stand-alone check of the fluorine-deletion step on a single molecule.
mol = Chem.MolFromSmiles(Chem.CanonSmiles('N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F'))

# Query consisting of one fluorine atom.
mol_f_group = Chem.MolFromSmiles('F')

if mol.HasSubstructMatch(mol_f_group):
    mol = Chem.DeleteSubstructs(mol, mol_f_group)

# Last expression of the cell: displays the SMILES after the deletion.
Chem.MolToSmiles(mol)

'N[C@H]1C[C@@H]2C[C@@H]2C1'

In [273]:
mol

<rdkit.Chem.rdchem.Mol at 0x201db5a10e0>

In [274]:
# Independent copy of the annotated pKa frame.
main_df_for_train = df.copy()

# All SMILES present in the dataset, collected once so that each lookup below
# is O(1) instead of a full re-scan of the frame for every row.
smiles_in_dataset = set(main_df_for_train['Smiles'])

index_to_remain = []
for index_non_F, row_non_F in main_df_for_train.iterrows():
    SMILES_to_find = row_non_F['non-F Smiles']
    # True when the fluorine-stripped counterpart is itself a row of the dataset.
    is_on_train = SMILES_to_find in smiles_in_dataset

    # Every row is kept; the report below is limited to fluorinated rows.
    index_to_remain.append(index_non_F)
    if row_non_F['F group'] != 'non-F':
        print(row_non_F['Smiles'], SMILES_to_find, row_non_F['F group'], index_non_F, is_on_train)


N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@H]1C[C@@H]2C[C@@H]2C1 gem-CF2 54 True
N[C@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@@H]1C[C@@H]2C[C@@H]2C1 gem-CF2 55 True
NC1CCC(F)(F)CC1 NC1CCCCC1 gem-CF2 56 True
FC(F)(F)C12CC(CN1)C2 C1NC2CC1C2 CF3 60 True
FC(F)(F)C12CCC(CC1)CN2 C1CC2CCC1CN2 CF3 66 True
FC(F)(F)C12CCCC(CN1)C2 C1CC2CNC(C1)C2 CF3 72 True
FC1(F)CC2CNCC(C2)C1 C1CC2CNCC(C1)C2 gem-CF2 75 True
FC1(F)CC2CNCC(C1)C2(F)F C1CC2CNCC(C1)C2 gem-CF2 76 True
FC1(F)C2CCCC1CNC2 C1CC2CNCC(C1)C2 gem-CF2 77 True
FC1(F)C2CNCC1COC2 C1NCC2COCC1C2 gem-CF2 79 True
FC1(F)CNC1 C1CNC1 gem-CF2 88 True
FC1CCNC1 C1CCNC1 CHF 89 True
FC1(F)CCNC1 C1CCNC1 gem-CF2 90 True
F[C@@H]1CNC[C@@H]1F C1CCNC1 CHF 91 True
F[C@@H]1CNC[C@H]1F C1CCNC1 CHF 92 True
FC1CCCNC1 C1CCNCC1 CHF 93 True
FC1(F)CCCNC1 C1CCNCC1 gem-CF2 94 True
F[C@@H]1CNC[C@H](F)C1 C1CCNCC1 CHF 95 True
F[C@@H]1CNC[C@@H](F)C1 C1CCNCC1 CHF 96 True
FC1CCNCC1 C1CCNCC1 CHF 97 True
F[C@@H]1CCNC[C@@H]1F C1CCNCC1 CHF 98 True
F[C@@H]1CCNC[C@H]1F C1CCNCC1 CHF 99 True
NC1(C(F)(F)F)C

In [275]:
# For every retained molecule, build the split in which that molecule and its
# fluorine-stripped counterpart are held out of df, and print the held-out
# index labels together with the F-group labels they cover.
df_for_train = main_df_for_train.copy().loc[index_to_remain]

for index, row in df_for_train.iterrows():
    SMILES = row['Smiles']
    SMILES_non_F = row['non-F Smiles']

    # Rows of df that are this molecule or its non-fluorinated counterpart.
    # One vectorised comparison replaces a per-row iterrows scan of df; the
    # labels come out in df row order, exactly as before.
    smiles_column = df['Smiles']
    is_held_out = (smiles_column == SMILES) | (smiles_column == SMILES_non_F)
    indexes_to_remove = df.index[is_held_out].tolist()

    print(indexes_to_remove)
    # df_train / df_test are overwritten on every iteration; only the last
    # split survives the loop.
    df_train = df.drop(indexes_to_remove, axis=0, inplace=False)
    df_test = df.loc[indexes_to_remove]

    print(df_test['F group'].unique())


[54, 59]
['gem-CF2' 'non-F']
[55, 58]
['gem-CF2' 'non-F']
[56, 57]
['gem-CF2' 'non-F']
[57]
['non-F']
[58]
['non-F']
[59]
['non-F']
[60, 62]
['CF3' 'non-F']
[62]
['non-F']
[63]
['non-F']
[64]
['non-F']
[66, 70]
['CF3' 'non-F']
[68]
['non-F']
[70]
['non-F']
[71]
['non-F']
[72, 73]
['CF3' 'non-F']
[73]
['non-F']
[75, 78]
['gem-CF2' 'non-F']
[76, 78]
['gem-CF2' 'non-F']
[77, 78]
['gem-CF2' 'non-F']
[78]
['non-F']
[79, 80]
['gem-CF2' 'non-F']
[80]
['non-F']
[84]
['non-F']
[86]
['non-F']
[86, 88]
['non-F' 'gem-CF2']
[63, 89]
['non-F' 'CHF']
[63, 90]
['non-F' 'gem-CF2']
[63, 91]
['non-F' 'CHF']
[63, 92]
['non-F' 'CHF']
[71, 93]
['non-F' 'CHF']
[71, 94]
['non-F' 'gem-CF2']
[71, 95]
['non-F' 'CHF']
[71, 96]
['non-F' 'CHF']
[71, 97]
['non-F' 'CHF']
[71, 98]
['non-F' 'CHF']
[71, 99]
['non-F' 'CHF']
[100, 103]
['CF3' 'non-F']
[101, 103]
['CHF2' 'non-F']
[102, 103]
['CH2F' 'non-F']
[103]
['non-F']
[103, 104]
['non-F' 'CF3']
[103, 105]
['non-F' 'CF3']
[103, 106]
['non-F' 'CHF2']
[103, 107]
['non-F'

In [276]:
# Persist the annotated pKa table (index included) for the per-SMILES model runs.
df_for_train.to_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_test_for_SMILES_per_model.csv')

In [277]:
df_for_train

Unnamed: 0,fold_id,pKa,Smiles,F group,non-F Smiles
54,0.0,8.89,N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,N[C@H]1C[C@@H]2C[C@@H]2C1
55,0.0,8.95,N[C@H]1C[C@@H]2[C@H](C1)C2(F)F,gem-CF2,N[C@@H]1C[C@@H]2C[C@@H]2C1
56,1.0,9.45,NC1CCC(F)(F)CC1,gem-CF2,NC1CCCCC1
57,1.0,10.58,NC1CCCCC1,non-F,NC1CCCCC1
58,0.0,9.79,N[C@@H]1C[C@@H]2C[C@@H]2C1,non-F,N[C@@H]1C[C@@H]2C[C@@H]2C1
...,...,...,...,...,...
22,,4.18,O=C(O)C1CCCC(F)(F)C1,gem-CF2,O=C(O)C1CCCCC1
3,,4.24,O=C(O)[C@@H]1C[C@@H]2C[C@@H]2C1,non-F,O=C(O)[C@@H]1C[C@@H]2C[C@@H]2C1
34,,4.39,O=C(O)C1CC2(CC(F)C2)C1,CHF,O=C(O)C1CC2(CCC2)C1
51,,4.61,O=C(O)C1CC12CC2,non-F,O=C(O)C1CC12CC2


In [3]:
import pandas as pd

# Reload the saved pKa table from disk; the first CSV column is restored as the index.
df_for_train = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_test_for_SMILES_per_model.csv', index_col=0)

In [6]:
# Sanity check on the saved table: for each row, report whether its
# 'non-F Smiles' value occurs in the 'Smiles' column of the same table.
# The set is built once so each check is O(1) instead of a full re-scan.
smiles_in_table = set(df_for_train['Smiles'])

for index, row in df_for_train.iterrows():
    ok = row['non-F Smiles'] in smiles_in_table

    print(ok, row['Smiles'], row['non-F Smiles'])


True N[C@@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@H]1C[C@@H]2C[C@@H]2C1
True N[C@H]1C[C@@H]2[C@H](C1)C2(F)F N[C@@H]1C[C@@H]2C[C@@H]2C1
True NC1CCC(F)(F)CC1 NC1CCCCC1
True NC1CCCCC1 NC1CCCCC1
True N[C@@H]1C[C@@H]2C[C@@H]2C1 N[C@@H]1C[C@@H]2C[C@@H]2C1
True N[C@H]1C[C@@H]2C[C@@H]2C1 N[C@H]1C[C@@H]2C[C@@H]2C1
True FC(F)(F)C12CC(CN1)C2 C1NC2CC1C2
True C1NC2CC1C2 C1NC2CC1C2
True C1CCNC1 C1CCNC1
True CC12CC(CN1)C2 CC12CC(CN1)C2
True FC(F)(F)C12CCC(CC1)CN2 C1CC2CCC1CN2
True C1CC2CC1CN2 C1CC2CC1CN2
True C1CC2CCC1CN2 C1CC2CCC1CN2
True C1CCNCC1 C1CCNCC1
True FC(F)(F)C12CCCC(CN1)C2 C1CC2CNC(C1)C2
True C1CC2CNC(C1)C2 C1CC2CNC(C1)C2
True FC1(F)CC2CNCC(C2)C1 C1CC2CNCC(C1)C2
True FC1(F)CC2CNCC(C1)C2(F)F C1CC2CNCC(C1)C2
True FC1(F)C2CCCC1CNC2 C1CC2CNCC(C1)C2
True C1CC2CNCC(C1)C2 C1CC2CNCC(C1)C2
True FC1(F)C2CNCC1COC2 C1NCC2COCC1C2
True C1NCC2COCC1C2 C1NCC2COCC1C2
True C1CC2CNCC1C2 C1CC2CNCC1C2
True C1CNC1 C1CNC1
True FC1(F)CNC1 C1CNC1
True FC1CCNC1 C1CCNC1
True FC1(F)CCNC1 C1CCNC1
True F[C@@H]1CNC[C@@H]1F C1CCN

OUT OF SCOPE

In [5]:
import pandas as pd

# Saved pKa table, first CSV column as index.
pKa_df = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_test_for_SMILES_per_model.csv', index_col=0)

# Keep only the rows with index labels 134 and 100, in that order.
# NOTE(review): presumably the two CF3 molecules the output file name refers
# to - confirm, and consider naming these labels in a constant.
decreased_df = pKa_df.loc[[134, 100]]

In [7]:
# Write the two-row subset (index included) to its own CSV.
decreased_df.to_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_test_for_SMILES_per_model_CF3_2_mols.csv')