In [1]:
# set random seed for reproducibility
import numpy as np
import random

# Pin both global generators (stdlib `random` and NumPy) to the same fixed seed.
random.seed(0)
np.random.seed(0)

In [2]:
import pandas as pd

# Assay names; each one corresponds to ../data/Bioassays/<name>.csv
TARGETS = [
    "AChE",
    "D2R",
    "D3R",
    "_5HT2A",
    "MAOB",
    "BBB",
]

# Report the table dimensions and the active/inactive label counts per assay.
for target in TARGETS:
    csv_path = f"../data/Bioassays/{target}.csv"
    assay_df = pd.read_csv(csv_path)
    print(target, assay_df.shape)
    label_counts = assay_df["activity"].value_counts()
    print(label_counts)

AChE (7409, 4)
inactive    7021
active       388
Name: activity, dtype: int64
D2R (299477, 4)
inactive    292615
active        6862
Name: activity, dtype: int64
D3R (407605, 4)
inactive    405225
active        2380
Name: activity, dtype: int64
_5HT2A (91212, 4)
inactive    88800
active       2412
Name: activity, dtype: int64
MAOB (8575, 2)
inactive    6919
active      1656
Name: activity, dtype: int64
BBB (7807, 2)
active      4956
inactive    2851
Name: activity, dtype: int64


In [3]:
# Create fine-tuning datasets with only active compounds
# File .smi format:
# SMILES\tTargetID

# Convert InChI to SMILES
from rdkit import Chem

# Ignore rdkit warnings
from rdkit import RDLogger

# Raise the RDKit logger threshold to CRITICAL so lower-severity messages are hidden.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# One-based integer ID for each entry of TARGETS, in the same order.
ids = [position + 1 for position in range(len(TARGETS))]
# Accumulates one frame of active compounds per target.
targets_dfs = []


def inchi_to_smiles(inchi):
    """Convert an InChI string to a SMILES string with RDKit.

    Parameters
    ----------
    inchi : str
        InChI identifier to convert.

    Returns
    -------
    str or None
        The SMILES returned by ``Chem.MolToSmiles``, or ``None`` when the
        InChI could not be parsed into a molecule or the conversion raised.
    """
    try:
        mol = Chem.inchi.MolFromInchi(inchi)
        # An unparsable InChI comes back as None; handle that explicitly
        # instead of relying on MolToSmiles(None) raising.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort conversion: any ordinary failure (e.g. a non-string
        # input such as NaN) maps to None. `Exception` rather than a bare
        # `except` so KeyboardInterrupt / SystemExit still propagate and a
        # long `.apply` over a large table can be interrupted.
        return None


# For each target: convert InChI -> SMILES, collect the active compounds for
# the fine-tuning set, and write the full labelled table for QSAR modelling.
#
# NOTE: no train/fine-tune split is performed here. Every active compound goes
# into the fine-tuning set AND remains in the per-target QSAR table, so the two
# outputs overlap on the actives.
for target, target_id in zip(TARGETS, ids):
    assay_df = pd.read_csv(f"../data/Bioassays/{target}.csv")
    print(target, assay_df.shape)

    # Rows whose InChI cannot be converted get SMILES=None and are dropped.
    assay_df = assay_df.assign(
        SMILES=assay_df["InChI"].apply(inchi_to_smiles)
    ).dropna(subset=["SMILES"])

    # Actives for this target, tagged with the target's integer ID.
    # `.assign` returns a new frame, so `assay_df` is left untouched.
    active_compounds = assay_df.loc[assay_df["activity"] == "active"].assign(
        TargetID=target_id
    )
    targets_dfs.append(active_compounds)

    # QSAR table: every convertible compound with the label encoded as
    # 1 (active) / 0 (inactive). Built as a new frame rather than by mutating
    # `assay_df` through an alias.
    qsar_df = assay_df.assign(
        activity=assay_df["activity"].replace({"active": 1, "inactive": 0})
    )
    qsar_df.to_csv(f"../data/Bioassays/{target}_SMILES.csv", index=False)

# Stack the per-target actives; each target's original row index is kept.
targets_df = pd.concat(targets_dfs)
targets_df

AChE (7409, 4)
D2R (299477, 4)
D3R (407605, 4)
_5HT2A (91212, 4)
MAOB (8575, 2)
BBB (7807, 2)


Unnamed: 0,pubchem_molecule_id,pubchem_molecule_type,InChI,activity,SMILES,TargetID
0,6.0,compounds,InChI=1S/C6H3ClN2O4/c7-5-2-1-4(8(10)11)3-6(5)9...,active,O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1,1
1,177.0,compounds,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",active,CC=O,1
2,243.0,compounds,"InChI=1S/C7H6O2/c8-7(9)6-4-2-1-3-5-6/h1-5H,(H,...",active,O=C(O)c1ccccc1,1
3,244.0,compounds,"InChI=1S/C7H8O/c8-6-7-4-2-1-3-5-7/h1-5,8H,6H2",active,OCc1ccccc1,1
4,743.0,compounds,"InChI=1S/C5H8O4/c6-4(7)2-1-3-5(8)9/h1-3H2,(H,6...",active,O=C(O)CCCC(=O)O,1
...,...,...,...,...,...,...
7755,,,InChI=1S/C12H7Cl3O2/c13-7-1-3-11(9(15)5-7)17-1...,active,Oc1cc(Cl)ccc1Oc1ccc(Cl)cc1Cl,6
7769,,,InChI=1S/C21H30O2/c1-5-6-7-8-15-12-18(22)20-16...,active,CCCCCc1cc(O)c2c(c1)OC(C)(C)C1CCC(C)=CC21,6
7780,,,InChI=1S/C14H17N3O/c1-16-9-3-5-13-11(7-9)10-6-...,active,CNC1CCc2[nH]c3ccc(C(=N)O)cc3c2C1,6
7803,,,InChI=1S/C17H26N4O/c1-3-22-14-13-21-16-8-5-4-7...,active,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,6


In [4]:
# Write the SMILES and TargetID columns as a tab-separated file,
# without a header row and without the index.
targets_df.to_csv(
    "../data/active_compounds.smi",
    columns=["SMILES", "TargetID"],
    sep="\t",
    index=False,
    header=False,
)

In [5]:
# Lookup table pairing each target name with its integer ID
target_to_id = pd.DataFrame(
    list(zip(TARGETS, ids)),
    columns=["Target", "TargetID"],
)

target_to_id

Unnamed: 0,Target,TargetID
0,AChE,1
1,D2R,2
2,D3R,3
3,_5HT2A,4
4,MAOB,5
5,BBB,6
