In [None]:
# Seed the global NumPy and Python random number generators with one shared
# value so that repeated runs of the notebook draw the same random numbers.
import numpy as np
import random

SEED = 0

np.random.seed(SEED)
random.seed(SEED)

In [None]:
import pandas as pd

# Names of the targets to process; each one is also the stem of a CSV file
# under ../data/Bioassays/.
TARGETS = ["AChE", "D2R", "D3R", "_5HT2A", "MAOB", "BBB"]

# Overview of the raw assay tables: dimensions and activity label counts.
for target in TARGETS:
    assay_df = pd.read_csv(f"../data/Bioassays/{target}.csv")
    print(target, assay_df.shape)
    activity_counts = assay_df["activity"].value_counts()
    print(activity_counts)

In [None]:
# Create fine-tuning datasets with only active compounds
# File .smi format:
# SMILES\tTargetID

# Convert InChI to SMILES
from rdkit import Chem

# Ignore rdkit warnings
from rdkit import RDLogger

# Raise the RDKit logger threshold so only CRITICAL messages are emitted.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Collects one frame of sampled active compounds per target.
targets_dfs = []
# 1-based numeric identifiers, one per entry of TARGETS and in the same order.
ids = list(range(1, len(TARGETS) + 1))


def inchi_to_smiles(inchi):
    """Convert an InChI string to a SMILES string using RDKit.

    Args:
        inchi: InChI identifier of a compound. Values that are not strings
            (for example a NaN standing in for a missing table cell) and
            blank strings are treated as unconvertible.

    Returns:
        The SMILES string for the molecule, or None when the input cannot be
        converted.
    """
    # Reject obviously unusable input up front instead of depending on RDKit
    # raising for a non-string argument.
    if not isinstance(inchi, str) or not inchi.strip():
        return None

    try:
        mol = Chem.inchi.MolFromInchi(inchi)
        # No molecule could be built from this InChI; report that explicitly
        # rather than letting MolToSmiles fail on a None argument.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort conversion: any RDKit failure maps to None so callers can
        # drop the row. Unlike a bare `except:`, this lets KeyboardInterrupt
        # and SystemExit propagate, so a long-running apply can be interrupted.
        return None


# For every target: convert structures to SMILES, hold out half of the active
# compounds for fine-tuning, and save the remaining rows for model training.
for ti, target in enumerate(TARGETS):
    assay_df = pd.read_csv(f"../data/Bioassays/{target}.csv")
    print(target, assay_df.shape)

    # Convert InChI to SMILES. inchi_to_smiles returns None for entries it
    # cannot convert, and those rows are dropped here.
    assay_df["SMILES"] = assay_df["InChI"].apply(inchi_to_smiles)
    assay_df = assay_df.dropna(subset=["SMILES"])

    active_compounds = assay_df[assay_df["activity"] == "active"].copy()

    # Randomly sample half of the active compounds for fine-tuning. The fixed
    # random_state makes the sample the same on every run.
    active_compounds = active_compounds.sample(frac=0.5, random_state=42)
    active_compounds["TargetID"] = ids[ti]

    targets_dfs.append(active_compounds)

    # Every row that was not sampled above (the other half of the actives plus
    # all remaining rows) is used for training the models.
    qsar_df = assay_df.drop(active_compounds.index)
    qsar_df["activity"] = qsar_df["activity"].replace({"active": 1, "inactive": 0})
    # Write next to the input CSVs under ../data/Bioassays/. The previous
    # "./data/Bioassays/" destination pointed at a directory relative to the
    # notebook that nothing else in this notebook reads or writes, and to_csv
    # does not create missing directories.
    qsar_df.to_csv(f"../data/Bioassays/{target}_SMILES.csv", index=False)

# Stack the per-target samples into a single frame and display it.
targets_df = pd.concat(targets_dfs)
targets_df

In [None]:
# Write the SMILES and target ID columns to a tab-separated .smi file,
# without a header row and without the DataFrame index.
smi_table = targets_df[["SMILES", "TargetID"]]
smi_table.to_csv("../data/active_compounds.smi", sep="\t", header=False, index=False)