In [7]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn
from tqdm import tqdm
import pubchempy as pcp

In [8]:
def get_canonical_smiles(drug_name):
    """Return the canonical SMILES of the first PubChem hit for a compound name.

    The lookup is done with ``pcp.get_compounds(drug_name, 'name')`` and is
    best-effort: any failure is reported on stdout and mapped to NaN so that a
    batch loop over many names is never interrupted.

    Parameters
    ----------
    drug_name : str
        Compound name to search for. Anything that is not a non-blank string
        (``None``, a NaN coming from a pandas column, ``""``, ...) is treated
        as "no name" and is not sent to PubChem.

    Returns
    -------
    str or float
        ``canonical_smiles`` of the first returned compound, or ``np.nan`` when
        the name is missing/blank, nothing is found, or the lookup raises.
    """
    # A missing or blank name cannot be resolved; skip the remote call instead
    # of querying PubChem for a placeholder value such as NaN.
    if not isinstance(drug_name, str) or not drug_name.strip():
        return np.nan

    try:
        results = pcp.get_compounds(drug_name, 'name')
        if results:
            # Only the first hit is used.
            return results[0].canonical_smiles
        return np.nan
    except Exception as e:
        # Deliberately broad: one failing lookup must not abort the whole batch.
        print(f"Error retrieving canonical SMILES for {drug_name}: {str(e)}")
        return np.nan

In [10]:
folder = "/scratch/work/masooda1/bert-invitro-adme/data/rawdata/"

# Cap on the number of compounds sent to PubChem (one remote request each).
# Set to None to resolve every compound in the workbook.
MAX_COMPOUNDS = 10

# labels: findings workbook; only its COMPOUND_NAME column is used in this cell
tggateINHANDS = pd.read_excel(folder + "tx2c00378_si_001.xlsx", sheet_name = "Sheet1")

# Unique compound names in order of first appearance. Missing names are dropped
# so that NaN is never used as a PubChem query or written out as a compound.
compound_list = tggateINHANDS.COMPOUND_NAME.dropna().unique().tolist()[:MAX_COMPOUNDS]

# Retrieve SMILES for each drug name (NaN where the lookup fails or finds nothing)
compound_smiles_dict = {name: get_canonical_smiles(name) for name in tqdm(compound_list)}

# One row per compound, saved alongside the raw data
SMILES = pd.DataFrame(list(compound_smiles_dict.items()), columns=['COMPOUND_NAME', 'SMILES'])
SMILES.to_csv(folder + "TG_GATES_SMILES.csv", index = False)

100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


Unnamed: 0,COMPOUND_NAME,SMILES
0,acetaminophen,CC(=O)NC1=CC=C(C=C1)O
1,isoniazid,C1=CN=CC=C1C(=O)NN
2,carbon tetrachloride,C(Cl)(Cl)(Cl)Cl
3,valproic acid,CCCC(CCC)C(=O)O
4,clofibrate,CCOC(=O)C(C)(C)OC1=CC=C(C=C1)Cl
5,rifampicin,CC1C=CC=C(C(=O)NC2=C(C(=C3C(=C2O)C(=C(C4=C3C(=...
6,naphthyl isothiocyanate,
7,allyl alcohol,C=CCO
8,phenylbutazone,CCCCC1C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3
9,omeprazole,CC1=CN=C(C(=C1OC)C)CS(=O)C2=NC3=C(N2)C=C(C=C3)OC


In [None]:
folder = "/scratch/work/masooda1/bert-invitro-adme/data/rawdata"

# Findings workbook. It is an Excel file, so it is read with read_excel
# (same sheet as the SMILES lookup cell). Paths are joined with an explicit "/"
# because `folder` has no trailing slash here.
tggateINHANDS = pd.read_excel(f"{folder}/tx2c00378_si_001.xlsx", sheet_name = "Sheet1")

# Read SMILES
# NOTE(review): this reads tggateSmiles.txt (tab-separated), not the
# TG_GATES_SMILES.csv written by the PubChem lookup cell - confirm which is intended.
tggate_SMILES = pd.read_csv(f"{folder}/tggateSmiles.txt", sep = "\t")

# retain only those compounds for which we have inhand FINDINGS
tggate_SMILES = tggate_SMILES[tggate_SMILES.COMPOUND_NAME.isin(tggateINHANDS.COMPOUND_NAME)]
tggate_data = pd.merge(tggate_SMILES, tggateINHANDS, how = 'left', on = 'COMPOUND_NAME')

selected_column = ['COMPOUND_NAME','SMILES','Dose_Level','Time','Grade','Number of Animals','Finding: Final INHANDS nomenclature']
tggate_data = tggate_data[selected_column]

# Keep a row only when Grade and Finding are both present or both missing;
# rows where exactly one of the two is NaN are dropped.
grade_missing = tggate_data['Grade'].isnull()
finding_missing = tggate_data['Finding: Final INHANDS nomenclature'].isnull()
tggate_data = tggate_data[grade_missing == finding_missing]
print(tggate_data.shape)

# Remove outliers from Grades
outliers = ["2/5","1/5","3/5","/"]
# .copy() so the column assignments below write to an independent frame
tggate_data = tggate_data[~tggate_data['Grade'].isin(outliers)].copy()
print(tggate_data.shape)

# Fill nan with appropriate values. Columns are assigned back explicitly:
# `df[col].fillna(..., inplace=True)` acts on a temporary object and is not
# guaranteed to update the frame.
tggate_data["Finding: Final INHANDS nomenclature"] = tggate_data["Finding: Final INHANDS nomenclature"].fillna('NonToxic')
# presumably 5 is the group size used when the count is not recorded - TODO confirm
tggate_data["Number of Animals"] = tggate_data["Number of Animals"].fillna(5)
tggate_data["Grade"] = tggate_data["Grade"].fillna('NonToxic')

# convert gradings to numeric coding
# Any grade string not listed here maps to NaN and is therefore never counted
# as positive by the `Grade > 0` test below.
toxicity_categories = { 'NonToxic':0,'minimal':0.2,'slight':0.4,'moderate':0.6,'marked':0.8,'severe':1}
tggate_data["Grade"] = tggate_data["Grade"].map(toxicity_categories)

tggate_data = tggate_data.rename(columns = {"Finding: Final INHANDS nomenclature":"Findings"})

# Merge a few finding names into a single label
tggate_data["Findings"] = tggate_data["Findings"].replace({"Necrosis, Zonal; Inflammation":"Necrosis, Zonal",
                                                           "Apoptosis/Single cell necrosis":"Single Cell Necrosis"})

# DILI labels
# DILI positive: if Grade > 0, Number of Animals > 1
tggate_data['DILI_labels'] = (tggate_data["Number of Animals"] > 1) & (tggate_data["Grade"] > 0)

# if Findings is positive at any dose and time, consider it positive
# (groupby drops rows whose COMPOUND_NAME, SMILES or Findings key is NaN)
tggate_data = (
    tggate_data
    .groupby(['COMPOUND_NAME','SMILES','Findings'])['DILI_labels']
    .any()
    .reset_index()
)

# each finding deserves one column
tggate_data = tggate_data.pivot(index = ['COMPOUND_NAME','SMILES'], columns = 'Findings', values = "DILI_labels").rename_axis(None, axis=1).reset_index()

# Findings never observed for a compound are NaN after the pivot: encode them,
# like False, as 0 and True as 1. Only the finding columns are converted; the
# identifier columns are left untouched.
label_columns = tggate_data.columns.difference(['COMPOUND_NAME','SMILES'])
tggate_data[label_columns] = tggate_data[label_columns].eq(True).astype(int)
tggate_data