In [1]:
import pandas as pd

In [2]:
import pandas as pd

def load_data(base_folder):
    """Load and merge TG-GATES data with SMILES"""
    # Load labels from Excel
    tggate_inhands = pd.read_excel(
        base_folder + "tx2c00378_si_001.xlsx", 
        sheet_name="ALL_data"
    )
    
    # Load SMILES data
    tggate_smiles = pd.read_csv(base_folder + "TG_GATES_SMILES.csv")
    
    # Filter and merge data
    tggate_smiles = tggate_smiles[tggate_smiles.COMPOUND_NAME.isin(tggate_inhands.COMPOUND_NAME)]
    merged_data = pd.merge(tggate_smiles, tggate_inhands, how='left', on='COMPOUND_NAME')
    
    return merged_data

def preprocess_data(df):
    """Preprocess the merged dataset"""
    # Create a copy to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Rename column
    df = df.rename(columns={"Finding: Final INHANDS nomenclature": "Findings"})
    
    # Fill missing values using recommended approach
    df = df.assign(Findings=df["Findings"].fillna('NonToxic'))
    
    # Select relevant columns
    selected_columns = ['COMPOUND_NAME', 'SMILES', 'Dose_Level', 'Time', 'Findings']
    df = df[selected_columns]
    
    # Create DILI labels using recommended approach
    df = df.assign(
        DILI_labels=df['Findings'].notna().astype(int)
    )
    
    return df

def create_binary_matrix(df):
    """Create binary matrix of findings"""
    # Group by compound and create binary indicators
    grouped_data = df.groupby(['COMPOUND_NAME', 'SMILES', 'Findings']).DILI_labels.sum().reset_index()
    grouped_data = grouped_data.assign(
        DILI_labels=grouped_data['DILI_labels'].astype(bool)
    )
    grouped_data.loc[grouped_data['Findings'] == "NonToxic", 'DILI_labels'] = False
    
    # Pivot and create binary matrix
    binary_matrix = grouped_data.pivot(
        index=['COMPOUND_NAME', 'SMILES'],
        columns='Findings',
        values="DILI_labels"
    ).rename_axis(None, axis=1).reset_index()
    
    # Handle fillna warning with explicit type conversion
    binary_matrix = binary_matrix.fillna(0)
    binary_matrix.iloc[:, 2:] = binary_matrix.iloc[:, 2:].astype(int)
    
    return binary_matrix

def filter_frequent_findings(df, min_frequency=6):
    """Filter findings based on minimum frequency"""
    findings_freq = df.iloc[:, 2:].sum(axis=0).reset_index()
    findings_freq.columns = ["Finding", "Frequency"]
    
    selected_findings = findings_freq[
        findings_freq.Frequency > min_frequency
    ].sort_values(by="Frequency").reset_index(drop=True)
    
    selected_columns = ["COMPOUND_NAME", "SMILES"] + selected_findings.Finding.tolist()
    return df[selected_columns]

def main():
    # Define input/output paths
    input_folder = "/scratch/work/masooda1/bert-invitro-adme/data/rawdata/"
    output_path = "/scratch/work/masooda1/bert-invitro-adme/data/binary_data/histopathology_binary_data.csv"
    
    # Process data
    raw_data = load_data(input_folder)
    processed_data = preprocess_data(raw_data)
    binary_matrix = create_binary_matrix(processed_data)
    final_data = filter_frequent_findings(binary_matrix)
    
    # Save results
    final_data.to_csv(output_path, index=False)

if __name__ == "__main__":
    main()

  binary_matrix = binary_matrix.fillna(0)
1      0
2      0
3      0
4      0
      ..
124    0
125    0
126    0
127    0
128    0
Name: NonToxic, Length: 129, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  binary_matrix.iloc[:, 2:] = binary_matrix.iloc[:, 2:].astype(int)


In [39]:
    return final_data
if __name__ == "__main__":
    main()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Findings"].fillna('NonToxic', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['Findings'].notna(), 'DILI_labels'] = 1
  binary_matrix = binary_matrix.fillna(0) * 1


In [35]:
final_data.sum(axis = 0)

COMPOUND_NAME                                             WY-14643acarboseacetaminophenacetazolamideadap...
SMILES                                                    CC1=C(C(=CC=C1)NC2=CC(=NC(=N2)SCC(=O)O)Cl)CCC1...
Cytoplasmic alteration (Glycogen accumulation)                                                            7
Extramedullary Hematopoiesis                                                                              7
Necrosis, Zonal                                                                                           7
Pigmentation (pigment deposition)                                                                         9
Apoptosis                                                                                                11
Infiltration, Mononuclear                                                                                12
Cytoplasmic alteration (Basophilic/glycogen depletion)                                                   16
Hypertrophy/Hyperplasia     

In [27]:
folder = "/scratch/work/masooda1/bert-invitro-adme/data/rawdata/"

# Labels
tggateINHANDS = pd.read_excel(folder + "tx2c00378_si_001.xlsx", sheet_name = "ALL_data")

# SMILES
tggate_SMILES = pd.read_csv(folder + "TG_GATES_SMILES.csv")

# retain only those compounds for which we have inhand FINDINGS
tggate_SMILES = tggate_SMILES[tggate_SMILES.COMPOUND_NAME.isin(tggateINHANDS.COMPOUND_NAME)]
tggate_data = pd.merge(tggate_SMILES,tggateINHANDS, how = 'left', on = 'COMPOUND_NAME')

tggate_data.rename(columns= {"Finding: Final INHANDS nomenclature":"Findings"},inplace = True)
tggate_data["Findings"].fillna('NonToxic', inplace = True)

selected_column = ['COMPOUND_NAME','SMILES','Dose_Level','Time','Findings']
tggate_data = tggate_data[selected_column]

# Create DILI_labels column: 1 for non-null values, 0 for null values
tggate_data.loc[tggate_data['Findings'].notna(), 'DILI_labels'] = 1
tggate_data.loc[tggate_data['Findings'].isna(), 'DILI_labels'] = 0

# Count Toxic combinations, if any drug-dose-time-finding is Toxic, asign toxic label to this drug-finding
tggate_data = tggate_data.groupby(['COMPOUND_NAME','SMILES','Findings']).DILI_labels.sum().reset_index()
tggate_data['DILI_labels'] = tggate_data['DILI_labels'].astype(bool)
tggate_data.loc[tggate_data['Findings'] == "NonToxic", 'DILI_labels'] = False
tggate_data = tggate_data.pivot(index = ['COMPOUND_NAME','SMILES'], columns= 'Findings', values= "DILI_labels").rename_axis(None, axis=1).reset_index()
tggate_data = tggate_data.fillna(0) * 1
selected_findings = tggate_data.iloc[:,2:].sum(axis = 0).reset_index()
selected_findings.columns = ["Finding","Frequency"]
selected_findings = selected_findings[selected_findings.Frequency > 6].sort_values(by = "Frequency").reset_index(drop = True)
selected_columns  = ["COMPOUND_NAME", "SMILES"] + selected_findings.Finding.tolist()
tggate_data = tggate_data[selected_columns]
tggate_data.to_csv("scratch/work/masooda1/bert-invitro-adme/data/binary_data/histopathology_binary_data.csv", index = False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tggate_data["Findings"].fillna('NonToxic', inplace = True)
  tggate_data = tggate_data.fillna(0) * 1


Unnamed: 0,COMPOUND_NAME,SMILES,Cytoplasmic alteration (Glycogen accumulation),Extramedullary Hematopoiesis,"Necrosis, Zonal",Pigmentation (pigment deposition),Apoptosis,"Infiltration, Mononuclear",Cytoplasmic alteration (Basophilic/glycogen depletion),Hypertrophy/Hyperplasia,Single Cell Necrosis,Cytoplasmic alteration (Eosinophilic),Increased mitoses,Vacuolation,"Hypertrophy, hepatocellular"
0,WY-14643,CC1=C(C(=CC=C1)NC2=CC(=NC(=N2)SCC(=O)O)Cl)C,0,0,0,0,0,0,1,0,0,1,1,1,1
1,acarbose,CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OC3C(OC(C(C3O)O)...,0,0,0,0,0,0,0,0,0,0,0,0,0
2,acetaminophen,CC(=O)NC1=CC=C(C=C1)O,0,0,1,0,1,1,0,0,0,1,1,0,0
3,acetazolamide,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,0,0,0,0,0,0,0,0,0,0,0,0,0
4,adapin,CN(C)CCC=C1C2=CC=CC=C2COC3=CC=CC=C31.Cl,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,triamterene,C1=CC=C(C=C1)C2=NC3=C(N=C(N=C3N=C2N)N)N,1,0,0,0,0,0,0,0,0,0,0,0,0
125,triazolam,CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4Cl,0,0,0,0,0,0,0,0,0,0,0,0,0
126,trimethadione,CC1(C(=O)N(C(=O)O1)C)C,0,0,0,0,0,0,0,0,0,0,0,0,0
127,valproic acid,CCCC(CCC)C(=O)O,0,0,0,0,0,0,0,0,0,0,0,0,0


In [59]:

tggate_data = tggate_data.drop_duplicates().reset_index(drop = True)
tggate_data = tggate_data.groupby(['COMPOUND_NAME','SMILES','Findings','DILI_labels']).count().reset_index()
tggate_data# = tggate_data.pivot(index = ['COMPOUND_NAME','SMILES'], columns= 'Findings', values= "DILI_labels").rename_axis(None, axis=1).reset_index()
tggate_data

KeyError: "['Finding: Final INHANDS nomenclature'] not in index"

In [54]:
tggate_data = tggate_data.pivot(index = ['COMPOUND_NAME','SMILES'], columns= 'Findings', values= "DILI_labels").rename_axis(None, axis=1).reset_index()


ValueError: Index contains duplicate entries, cannot reshape

In [None]:
folder = "/scratch/work/masooda1/bert-invitro-adme/data/rawdata"
tggateINHANDS = pd.read_csv(folder + "tx2c00378_si_001.xlsx", sep = "\t")

# Read SMILES
tggate_SMILES = pd.read_csv(folder + "tggateSmiles.txt", sep = "\t")

# retain only those compounds for which we have inhand FINDINGS
tggate_SMILES = tggate_SMILES[tggate_SMILES.COMPOUND_NAME.isin(tggateINHANDS.COMPOUND_NAME)]
tggate_data = pd.merge(tggate_SMILES,tggateINHANDS, how = 'left', on = 'COMPOUND_NAME')

selected_column = ['COMPOUND_NAME','SMILES','Dose_Level','Time','Grade','Number of Animals','Finding: Final INHANDS nomenclature']
tggate_data = tggate_data[selected_column]

# drop rows where Finding == Nan, but grade != Nan, or vice verse
rows_to_drop = tggate_data[(tggate_data['Grade'].isnull()) & ~(tggate_data['Finding: Final INHANDS nomenclature'].isnull())].index
if rows_to_drop.shape[0] != 0:
    tggate_data.drop(rows_to_drop, inplace = True)

rows_to_drop = tggate_data[tggate_data['Finding: Final INHANDS nomenclature'].isnull() & ~(tggate_data['Grade'].isnull())].index
if rows_to_drop.shape[0] != 0:
    tggate_data.drop(rows_to_drop, inplace = True)
print(tggate_data.shape)

# Remove outliers from Grades
outliers = ["2/5","1/5","3/5","/"]
tggate_data = tggate_data[~tggate_data['Grade'].isin(outliers)]
print(tggate_data.shape)

# Fill nan with appropriate values
tggate_data["Finding: Final INHANDS nomenclature"].fillna('NonToxic', inplace = True)
tggate_data["Number of Animals"].fillna(5, inplace = True)
tggate_data["Grade"].fillna('NonToxic', inplace = True)

# convert gradings to numeric coding
toxicity_categories = { 'NonToxic':0,'minimal':0.2,'slight':0.4,'moderate':0.6,'marked':0.8,'severe':1}
tggate_data.Grade = tggate_data.Grade.map(toxicity_categories)

tggate_data.rename(columns= {"Finding: Final INHANDS nomenclature":"Findings"},inplace = True)
# Change name to few findings

tggate_data["Findings"].replace({"Necrosis, Zonal; Inflammation":"Necrosis, Zonal",
                                       "Apoptosis/Single cell necrosis":"Single Cell Necrosis"}, inplace = True)

# DILI labels
# DILI positive: if Grade > 0, Number of Animals > 1
tggate_data['DILI_labels'] = (tggate_data["Number of Animals"] > 1) & (tggate_data["Grade"] > 0)

# if Findings is positive at any dose and time,consider it positive
selected_column = ['COMPOUND_NAME','SMILES','Findings','DILI_labels']
tggate_data = tggate_data[selected_column].groupby(['COMPOUND_NAME','SMILES','Findings']).sum().reset_index()
tggate_data['DILI_labels'] = tggate_data['DILI_labels'].astype(bool)

# each finding deserves one column
tggate_data = tggate_data.pivot(index = ['COMPOUND_NAME','SMILES'], columns= 'Findings', values= "DILI_labels").rename_axis(None, axis=1).reset_index()

# Fill na with 0
tggate_data = tggate_data.fillna(0) * 1
tggate_data

# Extract SMILES from PUBCHEM

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pubchempy as pcp
import argparse
import sys
from pathlib import Path

In [8]:
def get_canonical_smiles(drug_name):
    """
    Retrieve canonical SMILES for a given drug name using PubChemPy.
    
    Args:
        drug_name (str): Name of the drug compound
    
    Returns:
        str or np.nan: Canonical SMILES string if found, np.nan otherwise
    """
    try:
        results = pcp.get_compounds(drug_name, 'name')
        print(results)
        if results:
            smiles = results[0].canonical_smiles
            print(smiles)
            return smiles
        else:
            return np.nan
    except Exception as e:
        print(f"Error retrieving canonical SMILES for {drug_name}: {str(e)}")
        return np.nan

In [9]:
get_canonical_smiles("acetaminophen")

[Compound(1983)]
None


In [1]:
import pubchempy as pcp
print(pcp.__version__)
def get_smiles_from_name(compound_name):
  """
  Retrieves the SMILES representation of a compound given its name.

  Args:
    compound_name: The name of the compound (e.g., "glucose").

  Returns:
    The SMILES string if found, otherwise None.
  """
  try:
    compounds = pcp.get_compounds(compound_name, 'name')
    if compounds:
      return compounds[0].isomeric_smiles
    else:
      return None
  except Exception as e:
    print(f"An error occurred: {e}")
    return None

# Example usage:
compound_name = "glucose"
smiles = get_smiles_from_name(compound_name)

if smiles:
  print(f"The SMILES for {compound_name} is: {smiles}")
else:
  print(f"Could not retrieve SMILES for {compound_name}")

1.0.4
Could not retrieve SMILES for glucose


In [2]:
results = pcp.get_compounds('Glucose', 'name')
results[0].isomeric_smiles

In [3]:
pcp.__version__

'1.0.4'

In [3]:
from pubchempy import get_compounds, Compound
comp = Compound.from_cid(1423)
print(comp.isomeric_smiles)
comps = get_compounds('Aspirin', 'name')
print(comps[0].isomeric_smiles)

None
None


In [5]:
import pandas as pd

In [9]:
pd.read_csv("/scratch/work/masooda1/ToxBERT_github/data/binary_data/histopathology_binary_data.csv").sum()

COMPOUND_NAME                                             WY-14643acarboseacetaminophenacetazolamideadap...
SMILES                                                    CC1=C(C(=CC=C1)NC2=CC(=NC(=N2)SCC(=O)O)Cl)CCC1...
Cytoplasmic alteration (Glycogen accumulation)                                                            7
Extramedullary Hematopoiesis                                                                              7
Necrosis, Zonal                                                                                           7
Pigmentation (pigment deposition)                                                                         9
Apoptosis                                                                                                11
Infiltration, Mononuclear                                                                                12
Cytoplasmic alteration (Basophilic/glycogen depletion)                                                   16
Hypertrophy/Hyperplasia     

In [8]:
pd.read_csv("/scratch/work/masooda1/ToxBERT_github/data/testing/final_results.csv").sum()

COMPOUND_NAME                                             WY-14643acarboseacetaminophenacetazolamideadap...
SMILES                                                    CC1=C(C(=CC=C1)NC2=CC(=NC(=N2)SCC(=O)O)Cl)CCC1...
Cytoplasmic alteration (Glycogen accumulation)                                                            7
Extramedullary Hematopoiesis                                                                              7
Necrosis, Zonal                                                                                           7
Pigmentation (pigment deposition)                                                                         9
Apoptosis                                                                                                11
Infiltration, Mononuclear                                                                                12
Cytoplasmic alteration (Basophilic/glycogen depletion)                                                   16
Hypertrophy/Hyperplasia     