In [1]:
import pandas as pd
import numpy as np

# Load with semicolon delimiter
df = pd.read_csv("../data/chembl_egfr_bioactivity.csv", sep=";")

# Keep only rows where Standard Type is IC50 and relation is '='
df = df[(df["Standard Type"] == "IC50") & (df["Standard Relation"] == "'='")]

# Drop rows with missing SMILES or IC50
df = df.dropna(subset=["Smiles", "Standard Value"])

# Convert IC50 (nM) to pIC50 = -log10(IC50 [M])
df["IC50_M"] = df["Standard Value"].astype(float) * 1e-9
df["pIC50_calc"] = -np.log10(df["IC50_M"])

# Use existing pChEMBL Value if available, else use pIC50_calc
df["pIC50"] = df["pChEMBL Value"]
df["pIC50"] = df["pIC50"].fillna(df["pIC50_calc"])

# Select final columns
df_clean = df[["Smiles", "Standard Value", "Standard Units", "pIC50", "Molecule ChEMBL ID"]].copy()

# Save cleaned version
df_clean.to_csv("../data/chembl_egfr_bioactivity_clean.csv", index=False)
print("✅ Cleaned data saved to data/chembl_egfr_bioactivity_clean.csv")


✅ Cleaned data saved to data/chembl_egfr_bioactivity_clean.csv


  df = pd.read_csv("../data/chembl_egfr_bioactivity.csv", sep=";")
