### Data Processing for Random Forests and XGBoost: ECFP feature generation

In [4]:
import pandas as pd
from pathlib import Path

# Heuristic: the project root is the nearest directory, starting at the current
# working directory and walking upwards, that contains data/processed.
def get_project_root():
    """Locate the project root by searching upwards for ``data/processed``.

    Returns:
        Path: the resolved current working directory, or its closest ancestor,
        that contains a ``data/processed`` entry. When no candidate matches,
        the resolved current working directory is returned as a fallback.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    # next() with a default expresses "first match, otherwise fall back to cwd".
    return next(
        (folder for folder in candidates if (folder / "data" / "processed").exists()),
        start,
    )

# Resolve the project root once; every path below is derived from it.
PROJECT_ROOT = get_project_root()
print("PROJECT_ROOT:", PROJECT_ROOT)

PROC = PROJECT_ROOT / "data" / "processed"
print("Processed dir exists:", PROC.exists())
# get_project_root() falls back to the current directory when no data/processed
# is found, so PROC may not exist; an unguarded iterdir() would then raise
# right after the line above printed False. Sorting makes the listing order
# deterministic instead of filesystem-dependent.
print("Contents:", sorted(PROC.iterdir()) if PROC.is_dir() else [])

train_path = PROC / "train_brd4_50k_clean_blocks.parquet"
test_path  = PROC / "test_brd4_50k_clean_blocks.parquet"

print("Train path:", train_path)
print("Test path :", test_path)

# Fail early with a single message that names every missing input file.
missing_inputs = [str(p) for p in (train_path, test_path) if not p.is_file()]
if missing_inputs:
    raise FileNotFoundError(
        "Missing processed input file(s): " + ", ".join(missing_inputs)
    )

train_df = pd.read_parquet(train_path)
test_df  = pd.read_parquet(test_path)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

# Last expression of the cell: rich preview of the first training rows.
train_df.head()

PROJECT_ROOT: /Users/fut_payi/Desktop/F.I.T-Proteins
Processed dir exists: True
Contents: [PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/test_brd4_50k_clean.parquet'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/y_train_full.npy'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/prep_metadata.joblib'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/.DS_Store'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/X_test.npz'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/train_brd4_50k_clean.parquet'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/.gitkeep'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/train_brd4_50k_clean_blocks.parquet'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/test_brd4_50k_clean_blocks.parquet'), PosixPath('/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/chemistry_blocks_metadata.joblib'), 

Unnamed: 0,id,protein_name,molecule_smiles,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,binds,smiles_clean,buildingblock1_smiles_clean,buildingblock2_smiles_clean,buildingblock3_smiles_clean,shared_block_any,split_group
0,9051,BRD4,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCOCC(=C)C)nc(Nc...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C=C(C)COCCN.Cl,COc1ncccc1N,0,C#CC[C@@H](CC(N)=O)Nc1nc(NCCOCC(=C)C)nc(Nc2ccc...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C=C(C)COCCN.Cl,COc1ncccc1N,True,train_in
1,71760,BRD4,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCC2OCCC2(C)C)nc...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CC1(C)CCOC1CCN,CC(CN)c1c(Cl)cccc1Cl,0,C#CC[C@@H](CC(N)=O)Nc1nc(NCCC2OCCC2(C)C)nc(NCC...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CC1(C)CCOC1CCN,CC(CN)c1c(Cl)cccc1Cl,True,val_ood
2,92688,BRD4,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCC(C)(O)CC)nc(N...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CCC(C)(O)CCN,Nc1cc(Br)ccn1,0,C#CC[C@@H](CC(N)=O)Nc1nc(NCCC(C)(O)CC)nc(Nc2cc...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CCC(C)(O)CCN,Nc1cc(Br)ccn1,True,train_in
3,120681,BRD4,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCCCOCC)nc(NCC2CC...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CCOCCCN,Cl.NCC1CCCOC1,0,C#CC[C@@H](CC(N)=O)Nc1nc(NCCCOCC)nc(NCC2CCCOC2)n1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CCOCCCN,Cl.NCC1CCCOC1,True,train_in
4,39303,BRD4,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(NCc2nnc(C(C)(C)C)...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CC(C)(C)c1nnc(CN)s1.Cl,Nc1ccnc(-c2ccccc2)c1,0,C#CC[C@@H](CC(N)=O)Nc1nc(NCc2nnc(C(C)(C)C)s2)n...,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,CC(C)(C)c1nnc(CN)s1.Cl,Nc1ccnc(-c2ccccc2)c1,True,train_in


In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy import sparse
import joblib

# --- Parameters ---
radius = 2            # radius passed to the Morgan fingerprint call
n_bits = 2048         # width of the bit vector (number of feature columns)
use_chirality = True  # forwarded to RDKit as useChirality

def smiles_to_ecfp(smiles):
    """Convert a SMILES string into an ``n_bits``-wide ECFP fingerprint (sparse row).

    The fingerprint settings come from the module-level ``radius``, ``n_bits``
    and ``use_chirality`` values.

    Args:
        smiles: SMILES string of the molecule. Any non-string value (e.g. the
            None/NaN of a missing DataFrame cell) is treated like an
            unparseable SMILES.

    Returns:
        scipy.sparse.csr_matrix of shape ``(1, n_bits)`` and dtype int8; it is
        all zeros when no molecule could be built from ``smiles``.
    """
    # A missing value would otherwise be handed to the RDKit parser, which
    # expects a string, and a single bad cell would abort the whole run.
    # Route it through the same "all-zero row" policy as a parse failure.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Return an all-zero vector if RDKit can't parse
        return sparse.csr_matrix((1, n_bits), dtype=np.int8)

    fp = AllChem.GetMorganFingerprintAsBitVect(
        mol,
        radius,
        nBits=n_bits,
        useChirality=use_chirality
    )

    # RDKit fills the 1-D row view in place; the 2-D buffer is then sparsified.
    bits = np.zeros((1, n_bits), dtype=np.int8)
    Chem.DataStructs.ConvertToNumpyArray(fp, bits[0])
    return sparse.csr_matrix(bits)

# --- Build X_train / X_test ---
def _featurize(smiles_list, label):
    """Stack one fingerprint row per SMILES into a CSR matrix.

    Prints a warning with the number of all-zero rows, because
    smiles_to_ecfp() returns an all-zero row when RDKit cannot parse a SMILES
    and those failures would otherwise go unnoticed.
    """
    matrix = sparse.vstack([smiles_to_ecfp(s) for s in smiles_list], format="csr")
    n_empty = int((matrix.getnnz(axis=1) == 0).sum())
    if n_empty:
        print(f"WARNING: {n_empty} {label} molecule(s) produced an all-zero fingerprint")
    return matrix

train_smiles = train_df["smiles_clean"].tolist()
test_smiles  = test_df["smiles_clean"].tolist()

print("Computing ECFP for train…")
X_train = _featurize(train_smiles, "train")

print("Computing ECFP for test…")
X_test = _featurize(test_smiles, "test")

# --- Labels and IDs ---
y_train = train_df["binds"].values.astype(int)
ids_train = train_df["id"].values
ids_test  = test_df["id"].values

# Features, labels and ids are saved to separate files, so they must stay
# row-aligned; check before anything is written.
assert X_train.shape[0] == y_train.shape[0] == ids_train.shape[0], \
    "train features, labels and ids have different row counts"
assert X_test.shape[0] == ids_test.shape[0], \
    "test features and ids have different row counts"

# --- Save artifacts ---
# NOTE: files with these names in data/processed are replaced if they exist.
out_dir = PROJECT_ROOT / "data" / "processed"

sparse.save_npz(out_dir / "X_train_full.npz", X_train)
sparse.save_npz(out_dir / "X_test.npz", X_test)

np.save(out_dir / "y_train_full.npy", y_train)
np.save(out_dir / "ids_train_full.npy", ids_train)
np.save(out_dir / "ids_test.npy", ids_test)

# Metadata for reproducibility
meta = {
    "radius": radius,
    "n_bits": n_bits,
    "use_chirality": use_chirality,
    "train_path": str(train_path),
    "test_path": str(test_path)
}
joblib.dump(meta, out_dir / "prep_metadata.joblib")

print("Done!")
print("Shapes:", X_train.shape, X_test.shape)


Computing ECFP for train…
Computing ECFP for test…
Done!
Shapes: (50000, 2048) (50000, 2048)


### Sanity checks: reload the saved artifacts and inspect shapes, class balance, and fingerprint density

In [6]:
from scipy import sparse
import numpy as np
import joblib
from pathlib import Path

out_dir = PROJECT_ROOT / "data" / "processed"

# Reload everything from disk so the checks below exercise the saved files
# rather than the in-memory objects of the cell that produced them.
X_train = sparse.load_npz(out_dir / "X_train_full.npz")
X_test  = sparse.load_npz(out_dir / "X_test.npz")

y_train  = np.load(out_dir / "y_train_full.npy")
ids_tr   = np.load(out_dir / "ids_train_full.npy")
ids_te   = np.load(out_dir / "ids_test.npy")
meta     = joblib.load(out_dir / "prep_metadata.joblib")

print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("ids_tr :", ids_tr.shape)
print("ids_te :", ids_te.shape)
print("meta   :", meta)

# Turn the visual shape comparison into real checks (placed after the prints
# so the shapes are visible when one of them fails).
assert X_train.shape[0] == y_train.shape[0] == ids_tr.shape[0], \
    "train features, labels and ids have different row counts"
assert X_test.shape[0] == ids_te.shape[0], \
    "test features and ids have different row counts"
assert X_train.shape[1] == X_test.shape[1] == meta["n_bits"], \
    "feature width does not match the n_bits recorded in the metadata"


X_train: (50000, 2048)
X_test : (50000, 2048)
y_train: (50000,)
ids_tr : (50000,)
ids_te : (50000,)
meta   : {'radius': 2, 'n_bits': 2048, 'use_chirality': True, 'train_path': '/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/train_brd4_50k_clean_blocks.parquet', 'test_path': '/Users/fut_payi/Desktop/F.I.T-Proteins/data/processed/test_brd4_50k_clean_blocks.parquet'}


In [7]:
# Class balance of the training labels: size, positive count, positive fraction.
n_tot = y_train.shape[0]
n_pos = y_train.sum()
pos_rate = y_train.mean()

# One print call; sep="\n" puts each report line on its own row.
print(
    f"Total train molecules: {n_tot}",
    f"Positives: {n_pos}",
    f"Positive rate: {pos_rate:.4f}",
    sep="\n",
)


Total train molecules: 50000
Positives: 269
Positive rate: 0.0054


In [8]:
# average number of active bits per molecule
# (nnz counts the stored entries of the sparse matrix, which equals the number
# of 1-bits as long as no explicit zeros are stored)
avg_bits_on = X_train.nnz / X_train.shape[0]
print(f"Average # of bits=1 per molecule: {avg_bits_on:.1f}")

# quick sanity: for up to 5 random molecules, show how many bits are on.
# A seeded generator keeps the sampled rows identical across re-runs, and
# capping the sample size avoids the ValueError that replace=False raises when
# the matrix has fewer than 5 rows.
sample_rng = np.random.default_rng(0)
rows = sample_rng.choice(X_train.shape[0], size=min(5, X_train.shape[0]), replace=False)
for r in rows:
    n_on = X_train[r].nnz
    print(f"Row {r}: bits on = {n_on}")


Average # of bits=1 per molecule: 68.7
Row 14105: bits on = 69
Row 21418: bits on = 81
Row 49095: bits on = 70
Row 1643: bits on = 64
Row 23637: bits on = 65
