In [1]:
import faiss
print("FAISS version:", faiss.__version__)


ModuleNotFoundError: No module named 'faiss'

In [2]:
import faiss
print("FAISS:", faiss.__version__)


FAISS: 1.12.0


In [3]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path
import json


In [4]:
processed_dir = Path("../data/processed")

# BindingDB standardized
df_bindingdb = pd.read_parquet(processed_dir / "bindingdb_standardized.parquet")
print("BindingDB:", df_bindingdb.shape)

# TDC datasets
import glob
tdc_files = glob.glob(str(processed_dir / "tdc_*.parquet"))
dfs_tdc = [pd.read_parquet(f) for f in tdc_files]
df_tdc = pd.concat(dfs_tdc, ignore_index=True)
print("TDC:", df_tdc.shape)

# Merge all
df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)
print("Unified dataset:", df_all.shape)
df_all.head(3)


BindingDB: (439214, 7)
TDC: (623507, 7)
Unified dataset: (1062721, 7)


  df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)


Unnamed: 0,smiles,inchikey,ligand_name,target,activity_type,value,pValue
0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,,,Ki,0.24,9.619789
1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,UZLMEAPBHYEHAC-UNTBESQGSA-N,,,Ki,0.25,9.60206
2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,,,Ki,0.41,9.387216


In [5]:
def build_text(row):
    ligand = row.get("ligand_name", "")
    target = row.get("target", "")
    act_type = row.get("activity_type", "")
    val = row.get("value", "")
    pval = row.get("pValue", "")
    return f"Ligand: {ligand}, Target: {target}, Activity: {act_type}, Value: {val}, pValue: {pval}"

df_all["evidence_text"] = df_all.apply(build_text, axis=1)
df_all["evidence_text"].head(3)


0    Ligand: None, Target: None, Activity: Ki, Valu...
1    Ligand: None, Target: None, Activity: Ki, Valu...
2    Ligand: None, Target: None, Activity: Ki, Valu...
Name: evidence_text, dtype: object

In [6]:
# Use a lightweight sentence transformer
model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(df_all["evidence_text"].tolist(),
                          batch_size=64, show_progress_bar=True)

embeddings = np.array(embeddings).astype("float32")
print("Embeddings shape:", embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16606 [00:00<?, ?it/s]

Embeddings shape: (1062721, 384)


In [5]:
import pandas as pd
from pathlib import Path
import glob

processed_dir = Path("../data/processed")

# Load BindingDB
df_bindingdb = pd.read_parquet(processed_dir / "bindingdb_standardized.parquet")
print("BindingDB:", df_bindingdb.shape)

# Load TDC datasets
tdc_files = glob.glob(str(processed_dir / "tdc_*.parquet"))
dfs_tdc = [pd.read_parquet(f) for f in tdc_files]
df_tdc = pd.concat(dfs_tdc, ignore_index=True)
print("TDC:", df_tdc.shape)

# Merge all
df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)
print("Unified dataset:", df_all.shape)


BindingDB: (439214, 7)
TDC: (623507, 7)
Unified dataset: (1062721, 7)


  df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)


In [6]:
def build_text(row):
    return (
        f"Ligand: {row.get('ligand_name', '')}, "
        f"Target: {row.get('target', '')}, "
        f"Activity: {row.get('activity_type', '')}, "
        f"Value: {row.get('value', '')}, "
        f"pValue: {row.get('pValue', '')}"
    )

df_all["evidence_text"] = df_all.apply(build_text, axis=1)
df_all["evidence_text"].head(3)


0    Ligand: None, Target: None, Activity: Ki, Valu...
1    Ligand: None, Target: None, Activity: Ki, Valu...
2    Ligand: None, Target: None, Activity: Ki, Valu...
Name: evidence_text, dtype: object

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(
    df_all["evidence_text"].tolist(),
    batch_size=64,
    show_progress_bar=True
)

embeddings = np.array(embeddings).astype("float32")
print("Embeddings shape:", embeddings.shape)


Batches:   0%|          | 0/16606 [00:00<?, ?it/s]

Embeddings shape: (1062721, 384)


In [3]:
import sys
print(sys.executable)


/Users/guruganesh/miniconda3/envs/ragops/bin/python


In [5]:
import pandas as pd
from pathlib import Path
import glob

processed_dir = Path("../data/processed")

# Load BindingDB
df_bindingdb = pd.read_parquet(processed_dir / "bindingdb_standardized.parquet")
print("BindingDB:", df_bindingdb.shape)

# Load all TDC datasets
tdc_files = glob.glob(str(processed_dir / "tdc_*.parquet"))
dfs_tdc = [pd.read_parquet(f) for f in tdc_files]
df_tdc = pd.concat(dfs_tdc, ignore_index=True)
print("TDC:", df_tdc.shape)

# Merge everything
df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)
print("Unified dataset:", df_all.shape)

# Add evidence text
def build_text(row):
    return (
        f"Ligand: {row.get('ligand_name', '')}, "
        f"Target: {row.get('target', '')}, "
        f"Activity: {row.get('activity_type', '')}, "
        f"Value: {row.get('value', '')}, "
        f"pValue: {row.get('pValue', '')}"
    )

df_all["evidence_text"] = df_all.apply(build_text, axis=1)
print(df_all["evidence_text"].head(3))



BindingDB: (439214, 7)
TDC: (623507, 7)
Unified dataset: (1062721, 7)


  df_all = pd.concat([df_bindingdb, df_tdc], ignore_index=True)


0    Ligand: None, Target: None, Activity: Ki, Valu...
1    Ligand: None, Target: None, Activity: Ki, Valu...
2    Ligand: None, Target: None, Activity: Ki, Valu...
Name: evidence_text, dtype: object


In [6]:
df_sample = df_all.sample(5000, random_state=42)

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(
    df_sample["evidence_text"].tolist(),
    batch_size=64,
    show_progress_bar=True
)

embeddings = np.array(embeddings).astype("float32")
print("Sample embeddings shape:", embeddings.shape)


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Sample embeddings shape: (5000, 384)


In [9]:
# Build FAISS index on sample embeddings
import faiss

faiss.normalize_L2(embeddings)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
print("Index size:", index.ntotal)

# Map FAISS IDs back to DataFrame rows
id_to_row = dict(enumerate(df_sample.to_dict(orient="records")))


Index size: 5000


In [10]:
def search(query, k=5):
    q_emb = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    scores, idxs = index.search(q_emb, k)
    results = []
    for score, i in zip(scores[0], idxs[0]):
        results.append((score, id_to_row[i]))
    return results

# Example queries
queries = [
    "Find ligands active on EGFR",
    "Compounds with IC50 under 10 nM against HER2"
]

for q in queries:
    print(f"\n🔎 Query: {q}")
    for score, row in search(q, k=3):
        print(f"  • {row['ligand_name']} | Target: {row['target']} | {row['activity_type']}={row['value']} (score={score:.3f})")



🔎 Query: Find ligands active on EGFR
  • None | Target: None | IC50= 17516 (score=0.576)
  • 87279791.0 | Target: MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK | BindingDB_Ki=0.6779999999999999 (score=0.566)
  • 87316315.0 | Target: MAQAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDADAAPLGAAPTPGIFSFQPESNPMPAVHRDMAARTSPLRPLVATAGPALSPVPPVVHLTLRRAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK | BindingDB_Ki=4.417 (score=0.563)

🔎 Query: Compounds with IC50 under 10 nM against HER2
  • None | Target: None | IC50=>2.50e+5 (score=0.464)
  • None | Target: None | IC50= 1851 (score=0.464)
  • None | Target: None | IC50=>15849 (score=0.459)


In [11]:
faiss.write_index(index, str(processed_dir / "evidence_faiss.index"))
print("✅ Index saved")


✅ Index saved


In [12]:
# Reload index
index2 = faiss.read_index(str(processed_dir / "evidence_faiss.index"))
print("Reloaded index size:", index2.ntotal)


Reloaded index size: 5000


In [13]:
def search(query, k=5):
    q_emb = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    scores, idxs = index2.search(q_emb, k)
    results = []
    for score, i in zip(scores[0], idxs[0]):
        results.append((score, id_to_row[i]))
    return results

for q in ["EGFR inhibitors", "HER2 IC50 < 10 nM"]:
    print(f"\n🔎 {q}")
    for score, row in search(q, k=3):
        print(f"  • {row['ligand_name']} | Target: {row['target']} | "
              f"{row['activity_type']}={row['value']} (score={score:.3f})")



🔎 EGFR inhibitors
  • 91663444.0 | Target: MLGNKRLGLSGLTLALSLLVCLGALAEAYPSKPDNPGEDAPAEDMARYYSALRHYINLITRQRYGKRSSPETLISDLLMRESTENVPRTRLEDPAMW | BindingDB_Ki=4.6 (score=0.362)
  • 44186669.0 | Target: MPPSISAFQAAYIGIEVLIALVSVPGNVLVIWAVKVNQALRDATFCFIVSLAVADVAVGALVIPLAILINIGPQTYFHTCLMVACPVLILTQSSILALLAIAVDRYLRVKIPLRYKMVVTPRRAAVAIAGCWILSFVVGLTPMFGWNNLSAVERAWAANGSMGEPVIKCEFEKVISMEYMVYFNFFVWVLPPLLLMVLIYLEVFYLIRKQLNKKVSASSGDPQKYYGKELKIAKSLALILFLFALSWLPLHILNCITLFCPSCHKPSILTYIAIFLTHGNSAMNPIVYAFRIQKFRVTFLKIWNDHFRCQPAPPIDEDLPEERPDD | BindingDB_Ki=2610.0 (score=0.348)
  • 137796736.0 | Target: MGFQKFSPFLALSILVLLQAGSLHAAPFRSALESSPADPATLSEDEARLLLAALVQNYVQMKASELEQEQEREGSRIIAQKRACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAFGRRRRDLQA | BindingDB_Ki=0.05 (score=0.344)

🔎 HER2 IC50 < 10 nM
  • None | Target: None | IC50=>37500 (score=0.356)
  • None | Target: None | IC50=>130000 (score=0.355)
  • None | Target: None | IC50= 1020 (score=0.353)


In [14]:
import pickle

with open(processed_dir / "id_to_row.pkl", "wb") as f:
    pickle.dump(id_to_row, f)


In [15]:
with open(processed_dir / "id_to_row.pkl", "rb") as f:
    id_to_row = pickle.load(f)
