# Chemistry Tools Demo

This notebook exercises the new chemistry modules (imported below from `rag.chemistry`):
- `chemistry.parsers` - SMILES parsing and validation
- `chemistry.similarity` - Molecular fingerprint similarity
- `chemistry.chembl` - ChEMBL database queries

In [None]:
# Imports
from rag.chemistry import (
    # Parsers
    validate_smiles,
    parse_smiles,
    canonicalize_smiles,
    name_to_smiles,
    get_pubchem_info,
    # Similarity
    MolecularSimilarity,
    compute_similarity_matrix,
    # ChEMBL
    ChEMBLDatabase,
)

## 1. SMILES Parsing

In [None]:
# Run validate_smiles over two well-formed molecules, a garbage string,
# and the empty string, printing one aligned row per input.
test_smiles = [
    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    "invalid_smiles",
    "",
]

for candidate in test_smiles:
    shown = candidate[:30]  # truncate so the first column stays 30 wide
    is_valid = validate_smiles(candidate)
    print(f"{shown:30} -> valid: {is_valid}")

In [None]:
# Parse aspirin and print the descriptors exposed on the parsed object.
aspirin = parse_smiles("CC(=O)OC1=CC=CC=C1C(=O)O")
print("Molecule: Aspirin")

# (label shown to the user, attribute read from the parsed molecule)
property_rows = (
    ("Formula", "formula"),
    ("MW", "molecular_weight"),
    ("LogP", "logp"),
    ("TPSA", "tpsa"),
    ("HBD", "num_hbd"),
    ("HBA", "num_hba"),
    ("Rotatable bonds", "num_rotatable_bonds"),
)
for label, attr_name in property_rows:
    # Attributes are read one at a time, right before each line is printed.
    print(f"  {label}: {getattr(aspirin, attr_name)}")

In [None]:
# Show a SMILES string next to the form returned by canonicalize_smiles.
original = "c1ccccc1C(=O)O"  # Benzoic acid (non-canonical)
canonical = canonicalize_smiles(original)
print(
    f"Original:  {original}",
    f"Canonical: {canonical}",
    sep="\n",
)

## 2. Name to SMILES (PubChem)

In [None]:
# Convert drug names to SMILES and print a short preview of each result.
drug_names = ["imatinib", "aspirin", "metformin", "caffeine"]
max_preview = 50  # longest SMILES prefix shown per row

for name in drug_names:
    smiles = name_to_smiles(name)
    if not smiles:
        # A failed lookup is reported as-is, without a misleading ellipsis.
        preview = "Not found"
    elif len(smiles) > max_preview:
        # Only mark the string as truncated when it actually was.
        preview = smiles[:max_preview] + "..."
    else:
        preview = smiles
    print(f"{name:15} -> {preview}")

In [None]:
# Fetch the PubChem record for imatinib and print selected fields.
info = get_pubchem_info("imatinib")
if info:
    # (label shown to the user, key read from the returned mapping)
    pubchem_fields = (
        ("PubChem CID", "cid"),
        ("IUPAC", "iupac_name"),
        ("MW", "molecular_weight"),
        ("XLogP", "xlogp"),
        ("TPSA", "tpsa"),
    )
    for label, key in pubchem_fields:
        print(f"{label}: {info[key]}")

## 3. Molecular Similarity

In [None]:
# Similarity searcher configured with the "morgan" fingerprint and radius 2.
sim = MolecularSimilarity(fingerprint="morgan", radius=2)

# Reference molecules as (name, SMILES) pairs; expanded below into the
# {"name": ..., "smiles": ...} records that the following cells consume.
name_smiles_pairs = (
    ("Aspirin", "CC(=O)OC1=CC=CC=C1C(=O)O"),
    ("Salicylic acid", "OC(=O)C1=CC=CC=C1O"),
    ("Benzoic acid", "OC(=O)C1=CC=CC=C1"),
    ("Ibuprofen", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"),
    ("Paracetamol", "CC(=O)NC1=CC=C(C=C1)O"),
    ("Caffeine", "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"),
)
compounds = [
    {"name": label, "smiles": structure}
    for label, structure in name_smiles_pairs
]

In [None]:
# Rank the reference compounds by similarity to aspirin.
query = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin

# Fresh records (smiles first, then name) so `compounds` itself is not shared.
database = [
    dict(smiles=entry["smiles"], name=entry["name"])
    for entry in compounds
]

# At most 5 hits, with a threshold of 0.2 passed to the searcher.
results = sim.find_similar(query, database, top_k=5, threshold=0.2)

print("Compounds similar to Aspirin:")
for hit in results:
    print(f"  {hit.name:20} Tanimoto: {hit.similarity:.3f}")

In [None]:
# Pairwise similarity matrix for the reference compounds, shown as a heat map.
import pandas as pd

names = [entry["name"] for entry in compounds]
smiles_list = [entry["smiles"] for entry in compounds]

matrix = compute_similarity_matrix(smiles_list, fingerprint="morgan")

# Label both axes with compound names.
df = pd.DataFrame(matrix, index=names, columns=names)

# Keep the Styler as the final expression so the notebook renders it.
df.style.background_gradient(cmap="YlOrRd", vmin=0, vmax=1)

## 4. ChEMBL Database

ChEMBL 36 SQLite database: `~/data/chembl/chembl_36/chembl_36_sqlite/chembl_36.db`

In [None]:
# Locate the ChEMBL SQLite database.
#
# `chembl_path` stays a Path when the file exists and is reset to None when it
# does not. The later cells guard on `chembl_path is None` / `if chembl_path:`,
# and a Path object is always truthy (and never None) even when the file is
# missing, so without the reset those guards could never skip.
from pathlib import Path

chembl_path = Path.home() / "data/chembl/chembl_36/chembl_36_sqlite/chembl_36.db"

if chembl_path.exists():
    print(f"ChEMBL found: {chembl_path}")
else:
    print(f"ChEMBL not found at: {chembl_path}")
    chembl_path = None  # signal "database unavailable" to the cells below

In [None]:
# Open ChEMBL and look up imatinib (CHEMBL941), or skip when the database is
# not available. A Path is never None, so the guard also checks that the file
# exists instead of relying on `chembl_path is None` alone.
if chembl_path is None or not chembl_path.exists():
    print("Skipping ChEMBL tests - database not found")
else:
    db = ChEMBLDatabase(chembl_path)

    # Look up imatinib
    imatinib = db.get_compound("CHEMBL941")
    if imatinib:
        print(f"ChEMBL ID: {imatinib.chembl_id}")
        print(f"Name: {imatinib.name}")
        print(f"Type: {imatinib.molecule_type}")
        print(f"Max Phase: {imatinib.max_phase}")
        print(f"First Approval: {imatinib.first_approval}")
        print(f"SMILES: {imatinib.smiles[:60]}...")

In [None]:
# Get IC50 bioactivity data for imatinib (CHEMBL941).
# Guard on the file existing: a Path is always truthy, so `if chembl_path:`
# alone would run this cell even when the database is missing.
if chembl_path is not None and chembl_path.exists():
    activities = db.get_activities("CHEMBL941", activity_type="IC50", limit=10)

    print(f"\nIC50 values for Imatinib ({len(activities)} results):")
    for act in activities[:10]:
        print(f"  {act.target_name[:40]:40} {act.value:>10} {act.units}")

In [None]:
# Search ChEMBL compounds by name.
# Guard on the file existing: a Path is always truthy, so `if chembl_path:`
# alone would run this cell even when the database is missing.
if chembl_path is not None and chembl_path.exists():
    results = db.search_by_name("metformin", limit=5)

    print("Search results for 'metformin':")
    for r in results:
        print(f"  {r.chembl_id}: {r.name}")

In [None]:
# List approved drugs from ChEMBL.
# Guard on the file existing: a Path is always truthy, so `if chembl_path:`
# alone would run this cell even when the database is missing.
if chembl_path is not None and chembl_path.exists():
    drugs = db.get_approved_drugs(limit=10)

    print("Recent approved drugs:")
    for d in drugs:
        print(f"  {d.chembl_id}: {d.name} ({d.first_approval})")

## 5. Combined Workflow

Example: find approved drugs in ChEMBL that are structurally similar to a query compound (imatinib).

In [None]:
# Combined workflow: resolve imatinib to SMILES, then rank approved ChEMBL
# drugs by fingerprint similarity to it.
# Guard on the file existing: a Path is always truthy, so `if chembl_path:`
# alone would run this cell even when the database is missing.
if chembl_path is not None and chembl_path.exists():
    # Get imatinib SMILES
    imatinib_smiles = name_to_smiles("imatinib")
    print("Query: Imatinib")

    if not imatinib_smiles:
        # The name lookup can come back empty; slicing it would raise, and
        # there is nothing to compare against, so stop here.
        print("SMILES: Not found - skipping similarity search")
    else:
        print(f"SMILES: {imatinib_smiles[:50]}...")

        # Get approved drugs from ChEMBL, dropping entries without a SMILES.
        approved = db.get_approved_drugs(limit=1000)
        drug_db = [
            {"smiles": d.smiles, "name": d.name, "chembl_id": d.chembl_id}
            for d in approved
            if d.smiles
        ]

        # Find similar
        sim = MolecularSimilarity(fingerprint="morgan")
        similar = sim.find_similar(imatinib_smiles, drug_db, top_k=10, threshold=0.3)

        print("\nApproved drugs similar to Imatinib:")
        for r in similar:
            print(f"  {r.chembl_id}: {r.name:30} (Tanimoto: {r.similarity:.3f})")

## Summary

Chemistry tools implemented:
- ✅ SMILES validation and parsing
- ✅ Molecular property calculation
- ✅ Name to SMILES conversion (PubChem)
- ✅ Fingerprint similarity search
- ✅ ChEMBL database queries

Next steps:
- Agent orchestrator (agents/orchestrator.py)
- Tool definitions for LangChain (agents/tools.py)
- ADME prediction (prediction/adme.py)