<center> <h1> Week 3 </h1> <center>


<h3> Aim: </h3>


<h3>Method: </h3> 

In [None]:
import urllib.request
import gzip
import os
import csv
import subprocess
import time
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem
from openbabel import pybel
import tempfile
import shutil
import pandas as pd


In [9]:
# Cell 1 – Download FDA-approved drugs with SMILES (1,691 compounds, SMILES ready)
!mkdir -p ../data/fda_library
!curl -L https://www.cureffi.org/wp-content/uploads/2013/10/drugs.txt -o ../data/fda_library/fda.smi
print("FDA library downloaded → 1,691 compounds with SMILES")

smi_file = "../data/fda_library/fda.smi"

# Read the file (tab-separated: name SMILES other_columns)
df = pd.read_csv(smi_file, sep="\t")
print(f"FDA library loaded → {len(df)} compounds")
print("\nFirst 5 entries:")
print(df.head())

# Optional: count CNS-active drugs (column 3 = TRUE/FALSE)
cns_count = df[df.iloc[:, 2] == "TRUE"].shape[0]
print(f"\nNumber of known CNS-active drugs in library: {cns_count}")

The syntax of the command is incorrect.


FDA library downloaded → 1,691 compounds with SMILES
FDA library loaded → 1691 compounds

First 5 entries:
  generic_name  cns_drug                                             smiles
0     Abacavir     False  NC1=NC2=C(N=CN2[C@@H]2C[C@H](CO)C=C2)C(NC2CC2)=N1
1     Abarelix     False                                                NaN
2    Abatacept     False                                                NaN
3    Abciximab     False                                                NaN
4  Abiraterone     False  CC(=O)O[C@H]1CC[C@]2(C)C3CC[C@@]4(C)C(CC=C4C4=...

Number of known CNS-active drugs in library: 0


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  121k  100  121k    0     0   627k      0 --:--:-- --:--:-- --:--:--  673k


In [11]:
df = pd.read_csv("../data/fda_library/fda.smi", sep="\t", usecols=[0,2])
df.to_csv("../data/fda_library/fda_clean.smi", sep="\t", index=False)

print(f"Cleaned FDA library ready → {len(df)} compounds")
print("\nFirst 5 entries:")
print(df.head())

Cleaned FDA library ready → 1691 compounds

First 5 entries:
  generic_name                                             smiles
0     Abacavir  NC1=NC2=C(N=CN2[C@@H]2C[C@H](CO)C=C2)C(NC2CC2)=N1
1     Abarelix                                                NaN
2    Abatacept                                                NaN
3    Abciximab                                                NaN
4  Abiraterone  CC(=O)O[C@H]1CC[C@]2(C)C3CC[C@@]4(C)C(CC=C4C4=...


In [None]:
df = pd.read_csv("../data/fda_library/fda_clean.smi", sep="\t")

# Helpers: OpenBabel/pybel conversion and metal detection
def is_metal_smiles(smiles):
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            return False
        return any(a.GetAtomicNum() > 20 for a in m.GetAtoms())
    except Exception:
        return False

def smiles_to_pdbqt_via_pybel(smiles, out_path, ff_steps=250):
    # Create pybel molecule from SMILES, build 3D, optimize and write PDBQT
    pb = pybel.readstring('smi', smiles)
    try:
        pb.addh()
    except Exception:
        pass
    try:
        pb.make3D()
    except Exception:
        pass
    # Prefer MMFF94, fall back to UFF if unavailable
    try:
        pb.localopt(ff='mmff94', steps=ff_steps)
    except Exception:
        try:
            pb.localopt(ff='uff', steps=ff_steps)
        except Exception:
            pass
    try:
        pb.calccharges('gasteiger')
    except Exception:
        pass
    pb.write('pdbqt', out_path, overwrite=True)

def rdkit_mol_to_pdbqt_via_pybel(rdmol, out_path):
    # Convert RDKit Mol (with 3D coords) to pdbqt via pybel (reads MolBlock)
    mol_block = Chem.MolToMolBlock(rdmol)
    pb = pybel.readstring('mol', mol_block)
    try:
        pb.addh()
    except Exception:
        pass
    try:
        pb.calccharges('gasteiger')
    except Exception:
        pass
    pb.write('pdbqt', out_path, overwrite=True)

output_dir = "../data/fda_pdbqt"
Path(output_dir).mkdir(exist_ok=True)

success = 0
failed = 0

for idx, row in df.iterrows():
    name = row["generic_name"].replace("/", "_")  # safe filename
    smiles = row["smiles"]
    if pd.isna(smiles):
        failed += 1
        continue

    out_path = os.path.join(output_dir, f"{name}.pdbqt")

    # Route metal-containing complexes through pybel directly (avoid RDKit UFF)
    if is_metal_smiles(smiles):
        try:
            smiles_to_pdbqt_via_pybel(smiles, out_path)
        except Exception as e:
            failed += 1
            try:
                dbg = Path(output_dir) / f"pybel_metal_error_{idx}_{name}.txt"
                dbg.write_text(str(e))
            except Exception:
                pass
            continue
        success += 1
        if (success + failed) % 200 == 0:
            print(f"Processed {success + failed} compounds | Success: {success} | Failed: {failed}")
        continue

    # Non-metal: try RDKit embedding + conversion; fallback to pybel on failure
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError('RDKit failed to parse SMILES')
        mol = Chem.AddHs(mol)
        
        if emb != 0:
            raise ValueError('RDKit embed failure')

    except Exception as e:
        # fallback: try pybel from SMILES (may handle odd cases better)
        try:
            smiles_to_pdbqt_via_pybel(smiles, out_path)
        except Exception as e2:
            failed += 1
            try:
                dbg = Path(output_dir) / f"conversion_error_{idx}_{name}.txt"
                dbg.write_text(f"rdkit_err: {e}\npybel_err: {e2}")
            except Exception:
                pass
            continue
    else:
        success += 1
    
    if (success + failed) % 200 == 0:
        print(f"Processed {success + failed} compounds | Success: {success} | Failed: {failed}")

print(f"\nBATCH COMPLETE!")
print(f"   Successful 3D PDBQT: {success}")
print(f"   Failed (no SMILES, embed failure, or conversion error): {failed}")
print(f"   Files saved in {output_dir}/")

In [None]:
receptor = "../data/protein_rigid.pdbqt"
ligand_dir = "../data/fda_pdbqt"
out_dir = "../results/fda_poses"
Path(out_dir).mkdir(exist_ok=True)

results = []
start_time = time.time()

for i, pdbqt_file in enumerate(sorted(Path(ligand_dir).glob("*.pdbqt")), 1):
    name = pdbqt_file.stem
    out_file = f"{out_dir}/{name}_out.pdbqt"
    
    cmd = [
        "vina", "--receptor", receptor, "--ligand", str(pdbqt_file),
        "--center_x", "24.2", "--center_y", "50.1", "--center_z", "92.2",
        "--size_x", "25", "--size_y", "25", "--size_z", "25",
        "--exhaustiveness", "32", "--num_modes", "1", "--energy_range", "3",
        "--out", out_file
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    # Parse top affinity
    if "   1 " in result.stdout:
        for line in result.stdout.splitlines():
            if line.startswith("   1 "):
                parts = line.split()
                affinity = float(parts[1])
                results.append({"Drug_Name": name, "Affinity_kcal_mol": affinity})
                break
    
    if i % 100 == 0:
        elapsed = (time.time() - start_time) / 60
        print(f"Screened {i}/1497 compounds | Elapsed: {elapsed:.1f} min | Current top: {affinity if results else 'N/A'} kcal/mol")

# Save results
df_results = pd.DataFrame(results)
df_results = df_results.sort_values("Affinity_kcal_mol")
df_results.to_csv("../results/docking_scores.csv", index=False)

print("\nSCREENING COMPLETE!")
print(f"Total docked: {len(df_results)}")
print("Top 10 hits:")
print(df_results.head(10))