<a href="https://colab.research.google.com/github/bordin89/ML4NGP_tutorials/blob/main/ML4NGP_Practical1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Colab Setup: Install TED tools, dependencies & Foldseek

import os, pathlib

# === 1. Setup workspace ======================================================
# Everything for the tutorial lives under /content/ted_workshop. The later cells
# rebuild these same paths from scratch, so the directory names must stay in sync.
BASE_DIR = pathlib.Path("/content").resolve()
WORK_DIR = BASE_DIR / "ted_workshop"
WORK_DIR.mkdir(exist_ok=True)

print(f"Working directory: {WORK_DIR}")
# All relative paths used by the shell commands below resolve against WORK_DIR.
os.chdir(WORK_DIR)

# === 2. Install Python dependencies ==========================================
# einops, natsort and pydantic are pinned; the remaining packages are unpinned.
print("Installing Python dependencies...")

!pip install -q \
    einops==0.6.1 \
    natsort==8.3.1 \
    pydantic==1.10.8 \
    pandas biopython tqdm requests py3Dmol rotary-embedding-torch pdb-tools

# ipywidgets backs the dropdowns used by the domain explorer cell.
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension


# === 3. Clone TED tools repository ===========================================
# The clone has no explicit target, so it lands in ./ted-tools (i.e. inside
# WORK_DIR); an existing checkout is left untouched.
if not (WORK_DIR / "ted-tools").exists():
    print("Cloning TED tools repository...")
    !git clone https://github.com/psipred/ted-tools.git
else:
    print("TED tools repository already present")

# === 4. Install Foldseek =====================================================
print("Installing Foldseek...")

# Remove any previous download/extraction so that re-running the cell always
# starts from a clean Foldseek directory.
!rm -rf foldseek foldseek.tar.gz

!wget -q https://mmseqs.com/foldseek/foldseek-linux-avx2.tar.gz -O foldseek.tar.gz
!mkdir -p foldseek
# --strip-components=1 drops the archive's top-level folder so the contents
# end up directly under ./foldseek.
!tar xzf foldseek.tar.gz -C foldseek --strip-components=1
!rm foldseek.tar.gz

FOLDSEEK_BIN = WORK_DIR / "foldseek" / "bin" / "foldseek"
# Prepend the Foldseek bin directory to PATH for this kernel and its subprocesses.
# NOTE: the entry is prepended again every time this cell is re-run.
os.environ["PATH"] = f"{WORK_DIR}/foldseek/bin:" + os.environ["PATH"]

print("Foldseek binary:", FOLDSEEK_BIN)
!ls -R foldseek

# Running the binary is the actual check that the download and extraction worked.
print("\nFoldseek version check:")
!{FOLDSEEK_BIN} version

# === 5. Create structure and results directories =============================
STRUCTURE_DIR   = WORK_DIR / "structures"
TED_INPUT_DIR   = WORK_DIR / "ted_input"
TED_RESULTS_DIR = WORK_DIR / "ted_results"

for d in [STRUCTURE_DIR, TED_INPUT_DIR, TED_RESULTS_DIR]:
    d.mkdir(exist_ok=True)

print("\nDirectory Setup Complete:")
print(f"  Structures:  {STRUCTURE_DIR}")
print(f"  TED Input:   {TED_INPUT_DIR}")
print(f"  TED Results: {TED_RESULTS_DIR}")
print(f"  Foldseek:    {FOLDSEEK_BIN}")

print("\nSetup complete! Ready for structure downloads.")

In [None]:
#@title Download 10 example structures (AFDB v6) into TED_INPUT_DIR

import os
import pathlib
import requests
from shutil import copy2

# Workspace paths (same as setup cell); redefined so this cell can be run after a
# kernel restart without re-running the setup cell's variable assignments.
BASE_DIR      = pathlib.Path("/content").resolve()
WORK_DIR      = BASE_DIR / "ted_workshop"
STRUCTURE_DIR = WORK_DIR / "structures"
TED_INPUT_DIR = WORK_DIR / "ted_input"

# Separate sub-folders per source database. Only AFDB_DIR is populated by this
# cell; PDB_DIR is created for use with download_pdb().
PDB_DIR  = STRUCTURE_DIR / "pdb"
AFDB_DIR = STRUCTURE_DIR / "afdb"
PDB_DIR.mkdir(parents=True, exist_ok=True)
AFDB_DIR.mkdir(parents=True, exist_ok=True)
TED_INPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"PDB directory:  {PDB_DIR}")
print(f"AFDB directory: {AFDB_DIR}")
print(f"TED input:      {TED_INPUT_DIR}\n")

# --- Example IDs -------------------------------------------------------------
# UniProt accessions whose AlphaFold DB models are fetched below.

af_uniprot_ids = [
    "P04637",  # p53
    "P68871",  # Hemoglobin beta
    "P69905",  # Hemoglobin alpha
    "P38398",  # BRCA1
    "P0CG47",  # Polyubiquitin-B (UBB)
    "P00533",  # EGFR
    "P01009",  # Alpha-1-antitrypsin
    "P05067",  # APP
    "P01112",  # HRAS
    "P02649",  # APOE
]

def download_pdb(pdb_id: str, out_dir: pathlib.Path, timeout: float = 60.0) -> pathlib.Path:
    """Download a PDB-format entry from RCSB into *out_dir*.

    The file is saved as ``<pdb_id>.pdb``. If that file already exists the
    download is skipped and the existing path is returned unchanged.

    Args:
        pdb_id: Entry identifier used verbatim in the RCSB download URL.
        out_dir: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        Path of the (possibly pre-existing) local file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    out_path = out_dir / f"{pdb_id}.pdb"
    if out_path.exists():
        print(f"{pdb_id}: already exists")
        return out_path
    print(f"Downloading PDB {pdb_id} ...")
    r = requests.get(url, timeout=timeout)
    # Fail before writing so an HTTP error page is never cached as a structure.
    r.raise_for_status()
    out_path.write_bytes(r.content)
    print(f"   ↳ saved to {out_path}")
    return out_path

def download_afdb(uniprot_id: str, out_dir: pathlib.Path, timeout: float = 60.0) -> pathlib.Path:
    """Download the AlphaFold DB v6 F1 model for *uniprot_id* into *out_dir*.

    The file is saved as ``AF-<uniprot_id>-F1-model_v6.pdb``. If that file
    already exists the download is skipped and the existing path is returned.

    Args:
        uniprot_id: UniProt accession used verbatim in the AFDB file URL.
        out_dir: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        Path of the (possibly pre-existing) local file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v6.pdb"
    out_path = out_dir / f"AF-{uniprot_id}-F1-model_v6.pdb"
    if out_path.exists():
        print(f"AFDB {uniprot_id}: already exists")
        return out_path
    print(f"Downloading AFDB v6 {uniprot_id} ...")
    r = requests.get(url, timeout=timeout)
    # Fail before writing so an HTTP error page is never cached as a structure.
    r.raise_for_status()
    out_path.write_bytes(r.content)
    print(f"   ↳ saved to {out_path}")
    return out_path

# --- Download all structures -------------------------------------------------
# download_afdb() returns the local path whether it downloaded the file or found
# it already on disk, so afdb_files always has one entry per accession.
afdb_files = [download_afdb(uid, AFDB_DIR) for uid in af_uniprot_ids]

# Stage a copy of every model in TED_INPUT_DIR, the directory handed to the TED
# consensus run. Files already staged are not overwritten.
print("\nCopying all structures into TED_INPUT_DIR ...")
for f in afdb_files:
    dest = TED_INPUT_DIR / f.name
    if not dest.exists():
        copy2(f, dest)
        print(f"   ↳ {f.name} -> {dest}")
    else:
        print(f"   ↳ {f.name} already present in TED_INPUT_DIR")

print("\nDone! Contents of TED_INPUT_DIR:")
!ls -1 "{TED_INPUT_DIR}"

In [None]:
#@title TED consensus setup (Merizo, UniDoc, Chainsaw, STRIDE, patch run_segmentation.sh)

import os, pathlib

# Paths inside the cloned ted-tools checkout. Each segmentation program gets its
# own folder under ted_consensus_1.0/programs.
BASE_DIR     = pathlib.Path("/content").resolve()
WORK_DIR     = BASE_DIR / "ted_workshop"
TED_CONS_DIR = WORK_DIR / "ted-tools" / "ted_consensus_1.0"
PROGRAMS_DIR = TED_CONS_DIR / "programs"
MERIZO_DIR   = PROGRAMS_DIR / "merizo"
MERIZO_W_DIR = MERIZO_DIR / "weights"
UNIDOC_DIR   = PROGRAMS_DIR / "unidoc"
CHAINSAW_DIR = PROGRAMS_DIR / "chainsaw"
RUN_SCRIPT   = TED_CONS_DIR / "run_segmentation.sh"

# Raises FileNotFoundError if the ted-tools repository has not been cloned yet.
os.chdir(TED_CONS_DIR)
print("TED consensus dir:", TED_CONS_DIR)

PROGRAMS_DIR.mkdir(exist_ok=True)

# --- Merizo weights -------------------------------
BASE_URL = "https://github.com/psipred/Merizo/raw/main/weights"
WEIGHTS_FILES = ["weights_part_0.pt", "weights_part_1.pt", "weights_part_2.pt"]

MERIZO_W_DIR.mkdir(parents=True, exist_ok=True)

print("\nChecking Merizo weights in", MERIZO_W_DIR)
for fname in WEIGHTS_FILES:
    out_path = MERIZO_W_DIR / fname
    if out_path.exists():
        print(f"   ✔ {fname} already present")
        continue
    url = f"{BASE_URL}/{fname}"
    print(f"   Downloading {fname} ...")
    !wget -q "{url}" -O "{out_path}"
print("Merizo weights ready.\n")

# --- UniDoc ------------------------------------------------------------------
# Download the UniDoc tarball, unpack it under programs/ and rename the unpacked
# "UniDoc" folder to the lowercase "unidoc" path stored in UNIDOC_DIR.
UNIDOC_URL = "https://yanglab.qd.sdu.edu.cn/UniDoc/download/UniDoc_20250514.tgz"
UNIDOC_TGZ = PROGRAMS_DIR / "UniDoc_20250514.tgz"

if UNIDOC_DIR.exists():
    print("UniDoc already installed at", UNIDOC_DIR, "\n")
else:
    print("Downloading UniDoc from updated URL ...")
    # NOTE(review): --no-check-certificate disables TLS certificate verification,
    # so the authenticity of the downloaded archive is not checked.
    !wget -q --no-check-certificate "{UNIDOC_URL}" -O "{UNIDOC_TGZ}"

    print("Extracting UniDoc ...")
    !tar -xzf "{UNIDOC_TGZ}" -C "{PROGRAMS_DIR}"
    !rm "{UNIDOC_TGZ}"

    # A failed download or extraction shows up here as a missing "UniDoc" folder.
    unpacked_dir = PROGRAMS_DIR / "UniDoc"
    if unpacked_dir.exists():
        unpacked_dir.rename(UNIDOC_DIR)
        print("UniDoc unpacked to", UNIDOC_DIR)
    else:
        raise SystemExit("UniDoc folder not found after extraction.")

    # The helper script shipped with ted-tools is copied next to the UniDoc files.
    # This only happens on a fresh install (this else-branch); a missing helper
    # is reported but does not stop the cell.
    helper_script = TED_CONS_DIR / "scripts" / "Run_UniDoc_from_scratch_structure_afdb.py"
    if helper_script.exists():
        !cp "{helper_script}" "{UNIDOC_DIR}/"
        print("Copied Run_UniDoc_from_scratch_structure_afdb.py into UniDoc dir\n")
    else:
        print("Helper script not found at", helper_script, "\n")

# --- Refresh Chainsaw from upstream repo ---------------------------------
# Unlike the other tools, Chainsaw is always replaced: any existing folder is
# deleted and the repository is cloned again on every run of this cell.
print("Ensuring latest Chainsaw...")
if CHAINSAW_DIR.exists():
    !rm -rf "{CHAINSAW_DIR}"

!git clone -q https://github.com/JudeWells/chainsaw.git "{CHAINSAW_DIR}"
print("Chainsaw cloned into", CHAINSAW_DIR, "\n")

# --- Ensure STRIDE is built in programs/chainsaw/stride ------------------
# The STRIDE sources are expected as a tarball inside the Chainsaw checkout
# (either stride.tar.gz or stride.tgz); the binary is built in place with make.
STRIDE_DIR = CHAINSAW_DIR / "stride"
STRIDE_BIN = STRIDE_DIR / "stride"
STRIDE_TAR_GZ = STRIDE_DIR / "stride.tar.gz"
STRIDE_TGZ = STRIDE_DIR / "stride.tgz"

print("Checking STRIDE binary in", STRIDE_DIR)

if STRIDE_BIN.exists():
    print("STRIDE binary already present:", STRIDE_BIN)
else:
    # Make sure we have build tools: install build-essential only when `make`
    # is not already on PATH.
    print("STRIDE binary missing; preparing to build from tarball...")
    !command -v make >/dev/null 2>&1 || (apt-get update -qq && apt-get install -y -qq build-essential >/dev/null)

    # The tar/make commands below use relative paths, so switch into the
    # STRIDE directory for the duration of the build.
    os.chdir(STRIDE_DIR)

    if STRIDE_TAR_GZ.exists():
        print("Extracting stride.tar.gz ...")
        !tar -xzf "stride.tar.gz"
    elif STRIDE_TGZ.exists():
        print("Extracting stride.tgz ...")
        !tar -xzf "stride.tgz"
    else:
        # NOTE: when this aborts, the working directory is still STRIDE_DIR.
        raise SystemExit(" No stride.tar.gz or stride.tgz found in stride directory.")

    print(" Running make to build STRIDE...")
    !make
    !chmod +x "stride"

    # Restore the working directory used by the rest of this cell.
    os.chdir(TED_CONS_DIR)

    # The presence of the binary is the only build-success check performed.
    if STRIDE_BIN.exists():
        print("STRIDE binary built at", STRIDE_BIN, "\n")
    else:
        raise SystemExit("STRIDE build did not produce 'stride' binary.")

# --- Patch run_segmentation.sh: remove venv block cleanly --------------------
# The upstream script contains a block, starting at the line that sets
# VENV_DIR="ted_consensus" and running up to the next line that is exactly
# "fi". That whole block is replaced by a single explanatory comment so the
# script uses the notebook's global Python environment.
print("Patching run_segmentation.sh to remove virtualenv requirement...")

if not RUN_SCRIPT.exists():
    raise SystemExit(f"Cannot find {RUN_SCRIPT}")

original_text = RUN_SCRIPT.read_text()
lines = original_text.splitlines()
new_lines = []
in_venv_block = False
venv_removed = False

for line in lines:
    stripped = line.strip()

    if 'VENV_DIR="ted_consensus"' in line:
        # Start of the block: drop this line and leave a marker comment instead.
        in_venv_block = True
        venv_removed = True
        new_lines.append(
            '# [Colab] Using global Python environment instead of "ted_consensus" virtualenv.'
        )
        continue

    if in_venv_block:
        # Skip everything inside the block, including the closing "fi".
        if stripped == "fi":
            in_venv_block = False
        continue

    new_lines.append(line)

if in_venv_block:
    # No closing "fi" was seen: every line after the VENV_DIR assignment has
    # been dropped. Abort before writing so the script is not truncated.
    raise SystemExit(
        f"Virtualenv block in {RUN_SCRIPT} has no closing 'fi'; script left unchanged."
    )

patched_text = "\n".join(new_lines)
# splitlines()/join would otherwise silently drop the file's final newline.
if original_text.endswith("\n"):
    patched_text += "\n"

RUN_SCRIPT.write_text(patched_text)
os.chmod(RUN_SCRIPT, 0o755)

if venv_removed:
    print("Virtualenv block removed from run_segmentation.sh")
else:
    print("No virtualenv block found (already removed)")

In [None]:
#@title Run TED consensus on TED_INPUT_DIR

import os, pathlib

# Paths are rebuilt here so this cell does not depend on variables from earlier cells.
BASE_DIR        = pathlib.Path("/content").resolve()
WORK_DIR        = BASE_DIR / "ted_workshop"
TED_CONS_DIR    = WORK_DIR / "ted-tools" / "ted_consensus_1.0"
TED_INPUT_DIR   = WORK_DIR / "ted_input"
TED_RESULTS_DIR = WORK_DIR / "ted_results"

# Raises FileNotFoundError if the ted-tools checkout is missing.
os.chdir(TED_CONS_DIR)

# Path for the Colab-specific wrapper script
COLAB_RUN = TED_CONS_DIR / "run_segmentation_colab.sh"

# Full text of the wrapper, written next to the upstream scripts so that its
# SCRIPT_DIR lookup resolves scripts/segment.sh and the consensus helpers.
# It is a raw string so that Python leaves the bash "$" and backslash syntax alone.
# Pipeline implemented by the script:
#   1. segment.sh -m merizo            -> chopping_merizo.txt
#   2. segment.sh -m unidoc (given the Merizo chopping via -c) -> chopping_unidoc.txt
#   3. segment.sh -m chainsaw          -> chopping_chainsaw.txt
#   4. get_consensus.py, then filter_domains_consensus.py -> consensus.tsv
# Each step writes its own .log file and the script exits with status 1 as soon
# as an expected chopping/consensus file is missing or empty.
script_content = r"""#!/bin/bash

# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file,
# please cite the following paper:
# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.

# Function to display usage message
usage() {
    echo "Usage: $0 -i <input_directory_with_pdb_files> -o <output_directory>"
    exit 1
}

# [Colab] Using global Python environment instead of 'ted_consensus' virtualenv.

# Parse command-line arguments
while getopts "i:o:" opt; do
    case $opt in
        i) INPUT_DIR="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        *) usage ;;
    esac
done

# Check if both input and output directories are provided
if [ -z "$INPUT_DIR" ] || [ -z "$OUTPUT_DIR" ]; then
    usage
fi

# Check if the input directory exists
if [ ! -d "$INPUT_DIR" ]; then
    echo "Error: $INPUT_DIR is not a directory"
    exit 1
fi

# Create the output directory if it doesn't exist
if [ ! -d "$OUTPUT_DIR" ]; then
    mkdir -p "$OUTPUT_DIR"
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
PY=$(which python)

SEGMENT="${SCRIPT_DIR}/scripts/segment.sh"
CONSENSUS="${SCRIPT_DIR}/scripts/get_consensus.py"
FILTER_DOMAINS="${SCRIPT_DIR}/scripts/filter_domains_consensus.py"

# Run Merizo on the input directory
out_merizo="${OUTPUT_DIR}/chopping_merizo.txt"
log_merizo="${OUTPUT_DIR}/chopping_merizo.log"
bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1

if test ! -f "${out_merizo}" || test ! -s "${out_merizo}"; then
    echo "Expected to find chopping file for Merizo at ${out_merizo}!"
    exit 1
fi

# Run UniDoc on the Merizo output
out_unidoc="${OUTPUT_DIR}/chopping_unidoc.txt"
log_unidoc="${OUTPUT_DIR}/chopping_unidoc.log"
bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1

if test ! -f "${out_unidoc}" || test ! -s "${out_unidoc}"; then
    echo "Expected to find chopping file for UniDoc at ${out_unidoc}!"
    exit 1
fi

# Run Chainsaw on the input directory
out_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.txt"
log_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.log"
bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1

if test ! -f "${out_chainsaw}" || test ! -s "${out_chainsaw}"; then
    echo "Expected to find chopping file for Chainsaw at ${out_chainsaw}!"
    exit 1
fi

echo "Calculating consensus domains from Merizo, UniDoc and Chainsaw outputs.. "

# Calculate consensus from each of the outputs
out_consensus="${OUTPUT_DIR}/consensus.tsv"
log_consensus="${OUTPUT_DIR}/consensus.log"
"${PY}" "${CONSENSUS}" -c "${out_merizo}" "${out_chainsaw}" "${out_unidoc}" -o "${out_consensus}" > "${log_consensus}" 2>&1

if test -f "${out_consensus}"; then
    "${PY}" "${FILTER_DOMAINS}" "${out_consensus}" -o "${out_consensus}.tmp"

    if [ $? == 0 ]; then
        mv "${out_consensus}.tmp" "${out_consensus}"
    fi
else
    echo "Expected to find consensus domain file at ${out_consensus}"
    exit 1
fi

echo "Consensus domain file saved at ${out_consensus}"
"""

# Write Colab-specific script (overwritten on every run of this cell) and make
# it executable.
COLAB_RUN.write_text(script_content)
os.chmod(COLAB_RUN, 0o755)

# NOTE: the first label reads "Environment:" but the value shown is the path of
# the wrapper script that is about to be executed.
print("Environment:", COLAB_RUN)
print("Input dir:   ", TED_INPUT_DIR)
print("Output dir:  ", TED_RESULTS_DIR, "\n")

# The paths are single-quoted inside cmd because the whole command is passed,
# double-quoted, to `bash -lc` below. This relies on the paths containing no
# quote characters.
cmd = f"bash '{COLAB_RUN}' -i '{TED_INPUT_DIR}' -o '{TED_RESULTS_DIR}'"
print("Running TED consensus...\n")
print(cmd, "\n")

!bash -lc "{cmd}"

print("\nFiles in TED_RESULTS_DIR:")
!ls -1 "{TED_RESULTS_DIR}"

In [None]:
# Print the raw Chainsaw chopping file written by the TED consensus run.
!cat /content/ted_workshop/ted_results/chopping_chainsaw.txt

In [None]:
#@title Explore TED AFDB domains

import os, pathlib
import pandas as pd
import ipywidgets as widgets
import py3Dmol
from IPython.display import display

# Paths are rebuilt here so this cell does not depend on variables from earlier cells.
BASE_DIR        = pathlib.Path("/content").resolve()
WORK_DIR        = BASE_DIR / "ted_workshop"
TED_INPUT_DIR   = WORK_DIR / "ted_input"
TED_RESULTS_DIR = WORK_DIR / "ted_results"

# Output files of the TED consensus run. Only consensus.tsv is mandatory; the
# per-method chopping files are optional for the viewer.
consensus_path  = TED_RESULTS_DIR / "consensus.tsv"
merizo_path     = TED_RESULTS_DIR / "chopping_merizo.txt"
unidoc_path     = TED_RESULTS_DIR / "chopping_unidoc.txt"
chainsaw_path   = TED_RESULTS_DIR / "chopping_chainsaw.txt"

# --- Load consensus -------------------------------------------------------
if not consensus_path.exists():
    raise FileNotFoundError(f"Cannot find consensus.tsv at {consensus_path}")

# The file is read as header-less TSV; these names are assigned positionally.
# NOTE(review): the nine-column layout is assumed here — confirm against the
# output of get_consensus.py / filter_domains_consensus.py.
cons_cols = [
    "target_id", "md5", "nres", "n_high", "n_med", "n_low",
    "high_domains", "med_domains", "low_domains",
]
cons_df = pd.read_csv(consensus_path, sep="\t", header=None, names=cons_cols)

# AFDB-only: keep rows whose target id starts with "AF-" and renumber the index.
cons_df = cons_df[cons_df["target_id"].str.startswith("AF-")].reset_index(drop=True)
if cons_df.empty:
    raise RuntimeError("consensus.tsv has no AF-* entries; rerun TED on AFDB-only inputs.")

# --- chopping helpers -----------------------------------------------------
def load_chopping(path):
    """
    Read a chopping_* file into a ``{target_id: domain_string}`` dict.

    Each usable line is tab-separated; the first field is taken as the target
    id and the SECOND-TO-LAST field as the domain string:

      target_id  ...  domain_string  score

    Blank lines, lines starting with "#", and lines with fewer than three
    fields are ignored. When a target id occurs more than once the last
    occurrence wins. A missing file yields an empty dict.
    """
    chopping_by_target = {}
    if not path.exists():
        return chopping_by_target

    with open(path) as handle:
        for raw in handle:
            record = raw.rstrip("\n")
            # Skip blank lines and comment lines.
            if record == "" or record[0] == "#":
                continue
            fields = record.split("\t")
            if len(fields) >= 3:
                chopping_by_target[fields[0]] = fields[-2]
    return chopping_by_target

# Per-method lookup tables (target id -> domain string). load_chopping() returns
# an empty dict for a missing file, so each of these may be empty.
merizo_chop   = load_chopping(merizo_path)
unidoc_chop   = load_chopping(unidoc_path)
chainsaw_chop = load_chopping(chainsaw_path)

def parse_domain_string(dom_str):
    """
    Return a list of domains; each domain is a list of (start, end) segments.

    Domains are separated by "," and the segments of a discontinuous domain by
    "_"; each segment is written "start-end".

    Example:
      '249-353_596-636,700-800'
      -> [ [(249,353),(596,636)], [(700,800)] ]

    An empty list is returned for "na", for an empty string and for any
    non-string value (e.g. None, or the float NaN that pandas produces for an
    empty cell). Malformed segments (not exactly two "-"-separated integers)
    are skipped, and a domain left with no valid segment is dropped.
    """
    domains = []
    # Guard against non-string input before calling str methods on it.
    if not isinstance(dom_str, str) or not dom_str or dom_str == "na":
        return domains

    for dom in dom_str.split(","):
        segs = []
        for seg in dom.split("_"):
            bounds = seg.split("-")
            # Exactly one "-" is required; anything else is malformed and is
            # skipped instead of raising on tuple unpacking.
            if len(bounds) != 2:
                continue
            try:
                segs.append((int(bounds[0]), int(bounds[1])))
            except ValueError:
                continue
        if segs:
            domains.append(segs)
    return domains

def extract_uniprot_from_afid(target_id: str):
    """Return the second "-"-separated field of an AFDB target id, or None.

    Example: AF-P00533-F1-model_v6 -> P00533. None is returned when the id is
    not a string or contains no "-".
    """
    if not isinstance(target_id, str):
        return None
    fields = target_id.split("-")
    if len(fields) < 2:
        return None
    return fields[1]

def get_domain_string(target_id, source, row):
    """Return ``(domain_string, display_label)`` for one target and one source.

    Consensus sources read the matching column of *row* (the target's
    consensus record); per-method sources look *target_id* up in the
    corresponding chopping table and fall back to "na" when it is absent.
    An unrecognised *source* yields ``("na", source)``.
    """
    # Consensus levels: selector value -> (column in row, label).
    for selector, column, label in (
        ("consensus_high", "high_domains", "Consensus HIGH"),
        ("consensus_med", "med_domains", "Consensus MEDIUM"),
        ("consensus_low", "low_domains", "Consensus LOW"),
    ):
        if source == selector:
            return row[column], label

    # Individual segmentation methods.
    if source == "merizo":
        return merizo_chop.get(target_id, "na"), "Merizo"
    if source == "unidoc":
        return unidoc_chop.get(target_id, "na"), "UniDoc"
    if source == "chainsaw":
        return chainsaw_chop.get(target_id, "na"), "Chainsaw"

    return "na", source

# Colour names used to paint domains in the 3D viewer. update_view() indexes
# this list modulo its length, so colours repeat after the twelfth domain.
DOMAIN_COLORS = [
    "red", "orange", "yellow", "green",
    "cyan", "blue", "magenta", "salmon",
    "lime", "violet", "gold", "deepskyblue",
]

# --- Widgets --------------------------------------------------------------

# Target selector: one entry per AF-* row of the consensus table, sorted by id.
target_dropdown = widgets.Dropdown(
    options=sorted(cons_df["target_id"].tolist()),
    description="AF target:",
    layout=widgets.Layout(width="70%"),
)

# Segmentation selector: (label shown, value) pairs. The values are the source
# keys understood by get_domain_string().
source_dropdown = widgets.Dropdown(
    options=[
        ("Consensus (high)", "consensus_high"),
        ("Consensus (medium)", "consensus_med"),
        ("Consensus (low)", "consensus_low"),
        ("Merizo", "merizo"),
        ("UniDoc", "unidoc"),
        ("Chainsaw", "chainsaw"),
    ],
    value="consensus_high",
    description="Domains:",
    layout=widgets.Layout(width="70%"),
)

# Controls are stacked vertically; `out` receives the text summary and 3D view.
ui  = widgets.VBox([target_dropdown, source_dropdown])
out = widgets.Output()

def update_view(change=None):
    """Redraw the summary text and 3D view for the current dropdown selection.

    *change* is the (unused) event payload passed by widget observers; the
    function can also be called without arguments. All output goes into the
    ``out`` widget, which is cleared first. If the structure file for the
    selected target is missing, a message is printed and nothing is drawn.
    """
    out.clear_output(wait=True)
    with out:
        selected_target = target_dropdown.value
        selected_source = source_dropdown.value

        # Consensus record of the selected target (first matching row).
        matching_rows = cons_df[cons_df["target_id"] == selected_target]
        consensus_row = matching_rows.iloc[0]
        chopping, method_label = get_domain_string(
            selected_target, selected_source, consensus_row
        )
        parsed_domains = parse_domain_string(chopping)
        accession = extract_uniprot_from_afid(selected_target)

        structure_file = TED_INPUT_DIR / f"{selected_target}.pdb"
        if not structure_file.exists():
            print(f"PDB file not found: {structure_file}")
            return

        with open(structure_file) as handle:
            pdb_text = handle.read()

        # --- Text summary ----------------------------------------------------
        print(f"Target: {selected_target}")
        if accession:
            print(f"   UniProt: {accession} (https://alphafold.ebi.ac.uk/entry/{accession})")
        print(f"   Segmentation: {method_label}")
        print(f"   Domain string: {chopping}")

        if not parsed_domains:
            print("   Domains: (none for this method)")
        else:
            print("   Domains (discontinuous segments grouped):")
            for number, segments in enumerate(parsed_domains, 1):
                joined = "_".join(f"{first}-{last}" for first, last in segments)
                print(f"     D{number}: {joined}")

        # --- 3D view ---------------------------------------------------------
        # Whole model in light grey first, then each domain painted on top.
        viewer = py3Dmol.view(width=640, height=480)
        viewer.addModel(pdb_text, "pdb")
        viewer.setStyle({"cartoon": {"color": "lightgrey"}})

        palette_size = len(DOMAIN_COLORS)
        for index, segments in enumerate(parsed_domains):
            domain_color = DOMAIN_COLORS[index % palette_size]
            for first, last in segments:
                # Residue range is inclusive of both ends.
                residues = list(range(first, last + 1))
                viewer.addStyle({"resi": residues}, {"cartoon": {"color": domain_color}})

        viewer.zoomTo()
        viewer.show()

# Redraw whenever either dropdown's selected value changes.
target_dropdown.observe(update_view, names="value")
source_dropdown.observe(update_view, names="value")

# Show the controls and output area, then draw once for the initial selection.
display(ui, out)
update_view()