
# Proteomics Pipeline: Brain Metastasis Cohorts (PXD005719, PXD046330, PXD051579)

Batch Colab workflow that reuses shared tooling (msconvert, MSFragger, Philosopher) to generate log2 protein-abundance matrices and sample metadata for the brain-metastasis-related datasets.


In [None]:

# === Master Configuration ===
# Every path below is derived from PROJECT_DIR, which lives under the Drive
# mount point used by the setup cell ('/content/drive').
PROJECT_DIR = "/content/drive/MyDrive/NetsAnalysisProject"
# Dataset identifiers iterated by the setup, batch and summary cells; each one
# gets its own sub-directory under the raw, mzML and output roots.
DATASETS = [
    "PXD005719",  # membrane/BBB associated proteome
    "PXD046330",  # HER2+ conditioned media / secretome
    "PXD051579",  # BBB dysfunction time-course
]
# Inputs: per-dataset RAW files and the reference FASTA.
RAW_PROTEO_DIR = f"{PROJECT_DIR}/data/raw/proteomics"
REFERENCE_DIR = f"{PROJECT_DIR}/data/raw/reference"
# Outputs: per-dataset result tables, with converted mzML files kept beneath the same root.
OUTPUT_ROOT = f"{PROJECT_DIR}/data/processed/proteomics"
MZML_ROOT = f"{OUTPUT_ROOT}/mzml"
# Scratch area holding the per-dataset search parameters and Philosopher workspace.
WORK_ROOT = f"{PROJECT_DIR}/tmp/brain_proteomics"
# External tool locations; their presence is verified by the toolchain-check cell.
TOOLS_DIR = f"{PROJECT_DIR}/tools"
MSCONVERT_BIN = f"{TOOLS_DIR}/pwiz/msconvert"
MSFRAGGER_JAR = f"{TOOLS_DIR}/msfragger/MSFragger-3.7.jar"
PHILOSOPHER_BIN = f"{TOOLS_DIR}/philosopher/philosopher"
# Reference proteome: the gzip archive and the decompressed copy that
# ensure_fasta() creates from it on first use.
FASTA_GZ = f"{REFERENCE_DIR}/UP000005640_9606.fasta.gz"
FASTA_PATH = f"{REFERENCE_DIR}/UP000005640_9606.fasta"


In [None]:

# Mount Drive and ensure directories
# The google.colab import only succeeds inside Colab; anywhere else the
# ModuleNotFoundError branch prints a reminder and the cell carries on.
try:
    import google.colab  # type: ignore
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')
except ModuleNotFoundError:
    print('[info] Running outside Colab; ensure PROJECT_DIR is accessible.')

from pathlib import Path

# Create the per-dataset input, output and mzML directories up front so later
# cells can read from and write to them without checking.
for dataset in DATASETS:
    Path(f"{RAW_PROTEO_DIR}/{dataset}").mkdir(parents=True, exist_ok=True)
    Path(f"{OUTPUT_ROOT}/{dataset}").mkdir(parents=True, exist_ok=True)
    Path(f"{MZML_ROOT}/{dataset}").mkdir(parents=True, exist_ok=True)
# Shared scratch and tool directories (not dataset specific).
Path(WORK_ROOT).mkdir(parents=True, exist_ok=True)
Path(TOOLS_DIR).mkdir(parents=True, exist_ok=True)

print(f"[setup] Datasets: {DATASETS}")


In [None]:

# Ensure toolchain is in place (assumes proteo_net_reference.ipynb has run once)
# This cell only checks that the three tool paths exist; it installs nothing
# and does not stop the notebook when something is missing.
from pathlib import Path
import subprocess

# Human-readable names of the tools whose configured path does not exist.
missing = []
if not Path(MSCONVERT_BIN).exists():
    missing.append('ProteoWizard msconvert')
if not Path(MSFRAGGER_JAR).exists():
    missing.append('MSFragger jar')
if not Path(PHILOSOPHER_BIN).exists():
    missing.append('Philosopher binary')
if missing:
    # Warning only: later cells will fail when they try to launch a missing tool.
    print('[warn] Missing tools:', ', '.join(missing))
    print('Run colab/proteo_net_reference.ipynb tool-install cell first or re-run the install block here.')
else:
    print('[check] All tools detected.')


In [None]:

# Shared helper functions
import subprocess
import gzip
import shutil
import pandas as pd
import numpy as np
import time
import requests
from pathlib import Path
from typing import List


def run_cmd(cmd: List[str], cwd: Path | None = None):
    print('[cmd]', ' '.join(cmd))
    subprocess.run(cmd, check=True, cwd=str(cwd) if cwd else None)


def ensure_fasta() -> Path:
    """Return the path of the uncompressed reference FASTA, creating it if needed.

    If ``FASTA_PATH`` already exists it is returned untouched. Otherwise the
    archive at ``FASTA_GZ`` is decompressed into it.

    Returns:
        Path to the uncompressed FASTA file.

    Raises:
        FileNotFoundError: If neither the FASTA nor its gzip archive exists.
    """
    fasta_path = Path(FASTA_PATH)
    if fasta_path.exists():
        return fasta_path
    gz_path = Path(FASTA_GZ)
    if not gz_path.exists():
        raise FileNotFoundError(f"FASTA gz not found: {gz_path}")

    # Decompress into a sibling temporary file and rename it into place only
    # once the copy has completed. Writing straight to FASTA_PATH would leave a
    # truncated file behind after an interruption, and the exists() check
    # above would then accept that truncated database on every later run.
    partial_path = fasta_path.with_name(fasta_path.name + '.partial')
    try:
        # Binary mode copies the bytes verbatim, with no decode/encode round
        # trip through the locale encoding and no newline translation.
        with gzip.open(gz_path, 'rb') as fin, open(partial_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
        partial_path.replace(fasta_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)
    print(f"[fasta] Decompressed {gz_path} -> {fasta_path}")
    return fasta_path


def convert_raw_to_mzml(dataset: str):
    """Convert the RAW files of *dataset* to mzML with msconvert.

    Inputs are read from ``RAW_PROTEO_DIR/<dataset>`` and outputs are written
    to ``MZML_ROOT/<dataset>``. A RAW file whose ``<stem>.mzML`` already exists
    is skipped, so the function can be re-run after a partial conversion.

    Args:
        dataset: Dataset identifier used as the sub-directory name.

    Raises:
        FileNotFoundError: If the dataset directory holds no RAW files.
        subprocess.CalledProcessError: If msconvert exits with an error
            (propagated from ``run_cmd``).
    """
    raw_dir = Path(RAW_PROTEO_DIR) / dataset
    mzml_dir = Path(MZML_ROOT) / dataset
    mzml_dir.mkdir(parents=True, exist_ok=True)
    # Compare the extension case-insensitively: Path.glob('*.raw') is
    # case-sensitive on POSIX file systems and would silently skip files that
    # were deposited as '.RAW'.
    raw_files = sorted(
        entry for entry in raw_dir.glob('*') if entry.suffix.lower() == '.raw'
    )
    if not raw_files:
        raise FileNotFoundError(f"No RAW files found for {dataset}")
    for raw in raw_files:
        out_file = mzml_dir / (raw.stem + '.mzML')
        if out_file.exists():
            print(f"[skip] {dataset}: {out_file.name} already present")
            continue
        run_cmd([
            MSCONVERT_BIN,
            str(raw),
            '--mzML',
            '--filter', 'peakPicking true 1-',
            '--outfile', out_file.name,
            '--outdir', str(mzml_dir),
        ])
    # The count includes files skipped because their mzML already existed.
    print(f"[msconvert] {dataset}: {len(raw_files)} files processed")


def write_msfragger_params(dataset: str, fasta_path: Path) -> Path:
    """Write the search parameter file for *dataset* and return its path.

    The file is created as ``WORK_ROOT/<dataset>/msfragger.params`` (the
    directory is created when missing) and overwritten on every call. Apart
    from the database path, every setting is a fixed value.

    Args:
        dataset: Dataset identifier used as the sub-directory name.
        fasta_path: Protein database path substituted into the template.

    Returns:
        Path of the parameter file that was written.
    """
    work_dir = Path(WORK_ROOT, dataset)
    work_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the keys and unit spellings below are emitted verbatim and
    # have not been checked against the MSFragger 3.7 parameter reference --
    # confirm them before relying on the search settings.
    contents = f'''num_threads = 16
precursor_true_tolerance = 15
enzyme = trypsin
isotope_error = 0/1/2
allowed_missed_cleavage = 2
fragment_bin_tol = 0.02
precursor_mass_lower = -20
precursor_mass_upper = 20
precursor_mass_units = ppm
fragment_mass_units = Dalton
search_enzyme_name = trypsin
protein_database = {fasta_path}
add_Cterm_peptide = 0.0
add_Nterm_peptide = 0.0
add_G_glycine = 0.0
add_C_cysteine = 57.021464
variable_mod_01 = 15.994915 M 100.0
variable_mod_02 = 15.994915 P 10.0
variable_mod_03 = 0.984016 NQ 1.0
localize_delta_mass = 1
replace_missing = 1
zero_bin_accept_expect = 0
remove_precursor_peak = 1
isotope_error_correction = 1
calculate_pq_peptide_probability = 1
'''

    destination = work_dir / 'msfragger.params'
    destination.write_text(contents)
    return destination


def run_msfragger(dataset: str, params_path: Path):
    """Run an MSFragger search over every mzML file of *dataset*.

    The search runs with the directory of *params_path* as working directory
    and a fixed 24 GB Java heap (``-Xmx24G``). Afterwards any pepXML results
    found beside the input spectra are moved into that working directory.

    Args:
        dataset: Dataset identifier; spectra are read from
            ``MZML_ROOT/<dataset>``.
        params_path: Parameter file passed to MSFragger.

    Raises:
        FileNotFoundError: If the dataset has no mzML files.
        subprocess.CalledProcessError: If the search exits with an error
            (propagated from ``run_cmd``).
    """
    mzml_dir = Path(MZML_ROOT) / dataset
    mzml_files = sorted(mzml_dir.glob('*.mzML'))
    if not mzml_files:
        raise FileNotFoundError(f'No mzML files for {dataset}')
    work_dir = params_path.parent
    cmd = ['java', '-Xmx24G', '-jar', MSFRAGGER_JAR, str(params_path)] + [str(f) for f in mzml_files]
    run_cmd(cmd, cwd=work_dir)

    # MSFragger places each search result next to its input spectrum file
    # rather than in the working directory, while the downstream step looks
    # for pepXML files in the working directory. Relocate them so the two
    # steps agree; this is a no-op when nothing was written beside the spectra.
    if mzml_dir.resolve() != work_dir.resolve():
        for pattern in ('*.pepXML', '*.pep.xml'):
            for produced in sorted(mzml_dir.glob(pattern)):
                shutil.move(str(produced), str(work_dir / produced.name))


def run_philosopher(dataset: str, fasta_path: Path) -> Path:
    """Run the Philosopher validation, quantification and report steps.

    Every command runs inside ``WORK_ROOT/<dataset>/philosopher``. The steps
    are: workspace init, database annotation, PeptideProphet per search
    result, ProteinProphet, filter, label-free quantification and report.

    Args:
        dataset: Dataset identifier; search results are read from
            ``WORK_ROOT/<dataset>``.
        fasta_path: Protein database handed to ``database --annotate``.

    Returns:
        The Philosopher workspace directory.

    Raises:
        FileNotFoundError: If no search results or no PeptideProphet outputs
            are found.
        subprocess.CalledProcessError: If any Philosopher command fails
            (propagated from ``run_cmd``).
    """
    work_dir = Path(WORK_ROOT) / dataset
    workspace = work_dir / 'philosopher'
    workspace.mkdir(parents=True, exist_ok=True)
    run_cmd([PHILOSOPHER_BIN, 'workspace', '--init'], cwd=workspace)
    run_cmd([PHILOSOPHER_BIN, 'database', '--annotate', str(fasta_path)], cwd=workspace)

    def _search_results(pattern: str) -> List[Path]:
        # Skip 'interact-*' files: they are PeptideProphet outputs of an
        # earlier run and also match '*.pep.xml', so without this filter a
        # re-run could feed them back in as search results.
        return sorted(
            found for found in work_dir.glob(pattern)
            if not found.name.startswith('interact-')
        )

    pepxml_files = _search_results('*.pepXML') or _search_results('*.pep.xml')
    if not pepxml_files:
        raise FileNotFoundError(f'No pepXML files for {dataset}')

    for pepxml in pepxml_files:
        run_cmd([PHILOSOPHER_BIN, 'peptideprophet', '--ppm', '--nonparam', '--expectscore', str(pepxml)], cwd=workspace)

    # NOTE(review): PeptideProphet may write its interact files beside the
    # input pepXML rather than in the workspace -- TODO confirm. Both places
    # are checked so either layout works.
    interact_files = (
        sorted(workspace.glob('interact-*.pep.xml'))
        or sorted(work_dir.glob('interact-*.pep.xml'))
    )
    if not interact_files:
        # Without this guard ProteinProphet and label-free would be launched
        # with no input files and fail with a much less helpful message.
        raise FileNotFoundError(f'No PeptideProphet output (interact-*.pep.xml) for {dataset}')

    interact_args = [str(f) for f in interact_files]
    run_cmd([PHILOSOPHER_BIN, 'proteinprophet'] + interact_args, cwd=workspace)
    run_cmd([PHILOSOPHER_BIN, 'filter', '--sequential', '--picked', '--peptide', '--prot'], cwd=workspace)
    run_cmd([PHILOSOPHER_BIN, 'label-free'] + interact_args, cwd=workspace)
    run_cmd([PHILOSOPHER_BIN, 'report', '--msstats'], cwd=workspace)
    return workspace


def load_protein_table(workspace: Path) -> pd.DataFrame:
    """Load the protein-level report found in *workspace*.

    A file named exactly ``protein.tsv`` is preferred. If there is none, the
    alphabetically first ``*.protein.tsv`` file is used.

    Args:
        workspace: Directory that holds the report.

    Returns:
        The tab-separated report parsed into a DataFrame.

    Raises:
        FileNotFoundError: If the workspace contains no protein report.
    """
    # The pattern '*.protein.tsv' needs a literal dot before 'protein', so it
    # can never match a report written as plain 'protein.tsv'; check that
    # name explicitly first.
    default_report = workspace / 'protein.tsv'
    if default_report.is_file():
        return pd.read_csv(default_report, sep='\t')
    # Sort so the choice is deterministic when several prefixed reports exist.
    candidates = sorted(workspace.glob('*.protein.tsv'))
    if not candidates:
        raise FileNotFoundError('Protein report missing')
    return pd.read_csv(candidates[0], sep='\t')


def uniprot_to_gene(uniprot_ids: List[str]) -> pd.DataFrame:
    """Map UniProt accessions to gene names through the UniProt ID-mapping API.

    Duplicates, empty strings and non-string entries are dropped before the
    request. When several gene names are returned in one ``;``-separated
    value, only the first is kept.

    Args:
        uniprot_ids: Accessions to map.

    Returns:
        DataFrame with columns ``UniProt`` and ``Gene``. It is empty, but
        still has both columns, when nothing could be mapped.

    Raises:
        RuntimeError: If the mapping job reports a status other than
            NEW, RUNNING or FINISHED.
        TimeoutError: If the job does not finish within the polling budget.
        requests.HTTPError: If any request returns an HTTP error status.
    """
    columns = ['UniProt', 'Gene']
    # Sorted so the submitted payload is the same from run to run.
    ids = sorted({uid for uid in uniprot_ids if isinstance(uid, str) and uid})
    if not ids:
        return pd.DataFrame(columns=columns)

    request_timeout = 60   # seconds allowed per HTTP request
    poll_interval = 3      # seconds between status checks
    max_wait = 600         # total seconds to wait for the job

    submit = requests.post(
        'https://rest.uniprot.org/idmapping/run',
        data={'from': 'UniProtKB_AC-ID', 'to': 'Gene_Name', 'ids': ','.join(ids)},
        timeout=request_timeout,
    )
    submit.raise_for_status()
    job_id = submit.json()['jobId']
    status_url = f'https://rest.uniprot.org/idmapping/status/{job_id}'
    result_url = f'https://rest.uniprot.org/idmapping/stream/{job_id}'

    deadline = time.monotonic() + max_wait
    while True:
        poll = requests.get(status_url, timeout=request_timeout)
        poll.raise_for_status()
        job_status = poll.json().get('jobStatus')
        # The status endpoint redirects to the results once the job is done,
        # and requests follows that redirect, so the payload may carry no
        # 'jobStatus' at all. Waiting only for 'FINISHED' would then never end.
        if job_status is None or job_status == 'FINISHED':
            break
        if job_status not in ('NEW', 'RUNNING'):
            raise RuntimeError('UniProt mapping failed')
        if time.monotonic() >= deadline:
            raise TimeoutError(f'UniProt mapping job {job_id} did not finish within {max_wait}s')
        time.sleep(poll_interval)

    results = requests.get(result_url, timeout=request_timeout)
    results.raise_for_status()
    rows = []
    for entry in results.json().get('results', []):
        gene = entry.get('to')
        if isinstance(gene, str) and gene:
            rows.append({'UniProt': entry['from'], 'Gene': gene.split(';')[0]})
    # Pass the columns explicitly so an empty result keeps the expected schema.
    return pd.DataFrame(rows, columns=columns).drop_duplicates()


def build_matrix(dataset: str, df: pd.DataFrame) -> pd.DataFrame:
    """Build and save the log2 intensity matrix of *dataset*, indexed by gene.

    Two files are written to ``OUTPUT_ROOT/<dataset>``:
    ``protein_abundance.tsv`` (the matrix) and ``uniprot_to_hgnc.tsv`` (the
    accession-to-gene table used for the index).

    Args:
        dataset: Dataset identifier used as the output sub-directory name.
        df: Protein report with a ``Protein`` column and at least one column
            whose name contains ``Intensity``.

    Returns:
        DataFrame of log2 intensities (zeros become NaN), one row per gene
        label and one column per intensity column of the report.

    Raises:
        ValueError: If the report has no intensity columns.
    """
    output_dir = Path(OUTPUT_ROOT) / dataset
    output_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): every column whose name contains 'Intensity' is treated as
    # a quantification column -- verify this against the report layout.
    intensity_cols = [c for c in df.columns if isinstance(c, str) and 'Intensity' in c]
    if not intensity_cols:
        raise ValueError(f'No intensity columns detected in protein report for {dataset}')
    df = df.copy()

    first_entry = df['Protein'].str.split(';').str[0]
    # Reduce UniProt FASTA-style identifiers ('sp|P04637|P53_HUMAN') to the
    # bare accession, which is what the ID-mapping service expects. Values
    # that are already plain accessions are left unchanged.
    df['PrimaryAcc'] = first_entry.str.replace(
        r'^(?:sp|tr)\|([^|]+)\|.*$', r'\1', regex=True
    )

    mapping = uniprot_to_gene(df['PrimaryAcc'].tolist())
    if {'UniProt', 'Gene'}.issubset(mapping.columns):
        # Use a one-gene-per-accession lookup rather than a merge. A merge
        # multiplies rows when an accession maps to several genes, and it
        # produces 'Gene_x'/'Gene_y' when the report already has a 'Gene' column.
        lookup = mapping.drop_duplicates(subset=['UniProt']).set_index('UniProt')['Gene']
        mapped_gene = df['PrimaryAcc'].map(lookup)
    else:
        # A mapping frame without the expected columns means nothing was mapped.
        mapping = pd.DataFrame(columns=['UniProt', 'Gene'])
        mapped_gene = pd.Series(np.nan, index=df.index, dtype=object)
    # Fall back to the accession when no gene name is known.
    df['Gene'] = mapped_gene.fillna(df['PrimaryAcc'])

    # Keep the first row per gene label so the index is unique.
    df = df.drop_duplicates(subset=['Gene']).set_index('Gene')
    # Zero means "not quantified"; turn it into NaN instead of log2(0) = -inf.
    matrix = np.log2(df[intensity_cols].replace(0, np.nan))
    matrix_path = output_dir / 'protein_abundance.tsv'
    matrix.to_csv(matrix_path, sep='\t')

    map_path = output_dir / 'uniprot_to_hgnc.tsv'
    if not mapping.empty:
        mapping.to_csv(map_path, sep='\t', index=False)
    else:
        # Nothing was mapped: record the accession-as-gene fallback instead.
        df[['PrimaryAcc']].rename(columns={'PrimaryAcc': 'UniProt'}).assign(Gene=df.index).to_csv(map_path, sep='\t', index=False)
    return matrix


def derive_metadata(dataset: str, matrix: pd.DataFrame) -> pd.DataFrame:
    """Derive per-sample annotations from the matrix column names and save them.

    Each column name has ``.raw`` and ``.mzML`` removed and is split on ``_``.
    The tokens are compared case-insensitively, and for each field the last
    matching token wins:

    * treatment: ``control``, ``treatment`` or ``treated`` (default ``NA``)
    * organ: ``p``/``parental`` gives ``Parental``; ``br``/``brain`` gives
      ``Brain``, which is also the default
    * replicate: any token starting with ``rep`` (default ``NA``)

    The table is written to ``OUTPUT_ROOT/<dataset>/metadata.tsv``.

    Args:
        dataset: Dataset identifier used as the output sub-directory name.
        matrix: Abundance matrix whose columns are the sample names.

    Returns:
        DataFrame with columns ``sample_id``, ``treatment``, ``organ`` and
        ``replicate``, one row per matrix column.
    """
    treatment_words = {'control', 'treatment', 'treated'}
    brain_words = {'br', 'brain'}
    parental_words = {'p', 'parental'}

    def _annotate(sample_name):
        # Start from the defaults and let matching tokens overwrite them.
        row = {'sample_id': sample_name, 'treatment': 'NA', 'organ': 'Brain', 'replicate': 'NA'}
        stripped = sample_name.replace('.raw', '').replace('.mzML', '')
        for piece in stripped.split('_'):
            lowered = piece.lower()
            if lowered in treatment_words:
                row['treatment'] = lowered
            if lowered in brain_words:
                row['organ'] = 'Brain'
            if lowered in parental_words:
                row['organ'] = 'Parental'
            if lowered.startswith('rep'):
                row['replicate'] = lowered
        return row

    meta = pd.DataFrame([_annotate(name) for name in matrix.columns])
    destination = Path(OUTPUT_ROOT) / dataset / 'metadata.tsv'
    meta.to_csv(destination, sep='\t', index=False)
    return meta


### Batch execution across datasets

In [None]:

from pathlib import Path

# The reference FASTA is prepared once and shared by every dataset.
fasta_path = ensure_fasta()
# One row per dataset, collected for the overview table displayed below.
summary = []

# Run the full chain per dataset: RAW -> mzML -> search -> Philosopher ->
# protein table -> log2 matrix -> metadata. Any failure raises and stops the
# loop, so later datasets are not processed.
for dataset in DATASETS:
    print(f"==== Processing {dataset} ====")
    convert_raw_to_mzml(dataset)
    params = write_msfragger_params(dataset, fasta_path)
    run_msfragger(dataset, params)
    workspace = run_philosopher(dataset, fasta_path)
    protein_df = load_protein_table(workspace)
    matrix = build_matrix(dataset, protein_df)
    # Called for its side effect (metadata.tsv); `meta` is not used further in this cell.
    meta = derive_metadata(dataset, matrix)
    summary.append({
        'dataset': dataset,
        'proteins': matrix.shape[0],
        'samples': matrix.shape[1]
    })

summary_df = pd.DataFrame(summary)
# Bare expression as the last statement, so the notebook renders the table.
summary_df


In [None]:

# List the expected output files per dataset. The paths are only formatted
# here; nothing checks that the files actually exist.
print('Outputs:')
for dataset in DATASETS:
    out_dir = Path(OUTPUT_ROOT) / dataset
    print(f"- {dataset}: {out_dir}/protein_abundance.tsv, {out_dir}/metadata.tsv")
