<a href="https://colab.research.google.com/github/yc386/orthrus_metaproteomics/blob/main/orthrus_v110/orthrus_v110_pt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Orthrus pt2- [`Sage`](https://github.com/lazear/sage)**

Please note: switch to a **TPU** runtime if RAM usage is expected to be high (e.g. because of large files, many PTMs, or large databases).

In [4]:
#@title Add inputs for `SAGE` -> click `Runtime` -> `Run all`

#@markdown **Parameters for `SAGE`**
peak_folder = "" #@param {type:"string"}
#@markdown - folder holding the instrument files and their `<name>_matched.fasta` files
file_type="mzML" #@param ["mzML", "mgf"]
#@markdown - use the drop-down menu to choose the instrument file type
# NOTE(review): the default below lives under /content/drive, but no cell shown
# in this notebook mounts Google Drive -- confirm it is mounted beforehand.
json_file_path = '/content/drive/MyDrive/casanovo/sage/config_general_MQ_fixed_CAM_v1.json' #@param {type:"string"}
#@markdown - a `Sage`-compatible `.json` file, used as the template for every generated config
enzyme = "KR" #@param {type:"string"}
#@markdown - value written to `cleave_at` in the `Sage` config

#@markdown **`SAGE` PTM plus**
#@markdown - With PTM plus off, `Sage` runs with CAM (fixed) plus the variable mods Oxidation (M) and Deamidation (N, Q)
#@markdown - With PTM plus on, up to 5 variable mods can be set below and the fixed CAM (cysteine carbamidomethylation) can be turned off
use_PTM_plus = True #@param {type:"boolean"}
static_CAM = True #@param {type:"boolean"}
max_variable_mods= 3 #@param {type:"number"}
missed_cleavages= 2 #@param {type:"number"}
# Each AA_n / AA_n_mod pair is one variable modification: the residue (or
# terminus symbol) and its mass shift. A residue of "None" disables the slot.
AA_1 = "M" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
# NOTE(review): the non-PTM-plus branch of the run cell uses 15.994915 for
# M oxidation; confirm the shorter 15.9949 here is intentional.
AA_1_mod = 15.9949 #@param {type:"number"}
AA_2 = "P" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_2_mod = 15.9949 #@param {type:"number"}
AA_3 = "N" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_3_mod = 0.984016 #@param {type:"number"}
AA_4 = "Q" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_4_mod = 0.984016 #@param {type:"number"}
AA_5 = "None" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
#@markdown - `[` = N-terminal
AA_5_mod = 	42.010565 #@param {type:"number"}


In [2]:
#@title install dependencies
%%time

import os
import shutil
import glob
import json

# One-time setup: install Miniconda into /usr/local, then Sage from bioconda.
# The "Sage_READY" marker file makes re-running this cell a no-op.
if not os.path.isfile("Sage_READY"):
  print("installing conda...")
  # The download's exit status is deliberately not checked: with -nc, wget may
  # report a non-zero status when the installer is already on disk, and a
  # missing installer makes the next step fail anyway.
  os.system("wget -qnc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh")

  install_commands = [
      "bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local",
      "conda install -c bioconda -c conda-forge sage-proteomics -y -q",
  ]
  for install_command in install_commands:
    if os.system(install_command) != 0:
      # Stop here so "Run all" does not carry on without a working Sage.
      raise RuntimeError(f"Setup command failed: {install_command}")

  # Write the marker only after every step succeeded; otherwise a failed
  # install would be skipped forever on later runs.
  os.system("touch Sage_READY")


'''
Organise instrument files and their matched .fasta files into one folder per run

'''

def organise_files(directory, extension=None):
    """Move each spectrum file, and its matched FASTA, into its own sub-folder.

    For every ``*.<extension>`` file directly inside ``directory`` a sub-folder
    named after the file (minus its extension) is created if needed and the
    file is moved into it. If ``<name>_matched.fasta`` exists in ``directory``
    it is moved into the same sub-folder. Files already present at the
    destination are left untouched. Progress is reported with ``print``.

    Args:
        directory: Folder that holds the spectrum files.
        extension: Spectrum file extension without the leading dot. When
            omitted, the notebook-level ``file_type`` form value is used.

    Returns:
        None.
    """
    print("Organising files...")

    if not os.path.isdir(directory):
        print(f"The directory {directory} does not exist.")
        return

    if extension is None:
        # Fall back to the value chosen in the notebook's form cell.
        extension = file_type

    # Escape the folder so characters such as "[" or "*" in its name are
    # matched literally rather than being read as glob syntax.
    pattern = os.path.join(glob.escape(directory), f'*.{extension}')

    for MS2 in sorted(glob.glob(pattern)):
        base_name = os.path.splitext(os.path.basename(MS2))[0]
        new_folder_path = os.path.join(directory, base_name)

        if not os.path.exists(new_folder_path):
            os.makedirs(new_folder_path)
            print(f"Created folder: {new_folder_path}")
        else:
            print(f"Folder already exists: {new_folder_path}")

        MS2_path = os.path.join(new_folder_path, os.path.basename(MS2))
        if not os.path.exists(MS2_path):
            shutil.move(MS2, new_folder_path)
            print(f"Moved {MS2} to {new_folder_path}")
        else:
            print(f"MS2 file already exists in the destination: {MS2_path}")

        # The matched database is expected next to the spectrum file.
        fasta_filename = f"{base_name}_matched.fasta"
        fasta_file = os.path.join(directory, fasta_filename)

        if not os.path.exists(fasta_file):
            print(f"No matching .fasta file found for {base_name}")
            continue

        new_fasta_path = os.path.join(new_folder_path, fasta_filename)
        if not os.path.exists(new_fasta_path):
            shutil.move(fasta_file, new_folder_path)
            print(f"Moved {fasta_file} to {new_folder_path}")
        else:
            print(f".fasta file already exists in the destination: {new_fasta_path}")


"""Generate and save a Sage configuration .json file."""

def get_sage_config(json_file_path, peak_folder, static_mods, new_mods,
                    missed_cleavages, enzyme,
                    min_len, max_len, max_variable_mods,
                    output_config_path):
    """Fill a Sage template config with run-specific settings and save it.

    The template at ``json_file_path`` is loaded, its ``mzml_paths`` and the
    search settings under ``database`` are overwritten, and the result is
    written to ``output_config_path``. Decoy generation is always switched on
    with the ``rev_`` tag. Keys of the template that are not listed here are
    kept unchanged.

    Args:
        json_file_path: Path of the template ``.json`` config.
        peak_folder: Path of a spectrum file, or a glob pattern matching
            several. A path that exists on disk is used literally.
        static_mods: Value stored as ``database.static_mods``.
        new_mods: Value stored as ``database.variable_mods``.
        missed_cleavages: Value stored as ``database.enzyme.missed_cleavages``.
        enzyme: Value stored as ``database.enzyme.cleave_at``.
        min_len: Value stored as ``database.enzyme.min_len``.
        max_len: Value stored as ``database.enzyme.max_len``.
        max_variable_mods: Value stored as ``database.max_variable_mods``.
        output_config_path: Destination of the generated config.

    Returns:
        None.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # A path that exists is taken as-is: running it through glob would treat
    # characters such as "[" in a file name as pattern syntax and match nothing.
    if os.path.exists(peak_folder):
        peak_files = [peak_folder]
    else:
        peak_files = sorted(glob.glob(peak_folder))
    print(f"🗂️ {len(peak_files)} file(s) collected from {peak_folder}")

    json_data['mzml_paths'] = peak_files

    # Create the nested sections when the template does not provide them.
    database = json_data.setdefault('database', {})
    enzyme_settings = database.setdefault('enzyme', {})

    database['static_mods'] = static_mods
    database['variable_mods'] = new_mods
    enzyme_settings['missed_cleavages'] = missed_cleavages
    enzyme_settings['cleave_at'] = enzyme
    enzyme_settings['min_len'] = min_len
    enzyme_settings['max_len'] = max_len
    database['max_variable_mods'] = max_variable_mods
    database['decoy_tag'] = "rev_"
    database['generate_decoys'] = True

    with open(output_config_path, 'w') as f:
        json.dump(json_data, f, indent=4)


installing conda...
CPU times: user 143 ms, sys: 28.7 ms, total: 172 ms
Wall time: 50.6 s


In [5]:
#@title Run Sage

organise_files(peak_folder)
folder_path = peak_folder

if use_PTM_plus:
    AAs = [AA_1, AA_2, AA_3, AA_4, AA_5]
    mods = [AA_1_mod, AA_2_mod, AA_3_mod, AA_4_mod, AA_5_mod]
    PTMs = {}

    for AA, mod in zip(AAs, mods):
        if AA != "None":
            PTMs[AA] = [mod]

    big_folder = glob.glob(f"{folder_path}/*")
    for folder in big_folder:
        if not os.path.isdir(folder):
            continue

        mzml_files = glob.glob(f"{folder}/*.{file_type}")
        if not mzml_files:
            continue

        peak_path = mzml_files[0]
        output_json = peak_path.replace(f".{file_type}", '.json')

        json_file_path = json_file_path
        missed_cleavages = missed_cleavages
        enzyme = enzyme
        min_len = 6
        max_len = 30
        max_variable_mods = max_variable_mods
        static_mods = {"C": 57.021464} if static_CAM else {}

        get_sage_config(
            json_file_path, peak_path, static_mods, PTMs,
            missed_cleavages, enzyme,
            min_len, max_len,
            max_variable_mods, output_json
        )

        fasta_files = glob.glob(f"{folder}/*.fasta")
        if not fasta_files:
            continue

        fasta_path = fasta_files[0]
        !sage {output_json} --fasta {fasta_path} \
            --write-pin --output_directory {folder}

else:
    big_folder = glob.glob(f"{folder_path}/*")
    for folder in big_folder:
        if not os.path.isdir(folder):
            continue

        mzml_files = glob.glob(f"{folder}/*.{file_type}")
        if not mzml_files:
            continue

        peak_path = mzml_files[0]
        output_json = peak_path.replace(f".{file_type}", '.json')
        json_file_path = json_file_path
        missed_cleavages = 2
        enzyme = enzyme
        min_len = 6
        max_len = 30
        max_variable_mods = 5
        static_mods = {"C": 57.021464}
        new_mods = {"M": [15.994915], "N": [0.984016], "Q": [0.984016]}

        get_sage_config(
            json_file_path, peak_path, static_mods, new_mods,
            missed_cleavages, enzyme,
            min_len, max_len,
            max_variable_mods, output_json
        )

        fasta_files = glob.glob(f"{folder}/*.fasta")
        if not fasta_files:
            continue

        fasta_path = fasta_files[0]
        !sage {output_json} --fasta {fasta_path} \
            --write-pin --output_directory {folder}


Organising files...
The directory  does not exist.
