In [None]:
import os
import sys
import pickle
import requests
import numpy as np
import pandas as pd
from pathlib import Path 
from tqdm.notebook import trange
from tqdm.notebook import tqdm

In [None]:
# Locate the enclosing git repository so that project modules can be imported
# and data paths can be built relative to the repository root.
import git

repo = git.Repo('.', search_parent_directories=True)
work_path = Path(repo.working_tree_dir)

# Register the repository root on sys.path once (no duplicate entries on re-run).
if not any(entry == str(work_path) for entry in sys.path):
    sys.path.append(str(work_path))

In [None]:
from function.utilities import find_human_sequence
from function.utilities import get_fasta_filename
from function.cutpondr import CutPONDR

# Get human sequence from downloaded OMA fasta

In [None]:
# Path to the processed fasta files (b_seqaa_checked) from which the human
# sequence of each homolog group is extracted; adjust if the data lives elsewhere.
b_seqaa_checked_path = work_path.joinpath("1_prepare_training_data", "oma_all", "b_seqaa_checked")

In [None]:
# Collect one record per fasta file and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
human_protein_records = []

# sorted() makes the row order reproducible: rglob yields files in
# filesystem-dependent order.
fasta_pathlist = sorted(Path(b_seqaa_checked_path).rglob("*.fasta"))

progress = tqdm(fasta_pathlist, leave=True)
for fasta_path in progress:

    # show the file currently being processed on the progress bar
    progress.set_description(get_fasta_filename(fasta_path))

    # get human sequence
    human_sequence_info = find_human_sequence(fasta_path)
    human_protein_records.append({"oma_group_id": human_sequence_info['oma_group_id'],
                                  "protein_sequence": human_sequence_info['sequence']})

# Passing `columns` keeps both columns present even when no fasta file was found.
human_protein_df = pd.DataFrame(human_protein_records,
                                columns=['oma_group_id','protein_sequence'])

In [None]:
# save to pickle
# human_protein_df.to_pickle(work_path / "1_prepare_training_data" / "1-2_human_from_oma.pkl")

# PONDR disorder identification

In [None]:
def pondr_disorder(algorithm, delete=False):
    """
    Run PONDR on every row of the global ``input_df`` and append the results
    to the global ``output_df``.

    The function works on three module-level names instead of arguments:
    cutpondr  : CutPONDR object used to query PONDR
    input_df  : DataFrame with "oma_group_id" and "protein_sequence" columns
    output_df : DataFrame receiving one row per successfully processed sequence
                (oma_group_id, protein_sequence, order_sequence,
                disorder_sequence, od_ident)

    algorithm : PONDR algorithm name, passed to ``cutpondr.cut`` unchanged
                ('VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2')
    delete    : if True, rows that were processed successfully are dropped from
                ``input_df`` in place, so a later call only sees the remainder

    Returns a list with the oma_group_id of every row whose PONDR call raised
    an exception. Those rows are skipped (and never dropped from ``input_df``)
    so that one bad sequence does not abort the whole run.
    """

    global cutpondr, input_df, output_df

    # Iterate over the real index labels. Positions 0..n-1 are only valid labels
    # for an untouched default index; after rows have been dropped (e.g. a
    # previous call with delete=True) ``input_df.loc[position]`` raises KeyError.
    labels = list(input_df.index)

    records = []       # result rows, concatenated once at the end
    done_labels = []   # labels of successfully processed rows
    failed = []        # oma_group_id of rows whose PONDR call raised

    t = trange(len(labels), desc=" ", leave=True)
    try:
        for pos in t:
            label = labels[pos]
            oma_group_id = input_df.at[label, "oma_group_id"]
            protein_sequence = input_df.at[label, "protein_sequence"]

            t.set_description(oma_group_id)
            t.refresh()

            try:
                cutpondr.cut(sequence=protein_sequence, protein_name="aa", algorithm=algorithm)
                record = {"oma_group_id": oma_group_id,
                          "protein_sequence": protein_sequence,
                          "order_sequence": cutpondr.get_order_sequence(),
                          "disorder_sequence": cutpondr.get_disorder_sequence(),
                          "od_ident": cutpondr.get_od_ident()}
            except Exception:
                # KeyboardInterrupt is not an Exception, so a manual stop still
                # propagates (and is flushed by the ``finally`` below).
                failed.append(oma_group_id)
                continue

            records.append(record)
            done_labels.append(label)
    finally:
        # Flush here so that results gathered before an interruption are kept.
        # A single concat replaces the per-row concat, which copied the whole
        # frame on every iteration.
        if records:
            output_df = pd.concat([output_df, pd.DataFrame(records)], ignore_index=True)
        if delete and done_labels:
            input_df.drop(index=done_labels, inplace=True)

    return failed

In [None]:
# PONDR algorithm used for the order/disorder prediction
algorithm = "VSL2"

# input_df: human sequences previously extracted from the OMA fasta files
input_df = pd.read_pickle(work_path.joinpath("1_prepare_training_data", "1-2_human_from_oma.pkl"))

# output_df: empty table that receives PONDR's output row by row
output_df = pd.DataFrame(
    columns=[
        "oma_group_id",
        "protein_sequence",
        "order_sequence",
        "disorder_sequence",
        "od_ident",
    ]
)

In [None]:
cutpondr = CutPONDR(show_progress_window=False)
try:
    # whatever pondr_disorder returns is kept in failed_human_list
    failed_human_list = pondr_disorder(algorithm=algorithm, delete=False)
finally:
    # always call close(), even when the run raises or is interrupted,
    # so the CutPONDR object is not left open
    cutpondr.close()

In [None]:
# output_df.to_pickle(work_path / "1_prepare_training_data" / "1-2_human_pondr_vsl2.pkl")