In [None]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import trange

from function.cutpondr import CutPONDR
from function.utilities import get_uniprot_rawdata

# Parameters

In [None]:
#####CHANGE HERE#####
algorithm = "VSL2"  # PONDR algorithm to use ('VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2')
#####CHANGE HERE#####

In [None]:
# Directory that holds the PONDR disorder-identification outputs
# (the result DataFrame and failed-ID list pickles are written under it).
pondr_disorder_ident_path = Path("./output/pondr_disorder_ident")

# Load uniprot data

In [None]:
# Read the raw UniProt table and order it by uniprot_id with a fresh 0..n-1 index.
human_uniprot_path = Path("./rawdata/human_uniprot.tab")
human_df = get_uniprot_rawdata(human_uniprot_path).sort_values(
    by="uniprot_id",
    ignore_index=True,
)

In [None]:
# Display the sorted UniProt table as a quick visual check of the load step.
human_df

# PONDR disorder identification

In [None]:
def pondr_disorder(cutpondr, input_df, algorithm):
    """
    Run PONDR on every protein in input_df and collect the order/disorder results.

    cutpondr: CutPONDR object; must provide cut(), get_order_sequence(),
        get_disorder_sequence() and get_od_ident()
    input_df: DataFrame with "uniprot_id" and "protein_sequence" columns (e.g. human_df);
        rows are processed in positional order, so any index is accepted
    algorithm: PONDR algorithm name forwarded to cutpondr.cut()
        ('VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2')

    Returns (output_df, failed_list):
        output_df: one row per successfully processed protein, with columns
            "uniprot_id", "order_sequence", "disorder_sequence", "od_ident"
        failed_list: uniprot_ids for which the PONDR call raised an exception
    """
    columns = ["uniprot_id", "order_sequence", "disorder_sequence", "od_ident"]

    # Extract the two needed columns once; positional access below then works
    # no matter how input_df is indexed.
    uniprot_ids = input_df["uniprot_id"].tolist()
    protein_sequences = input_df["protein_sequence"].tolist()

    # Rows are accumulated in a plain list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    failed_list = []
    t = trange(len(input_df), desc=" ", leave=True)
    for i in t:
        uniprot_id = uniprot_ids[i]
        protein_sequence = protein_sequences[i]

        t.set_description(uniprot_id)
        t.refresh()

        try:
            cutpondr.cut(sequence=protein_sequence, protein_name="aa", algorithm=algorithm)
            # The record is only appended if every getter succeeds, so a protein
            # never ends up in both output_df and failed_list.
            records.append(
                {
                    "uniprot_id": uniprot_id,
                    "order_sequence": cutpondr.get_order_sequence(),
                    "disorder_sequence": cutpondr.get_disorder_sequence(),
                    "od_ident": cutpondr.get_od_ident(),
                }
            )
        except Exception:
            # Best effort: note the failure and continue with the next protein.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long run can still be stopped by the user.
            t.set_description("{} failed".format(uniprot_id))
            t.refresh()

            print("{} failed".format(uniprot_id))
            failed_list.append(uniprot_id)

    output_df = pd.DataFrame(records, columns=columns)
    return output_df, failed_list

In [None]:
cutpondr = CutPONDR(show_progress_window=False)
try:
    od_human_df, failed_human_list = pondr_disorder(cutpondr, input_df=human_df, algorithm=algorithm)
finally:
    # Always call close(), even if the run raises or is interrupted part-way.
    cutpondr.close()

# Save output for further use

In [None]:
# Output file names are prefixed with the selected algorithm.
disorder_ident_output = pondr_disorder_ident_path.joinpath("{}_od_human_df.pkl".format(algorithm))
disorder_ident_failed_output = pondr_disorder_ident_path.joinpath(
    "{}_failed_human_list.pkl".format(algorithm)
)

# Show both target paths, one per line.
print(disorder_ident_output, disorder_ident_failed_output, sep="\n")

In [None]:
# Create the output directory first; writing would otherwise fail with
# FileNotFoundError on a fresh checkout where it does not exist yet.
disorder_ident_output.parent.mkdir(parents=True, exist_ok=True)

# Persist the result table and the list of failed uniprot_ids as pickles.
od_human_df.to_pickle(disorder_ident_output)
with open(disorder_ident_failed_output, "wb") as f:
    pickle.dump(failed_human_list, f)

In [None]:
# Reload previously saved results (uncomment to use; only unpickle files you created yourself)
# od_human_df = pd.read_pickle(disorder_ident_output)
# with open(disorder_ident_failed_output, 'rb') as f:
#     failed_human_list = pickle.load(f)