In [1]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import trange

from function.cutpondr import CutPONDR
from function.utilities import get_uniprot_rawdata

# Parameters

In [2]:
#####CHANGE HERE#####
# User-editable settings: review both values before running the notebook.
# `algorithm` is passed to the PONDR run and is also used as the prefix of the output file names.
algorithm = "VSL2"  # PONDR algorithm to use: one of 'VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2'
# NOTE: machine-specific absolute path; point it at the chromedriver executable on your own system.
chromedriver_path = Path("/home/wenlin/d/custom_command/chromedriver") # chrome driver's path
#####CHANGE HERE#####

In [8]:
# output directory for the PONDR disorder-identification results
# (the result table and the failed-ID list are pickled here at the end of the notebook)
pondr_disorder_ident_path = Path("./output/pondr_disorder_ident")

# Load UniProt data

In [4]:
# Human UniProt export (.tab file) read through the project helper get_uniprot_rawdata.
# The resulting frame (displayed in the next cell) has the columns
# uniprot_id, gene_name, protein_name, protein_sequence and taxonomy.
human_uniprot_path = Path("./rawdata/human_uniprot.tab")
human_df = get_uniprot_rawdata(human_uniprot_path)

In [5]:
# Preview the loaded table; pandas elides the middle rows of a large frame.
human_df

Unnamed: 0,uniprot_id,gene_name,protein_name,protein_sequence,taxonomy
0,Q96R72,OR4K3,Olfactory receptor 4K3 (Olfactory receptor OR1...,MAWSNQSAVTEFILRGLSSSLELQIFYFLFFSIVYAATVLGNLLIV...,9606
1,Q9UKL2,OR52A1,Olfactory receptor 52A1 (HPFH1OR) (Odorant rec...,MSISNITVYMPSVLTLVGIPGLESVQCWIGIPFCAIYLIAMIGNSL...,9606
2,Q9H205,OR2AG1,Olfactory receptor 2AG1 (HT3) (Olfactory recep...,MELWNFTLGSGFILVGILNDSGSPELLCATITILYLLALISNGLLL...,9606
3,Q8NGN2,OR10S1,Olfactory receptor 10S1 (Olfactory receptor OR...,MTSRSVCEKMTMTTENPNQTVVSHFFLEGLRYTAKHSSLFFLLFLL...,9606
4,Q8NGC1,OR11G2,Olfactory receptor 11G2 (Olfactory receptor OR...,MHFLSQNDLNINLIPHLCLHRHSVIAGAFTIHRHMKIFNSPSNSST...,9606
...,...,...,...,...,...
20391,Q8WVZ7,RNF133,E3 ubiquitin-protein ligase RNF133 (EC 2.3.2.2...,MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...,9606
20392,P05387,RPLP2,60S acidic ribosomal protein P2 (Large ribosom...,MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...,9606
20393,P51991,HNRNPA3,Heterogeneous nuclear ribonucleoprotein A3 (hn...,MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...,9606
20394,Q9BZX4,ROPN1B,Ropporin-1B (Rhophilin-associated protein 1B),MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...,9606


# PONDR disorder identification

In [6]:
def pondr_disorder(cutpondr, input_df, algorithm):
    """
    Run PONDR order/disorder identification for every protein in ``input_df``.

    cutpondr: CutPONDR object; must provide cut(), get_order_sequence(),
        get_disorder_sequence() and get_od_ident()
    input_df: DataFrame with "uniprot_id" and "protein_sequence" columns
        (e.g. human_df). Rows are visited by position, so any index is accepted.
    algorithm: PONDR algorithm to use ('VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2')

    Returns a tuple (output_df, failed_list):
        output_df: one row per successfully processed protein, with the columns
            uniprot_id, order_sequence, disorder_sequence and od_ident
        failed_list: uniprot_id of every protein whose prediction raised an exception
    """
    columns = ["uniprot_id", "order_sequence", "disorder_sequence", "od_ident"]
    # Collect plain dicts and build the frame once at the end: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic anyway.
    records = []
    failed_list = []

    t = trange(input_df.shape[0], desc=" ", leave=True)
    for i in t:
        # Positional lookup: label-based .loc[i] breaks as soon as the frame has been
        # filtered, sliced or otherwise carries a non 0..n-1 index.
        row = input_df.iloc[i]
        uniprot_id = row["uniprot_id"]
        protein_sequence = row["protein_sequence"]

        t.set_description(uniprot_id)
        t.refresh()

        try:
            cutpondr.cut(sequence=protein_sequence, protein_name="aa", algorithm=algorithm)
            records.append(
                {
                    "uniprot_id": uniprot_id,
                    "order_sequence": cutpondr.get_order_sequence(),
                    "disorder_sequence": cutpondr.get_disorder_sequence(),
                    "od_ident": cutpondr.get_od_ident(),
                }
            )
        except Exception:
            # Best effort: record the failure and move on to the next protein.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit can
            # still stop a long-running job.
            t.set_description("{} failed".format(uniprot_id))
            t.refresh()

            print("{} failed".format(uniprot_id))
            failed_list.append(uniprot_id)

    output_df = pd.DataFrame(records, columns=columns)
    return output_df, failed_list

In [3]:
# Start the CutPONDR client (driven through chromedriver) and run the prediction on every human protein.
cutpondr = CutPONDR(driver_path=chromedriver_path, show_progress_window=True)
try:
    od_human_df, failed_human_list = pondr_disorder(cutpondr, input_df=human_df, algorithm=algorithm)
finally:
    # Always release the client, even when the run fails or is interrupted part-way.
    cutpondr.close()

# save output for further use

In [7]:
# Destination pickles for the chosen algorithm: the result table and the list of failed IDs.
disorder_ident_output = pondr_disorder_ident_path.joinpath("{}_od_human_df.pkl".format(algorithm))
disorder_ident_failed_output = pondr_disorder_ident_path.joinpath(
    "{}_failed_human_list.pkl".format(algorithm)
)

# Echo both targets, one per line, so they can be checked before anything is written.
print(disorder_ident_output, disorder_ident_failed_output, sep="\n")

output/pondr disorder ident/VSL2_od_human_df.pkl
output/pondr disorder ident/VSL2_failed_human_list.pkl


In [None]:
# Make sure the output directory exists first, so the results of the long
# prediction run are not lost to a missing folder.
pondr_disorder_ident_path.mkdir(parents=True, exist_ok=True)

# Persist the result table and the list of failed UniProt IDs for later notebooks.
od_human_df.to_pickle(disorder_ident_output)
with open(disorder_ident_failed_output, "wb") as f:
    pickle.dump(failed_human_list, f)