In [1]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import trange

from function.cutpondr import CutPONDR
from function.utilities import get_uniprot_rawdata

# Parameters

In [2]:
##### CHANGE HERE: user-specific settings, edit before running #####
algorithm = "VLXT"  # PONDR algorithm to use, one of: 'VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2'
chromedriver_path = Path("/home/wenlin/d/custom_command/chromedriver") # chromedriver location, passed to CutPONDR as driver_path; NOTE: machine-specific absolute path
##### END OF USER-SPECIFIC SETTINGS #####

In [3]:
# output directory for the PONDR order/disorder identification results
# (the per-algorithm pickle files are written below this directory)
pondr_disorder_ident_path = Path("./output/pondr_disorder_ident")

# Load UniProt data

In [4]:
# Location of the raw UniProt table for human proteins.
human_uniprot_path = Path("./rawdata/human_uniprot.tab")

# Load the table and order it by accession in a single chained expression;
# ignore_index=True renumbers the rows 0..n-1 after sorting.
human_df = get_uniprot_rawdata(human_uniprot_path).sort_values(
    by="uniprot_id", ignore_index=True
)

In [5]:
# preview the loaded UniProt table
human_df

Unnamed: 0,uniprot_id,gene_name,protein_name,protein_sequence,taxonomy
0,A0A024RBG1,NUDT4B,Diphosphoinositol polyphosphate phosphohydrola...,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,9606
1,A0A075B6H7,IGKV3-7,Probable non-functional immunoglobulin kappa v...,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...,9606
2,A0A075B6H8,IGKV1D-42,Probable non-functional immunoglobulin kappa v...,MDMRVPAQLLGLLLLWLPGVRFDIQMTQSPSFLSASVGDRVSIICW...,9606
3,A0A075B6H9,IGLV4-69,Immunoglobulin lambda variable 4-69,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...,9606
4,A0A075B6I0,IGLV8-61,Immunoglobulin lambda variable 8-61,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...,9606
...,...,...,...,...,...
20391,S4R3Y5,MTRNR2L11,Humanin-like 11 (HN11) (MT-RNR2-like protein 11),MATRGFSCLLLVISEIDLSVKRWV,9606
20392,U3KPV4,A3GALT2,"Alpha-1,3-galactosyltransferase 2 (EC 2.4.1.87...",MALKEGLRAWKRIFWRQILLTLGLLGLFLYGLPKFRHLEALIPMGV...,9606
20393,W5XKT8,SPACA6,Sperm acrosome membrane-associated protein 6 (...,MALLALASAVPSALLALAVFRVPAWACLLCFTTYSERLRICQMFVG...,9606
20394,W6CW81,PYDC5,Pyrin domain-containing protein 5 (Pyrin domai...,MESKYKEILLLTSLDNITDEELDRFKCFLPDEFNIATGKLHTLNST...,9606


# PONDR disorder identification

In [7]:
def _pondr_round(cutpondr, entries, algorithm, records):
    """
    Run one PONDR pass over `entries` and collect the successful results.

    cutpondr: CutPONDR object
    entries: list of (uniprot_id, protein_sequence) tuples to submit
    algorithm: pondr algorithm name forwarded to cutpondr.cut
    records: list that receives one result dict per successful entry (mutated in place)

    return: list of the (uniprot_id, protein_sequence) tuples whose processing raised
    """
    failed_entries = []
    t = trange(len(entries), desc=" ", leave=True)
    for i in t:
        uniprot_id, protein_sequence = entries[i]

        t.set_description(uniprot_id)
        t.refresh()

        try:
            cutpondr.cut(sequence=protein_sequence, protein_name="aa", algorithm=algorithm)
            record = {
                "uniprot_id": uniprot_id,
                "order_sequence": cutpondr.get_order_sequence(),
                "disorder_sequence": cutpondr.get_disorder_sequence(),
                "od_ident": cutpondr.get_od_ident(),
            }
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / kernel interrupt can still stop a long run.
            t.set_description("{} failed".format(uniprot_id))
            t.refresh()

            print("{} failed".format(uniprot_id))
            failed_entries.append((uniprot_id, protein_sequence))
        else:
            records.append(record)

    return failed_entries


def pondr_disorder(cutpondr, input_df, algorithm="VLXT"):
    """
    Identify order/disorder sequences with PONDR for every protein in input_df.

    Every row is submitted once (first round); the rows that raised are submitted
    a second time (second round), because a first-round failure may come from a
    network problem rather than from PONDR itself.

    cutpondr: CutPONDR object
    input_df: DataFrame with "uniprot_id" and "protein_sequence" columns
              (rows are read by position, so any index is accepted)
    algorithm : ('VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2'), default 'VLXT', pondr algorithm for use

    return: (output_df, second_failed_list)
        output_df: DataFrame with columns "uniprot_id", "order_sequence",
                   "disorder_sequence", "od_ident", sorted by "uniprot_id"
        second_failed_list: uniprot ids that failed in both rounds
    """
    columns = ["uniprot_id", "order_sequence", "disorder_sequence", "od_ident"]

    # Positional (id, sequence) pairs: no dependence on the frame's index labels.
    entries = list(zip(input_df["uniprot_id"].tolist(), input_df["protein_sequence"].tolist()))

    # Results are gathered in a plain list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []

    # first round
    print("=====first_round=====")
    first_failed = _pondr_round(cutpondr, entries, algorithm, records)

    # second round: retry the first-round failures
    print("=====second_round=====")
    second_failed = _pondr_round(cutpondr, first_failed, algorithm, records)
    second_failed_list = [uniprot_id for uniprot_id, _ in second_failed]

    output_df = pd.DataFrame(records, columns=columns)
    output_df = output_df.sort_values(by="uniprot_id", ignore_index=True)

    return output_df, second_failed_list

In [1]:
# Run the PONDR identification for the algorithm selected in the settings cell.
cutpondr = CutPONDR(driver_path=chromedriver_path, show_progress_window=True)
try:
    # human_df is the UniProt table loaded above (the previous `a` was never defined
    # in this notebook and only worked through leftover kernel state)
    od_human_df, failed_human_list = pondr_disorder(cutpondr, input_df=human_df, algorithm=algorithm)
finally:
    # always close the CutPONDR object, even when the run fails or is interrupted
    cutpondr.close()

# save output for further use

In [201]:
# Destinations of the two result pickles inside the shared output directory;
# both file names are prefixed with the selected PONDR algorithm.
disorder_ident_output = pondr_disorder_ident_path.joinpath(
    "{}_od_human_df.pkl".format(algorithm)
)
disorder_ident_failed_output = pondr_disorder_ident_path.joinpath(
    "{}_failed_human_list.pkl".format(algorithm)
)

# show both paths, one per line
print(disorder_ident_output, disorder_ident_failed_output, sep="\n")

output/pondr_disorder_ident/VSL2_od_human_df.pkl
output/pondr_disorder_ident/VSL2_failed_human_list.pkl


In [202]:
# Create the output directory first: without it both writes below raise
# FileNotFoundError and the results of the long PONDR run would be lost.
pondr_disorder_ident_path.mkdir(parents=True, exist_ok=True)

# table of the successfully processed proteins
od_human_df.to_pickle(disorder_ident_output)
# uniprot ids that still failed after the retry round
with open(disorder_ident_failed_output, "wb") as f:
    pickle.dump(failed_human_list, f)

In [203]:
# read: reload the previously saved results instead of re-running PONDR
# od_human_df = pd.read_pickle(disorder_ident_output)
# with open(disorder_ident_failed_output, 'rb') as f:
#     failed_human_list = pickle.load(f)

In [None]:
# Run the identification for several PONDR algorithms with one CutPONDR object
# and save each algorithm's results as soon as they are available.
cutpondr = CutPONDR(driver_path=chromedriver_path, show_progress_window=True)

# make sure the output directory exists before the first (long) run starts
pondr_disorder_ident_path.mkdir(parents=True, exist_ok=True)

try:
    for algorithm in ("VLXT", "VL3-BA", "VSL2"):
        od_human_df, failed_human_list = pondr_disorder(cutpondr, input_df=human_df, algorithm=algorithm)

        disorder_ident_output = pondr_disorder_ident_path / "{}_od_human_df.pkl".format(algorithm)
        disorder_ident_failed_output = pondr_disorder_ident_path / "{}_failed_human_list.pkl".format(algorithm)

        od_human_df.to_pickle(disorder_ident_output)
        with open(disorder_ident_failed_output, "wb") as f:
            pickle.dump(failed_human_list, f)
finally:
    # always close the CutPONDR object, even when a run fails or is interrupted
    cutpondr.close()

=====first_round=====


 :   0%|          | 0/20396 [00:00<?, ?it/s]

A0A075B6S0 failed
A0A075B6Y3 failed
A0A075B6Y9 failed
A0A075B700 failed
A0A075B706 failed
A0A0A0MT70 failed
A0A0A0MT78 failed
A0A0A0MT87 failed
A0A0A0MT89 failed
A0A0A0MT94 failed
A0A0A0MTA4 failed
A0A0A0MTA7 failed
A0A0B4J200 failed
A0A0C4DH62 failed
A0A0C5B5G6 failed
A0A0J9YWP8 failed
A0A0J9YWX3 failed
A0A0J9YX06 failed
A0A0J9YXA8 failed
A0A0J9YXG5 failed
A0A0J9YXM7 failed
O60613 failed
P01358 failed
P01858 failed
P02728 failed
P02729 failed
P07203 failed
P0CJ68 failed
P0CJ69 failed
P0CJ70 failed
P0CJ71 failed
P0CJ72 failed
P0CJ73 failed
P0CJ74 failed
P0CJ75 failed
P0CJ76 failed
P0CJ77 failed
P0DMP1 failed
P0DOY5 failed
P0DPI4 failed
P0DPR3 failed
P18283 failed
P18545 failed
P19338 failed
P22103 failed
P22352 failed
P36969 failed
P49895 failed
P49908 failed
P55073 failed
P59796 failed
P59797 failed
P62341 failed
P62945 failed
P63302 failed
P69208 failed
Q16881 failed
Q86VQ6 failed
Q8IVG9 failed
Q8IZQ5 failed
Q8WWX9 failed
Q92813 failed
Q99611 failed
Q9BQE4 failed
Q9BVL4 failed
Q9C0D9

 :   0%|          | 0/73 [00:00<?, ?it/s]

A0A075B6S0 failed
A0A075B6Y3 failed
A0A075B6Y9 failed
A0A075B700 failed
A0A075B706 failed
A0A0A0MT70 failed
A0A0A0MT78 failed
A0A0A0MT87 failed
A0A0A0MT89 failed
A0A0A0MT94 failed
A0A0A0MTA4 failed
A0A0A0MTA7 failed
A0A0B4J200 failed
A0A0C4DH62 failed
A0A0C5B5G6 failed
A0A0J9YWP8 failed
A0A0J9YWX3 failed
A0A0J9YX06 failed
A0A0J9YXA8 failed
A0A0J9YXG5 failed
A0A0J9YXM7 failed
O60613 failed
P01358 failed
P01858 failed
P02728 failed
P02729 failed
P07203 failed
P0CJ68 failed
P0CJ69 failed
P0CJ70 failed
P0CJ71 failed
P0CJ72 failed
P0CJ73 failed
P0CJ74 failed
P0CJ75 failed
P0CJ76 failed
P0CJ77 failed
P0DMP1 failed
P0DOY5 failed
P0DPI4 failed
P0DPR3 failed
P18283 failed
P22103 failed
P22352 failed
P36969 failed
P49895 failed
P49908 failed
P55073 failed
P59796 failed
P59797 failed
P62341 failed
P62945 failed
P63302 failed
P69208 failed
Q16881 failed
Q86VQ6 failed
Q8IVG9 failed
Q8IZQ5 failed
Q8WWX9 failed
Q92813 failed
Q99611 failed
Q9BQE4 failed
Q9BVL4 failed
Q9C0D9 failed
Q9NNW7 failed
Q9NRI7

 :   0%|          | 0/20396 [00:00<?, ?it/s]

A0A075B6S0 failed
A0A075B6Y3 failed
A0A075B6Y9 failed
A0A075B700 failed
A0A075B706 failed
A0A0A0MT70 failed
A0A0A0MT78 failed
A0A0A0MT87 failed
A0A0A0MT89 failed
A0A0A0MT94 failed
A0A0A0MTA4 failed
A0A0A0MTA7 failed
A0A0B4J200 failed
A0A0C4DH62 failed
A0A0C5B5G6 failed
A0A0J9YWP8 failed
A0A0J9YWX3 failed
A0A0J9YX06 failed
A0A0J9YXA8 failed
A0A0J9YXG5 failed
A0A0J9YXM7 failed
