In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from function.cutpondr import CutPONDR
from tqdm.notebook import trange
import pickle

In [2]:
def load_uniprot_rawdata(path):
    '''
    Read a UniProt tab-separated export into a pandas.DataFrame.

    path: str, the uniprot tab file path

    The file is expected to be downloaded from uniprot
    (https://www.uniprot.org/uploadlists/) with the columns
    "Entry, Gene names (primary), Protein names, Sequence, Organism ID"
    (in that order) and saved as "Tab-separated format (.tab)".

    return: pandas.DataFrame with the columns uniprot_id, gene_name,
            protein_name, protein_sequence and taxonomy, re-indexed from 0
    '''
    column_names = ['uniprot_id',
                    'gene_name',
                    'protein_name',
                    'protein_sequence',
                    'taxonomy']

    # `names=` is given without `header=`, so the file's own header line is
    # parsed as an ordinary first row; it is discarded right below.
    raw_table = pd.read_csv(path, sep='\t', names=column_names)
    without_header = raw_table.drop(0)

    # Renumber the rows from 0 without keeping the old index as a column.
    return without_header.reset_index(drop=True)

In [3]:
def pondr_disorder(cutpondr,input_df,algorithm='VLXT'):
    '''
    Run PONDR order/disorder identification for every protein in input_df.

    cutpondr: CutPONDR object; its cut(), get_order_sequence(),
              get_disorder_sequence() and get_od_ident() methods are used
    input_df: pandas.DataFrame, uniprot rawdata for identify disorder/order sequences,
              must contain the columns 'uniprot_id' and 'protein_sequence'
    algorithm : {'VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2'}, default 'VLXT', pondr algorithm for use

    return: (output_df, failed_list)
            output_df: pandas.DataFrame with the columns uniprot_id, order_sequence,
                       disorder_sequence and od_ident, one row per successful protein
            failed_list: list of the uniprot_id values whose identification raised
    '''
    output_columns = ['uniprot_id','order_sequence','disorder_sequence','od_ident']
    records = []
    failed_list = []

    # Read both columns positionally so that the loop does not depend on
    # input_df carrying a default 0..n-1 index (e.g. after filtering rows).
    uniprot_ids = input_df['uniprot_id'].tolist()
    protein_sequences = input_df['protein_sequence'].tolist()

    t = trange(len(uniprot_ids), desc=' ', leave=True)
    for i in t:
        uniprot_id = uniprot_ids[i]
        protein_sequence = protein_sequences[i]

        t.set_description(uniprot_id)
        t.refresh()

        try:
            cutpondr.cut(sequence=protein_sequence,protein_name='aa',algorithm=algorithm)
            # Collect plain dicts and build the frame once at the end:
            # DataFrame.append no longer exists in pandas >= 2.0 and growing a
            # frame row by row is quadratic anyway.
            records.append({'uniprot_id': uniprot_id,
                            'order_sequence': cutpondr.get_order_sequence(),
                            'disorder_sequence': cutpondr.get_disorder_sequence(),
                            'od_ident': cutpondr.get_od_ident()})
        except Exception:
            # Best effort: a protein that cannot be processed is recorded and
            # skipped. `Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during the long run.
            t.set_description('{} failed'.format(uniprot_id))
            t.refresh()
            print('{} failed'.format(uniprot_id))
            failed_list.append(uniprot_id)

    # `columns=` keeps the expected schema even when no protein succeeded.
    output_df = pd.DataFrame(records, columns=output_columns)
    return output_df,failed_list

# Load uniprot data 

In [4]:
# Path to the protein table downloaded from UniProt (tab-separated .tab export)
human_df_path = './rawdata/human_uniprot.tab' 

# Load the protein table into a pandas.DataFrame via load_uniprot_rawdata
human_df = load_uniprot_rawdata(human_df_path)

In [5]:
# Show the loaded table as the cell output
human_df

Unnamed: 0,uniprot_id,gene_name,protein_name,protein_sequence,taxonomy
0,Q96R72,OR4K3,Olfactory receptor 4K3 (Olfactory receptor OR1...,MAWSNQSAVTEFILRGLSSSLELQIFYFLFFSIVYAATVLGNLLIV...,9606
1,Q9UKL2,OR52A1,Olfactory receptor 52A1 (HPFH1OR) (Odorant rec...,MSISNITVYMPSVLTLVGIPGLESVQCWIGIPFCAIYLIAMIGNSL...,9606
2,Q9H205,OR2AG1,Olfactory receptor 2AG1 (HT3) (Olfactory recep...,MELWNFTLGSGFILVGILNDSGSPELLCATITILYLLALISNGLLL...,9606
3,Q8NGN2,OR10S1,Olfactory receptor 10S1 (Olfactory receptor OR...,MTSRSVCEKMTMTTENPNQTVVSHFFLEGLRYTAKHSSLFFLLFLL...,9606
4,Q8NGC1,OR11G2,Olfactory receptor 11G2 (Olfactory receptor OR...,MHFLSQNDLNINLIPHLCLHRHSVIAGAFTIHRHMKIFNSPSNSST...,9606
...,...,...,...,...,...
20391,Q8WVZ7,RNF133,E3 ubiquitin-protein ligase RNF133 (EC 2.3.2.2...,MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...,9606
20392,P05387,RPLP2,60S acidic ribosomal protein P2 (Large ribosom...,MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...,9606
20393,P51991,HNRNPA3,Heterogeneous nuclear ribonucleoprotein A3 (hn...,MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...,9606
20394,Q9BZX4,ROPN1B,Ropporin-1B (Rhophilin-associated protein 1B),MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...,9606


# PONDR order/disorder identification

In [7]:
#CHANGE HERE -- user settings for this run
algorithm = 'VSL2' # PONDR algorithm to use, one of: 'VLXT', 'XL1_XT', 'CAN_XT', 'VL3-BA', 'VSL2'
chromedriver_path = '/home/wenlin/d/custom_command/chromedriver' # NOTE: machine-specific absolute path; point this at your own chromedriver binary
#CHANGE HERE -- end of user settings

In [8]:
#pondr order/disorder identification
pondr_disorder_identification_path = Path('./output/pondr_disorder_identification') 

# Create the output directory before the run starts: the identification loop
# goes through every row of human_df, and a missing directory would otherwise
# only surface at to_pickle() below, after all of that work is done.
pondr_disorder_identification_path.mkdir(parents=True, exist_ok=True)

cutpondr = CutPONDR(driver_path = chromedriver_path,
                    show_progress_window = True)
try:
    od_human_df,failed_human_list = pondr_disorder(cutpondr,
                                                   input_df = human_df,
                                                   algorithm = algorithm)
finally:
    # Always close the CutPONDR object, even if the run raises or is interrupted
    # (presumably this shuts down the chromedriver session -- confirm in CutPONDR).
    cutpondr.close()

#save identified data for further use 
od_human_df.to_pickle(pondr_disorder_identification_path/'{}_od_human_df.pkl'.format(algorithm))
with open(pondr_disorder_identification_path/'{}_failed_human_list.pkl'.format(algorithm), 'wb') as f:
    pickle.dump(failed_human_list, f)

 :   0%|          | 0/20396 [00:00<?, ?it/s]

P62945 failed
Q9NRI7 failed
P18283 failed
P0DPI4 failed
O60613 failed
P63302 failed
Q9NZV5 failed
A0A0J9YWX3 failed
A0A0A0MT94 failed
A0A0A0MT78 failed
A0A0J9YXM7 failed
A0A0A0MTA7 failed
P59797 failed
Q9Y6D0 failed
Q9BVL4 failed
A0A075B6S0 failed
A0A075B6Y9 failed
Q8WWX9 failed
P34925 failed
A0A0J9YXG5 failed
A0A075B706 failed
A0A0A0MT87 failed
P49908 failed
A0A0J9YXA8 failed
A0A075B6Y3 failed
A0A0A0MTA4 failed
P62341 failed
Q99611 failed
Q9BQE4 failed
Q8IZQ5 failed
P0CJ69 failed
P22103 failed
Q9NNW7 failed
S4R3Y5 failed
A0A0C5B5G6 failed
P55073 failed
P0DOY5 failed
Q8IVG9 failed
P0DMP1 failed
P0CJ76 failed
P07203 failed
P22352 failed
P02728 failed
P01358 failed
P02729 failed
P0CJ73 failed
Q9C0D9 failed
A0A0C4DH62 failed
P59796 failed
P0CJ71 failed
P0CJ74 failed
P36969 failed
P0CJ70 failed
P0CJ68 failed
P0CJ75 failed
P49895 failed
P0CJ77 failed
S4R3P1 failed
P0CJ72 failed
P69208 failed
Q9NZV6 failed
A0A0A0MT89 failed
P0DPR3 failed
Q16881 failed
A0A0A0MT70 failed
A0A075B700 failed
A0A0

In [7]:
# The number of RBPs (1535) is slightly lower than in "Gerstberger, et al." (1542),
# mainly because of duplicate records; more information is shown below.
rbp_df

Unnamed: 0,uniprot_id,gene_name,protein_name,protein_sequence,taxonomy
0,Q96EY7,PTCD3,Pentatricopeptide repeat domain-containing pro...,MAVVSAVRWLGLRSRLGQPLTGRRAGLCEQARSCRFYSGSATLSKV...,9606
1,P50914,RPL14,60S ribosomal protein L14 (CAG-ISL 7) (Large r...,MVFRRFVEVGRVAYVSFGPHAGKLVAIVDVIDQNRALVDGPCTQVR...,9606
2,Q9NVV4,MTPAP,"Poly(A) RNA polymerase, mitochondrial (PAP) (E...",MAVPGVGLLTRLNLCARRRTRVQRPIVRLLSCPGTVAKDLRRDEQP...,9606
3,Q9UKM9,RALY,RNA-binding protein Raly (Autoantigen p542) (H...,MSLKLQASNVTNKNDPKSINSRVFIGNLNTALVKKSDVETIFSKYG...,9606
4,P62888,RPL30,60S ribosomal protein L30 (Large ribosomal sub...,MVAAKKTKKSLESINSRLQLVMKSGKYVLGYKQTLKMIRQGKAKLV...,9606
...,...,...,...,...,...
1530,P07814,EPRS1,Bifunctional glutamate/proline--tRNA ligase (B...,MATLSLTVNSGDPPLGALLAVEHVKDDVSISVEEGKENILHVSENV...,9606
1531,D3DX98,,"HCG2045905, isoform CRA_a (Fragment)",MCSTLKKCGTYRTEVAECHDHGSTFQGRKKGGSSFRDNFDKRSCHY...,9606
1532,Q2VIR3,EIF2S3B,Eukaryotic translation initiation factor 2 sub...,MAGGEAGVTLGQPHLSRQDLTTLDVTKLTPLSHEVISRQATINIGT...,9606
1533,Q96IC2,REXO5,RNA exonuclease 5 (EC 3.1.-.-) (Exonuclease NE...,MEPEREGTERHPRKVRESRQAPNKLVGAAEAMKAGWDLEESQPEAK...,9606


## diff list of rbp_df
|no.|  gene name from  <br>Gerstberger, et al.  | reason  |
|----|  :----  | :----  |
|1| PABPC1L2B | duplicate with **PABPC1L2A** |
|2| AARSD1    | duplicate with **PTGES3L-AARSD1** |
|3| FRG1B     | does not have uniprot_id |
|4| SMN2      | duplicate with **SMN1** |
|5| RPS17L    | duplicate with **RPS17**|
|6| RBMY1F    | duplicate with **RBMY1J**|
|7| RPL41     | sequence is too short |
|8| hCG_2045905| unreviewed from uniprot |
|9| VARS2| unreviewed from uniprot |