In [None]:
import os
import sys
import shutil
import requests
import subprocess
import numpy as np
import pandas as pd
from pathlib import Path 
from tqdm.notebook import tqdm
    
from function.seqfilter import SeqFilter
from function.utilities import seq_to_fasta
from function.utilities import get_uniprot_rawdata

In [None]:
# Load human sequences and order/disorder information.
# human_df must expose the 'uniprot_id' and 'protein_sequence' columns,
# because get_sequence() reads both of them.
human_df = get_uniprot_rawdata("./rawdata/human_uniprot.tab")
# od_human_df must expose the 'uniprot_id' and 'od_ident' columns (also read by
# get_sequence()). Presumably these are VSL2 order/disorder predictions, judging
# by the file name -- TODO confirm.
# NOTE(security): unpickling can execute arbitrary code; only load a pickle
# file that was produced by a trusted source.
od_human_df = pd.read_pickle("./rawdata/VSL2_od_human_df.pkl")
def get_sequence(uniprot_id,return_sequence):
    """Look up one per-protein string in the notebook-level data frames.

    Args:
        uniprot_id: value matched against the 'uniprot_id' column.
        return_sequence: selects what is returned:
            'od_ident' -> the 'od_ident' column of ``od_human_df``;
            'sequence' -> the 'protein_sequence' column of ``human_df``.

    Returns:
        The value of the selected column for the first row whose
        'uniprot_id' equals ``uniprot_id``.

    Raises:
        ValueError: ``return_sequence`` is not one of the two supported
            selectors (previously this silently returned ``None``).
        IndexError: no row matches ``uniprot_id``. The exception type is the
            same one the bare ``[0]`` lookup used to raise, so callers that
            catch it keep working; it now carries a readable message.
    """
    # Resolve the selector first so that a typo fails loudly before any lookup.
    if return_sequence == 'od_ident':
        source_df, column = od_human_df, 'od_ident'
    elif return_sequence == 'sequence':
        source_df, column = human_df, 'protein_sequence'
    else:
        raise ValueError(
            "return_sequence must be 'od_ident' or 'sequence', got {!r}".format(return_sequence)
        )

    matches = source_df[source_df['uniprot_id'] == uniprot_id][column].tolist()
    if not matches:
        raise IndexError(
            "no '{}' entry for uniprot_id {!r}".format(column, uniprot_id)
        )
    return matches[0]

In [None]:
class Polya():
    """Count high-propensity stretches inside the disordered regions of a sequence.

    Every residue carries a multiplicative weight taken from ``propensity_dict``.
    Walking along a fragment, the weights of consecutive non-zero residues are
    multiplied into a running score; a residue whose weight is 0 scores -0.1
    and resets the running score to 1. Positions whose score reaches a threshold
    form runs, and runs longer than a minimum length are counted.
    """

    def __init__(self,propensity_dict,seqfilter=None):
        """
        Args:
            propensity_dict: mapping from one-letter residue code to its weight.
                Residues missing from the mapping are treated as weight 0.
            seqfilter: optional object providing ``get_od_index(od_ident)``.
                When omitted, the module-level name ``seqfilter`` is looked up
                each time ``get_seq_prop`` is called (the original behaviour).
        """
        self.propensity_dict = propensity_dict
        self.seqfilter = seqfilter

    def get_seq_prop(self,sequence,od_ident, prop_score_threshold, prop_score_filter_length):
        """Return the number of qualifying runs over all disordered regions.

        Args:
            sequence: residue string; sliced as ``sequence[start:end]`` for
                every region reported by the filter.
            od_ident: order/disorder annotation handed to
                ``get_od_index``; its format is defined by that helper.
            prop_score_threshold: a position qualifies when its score is
                greater than or equal to this value.
            prop_score_filter_length: a run is counted only when its length is
                strictly greater than this value.

        Returns:
            int: total count of qualifying runs (0 when there are no regions).

        Side effects:
            Stores both thresholds on the instance as ``prop_score_threshold``
            and ``prop_score_filter_length``.
        """
        # Prefer the injected filter; otherwise fall back to the notebook global.
        od_filter = self.seqfilter if self.seqfilter is not None else seqfilter
        disorder_regions = od_filter.get_od_index(od_ident)['disorder_region']

        self.prop_score_threshold = prop_score_threshold
        self.prop_score_filter_length = prop_score_filter_length

        # Count the features of every disordered fragment and add them up.
        total_features = 0
        for region in disorder_regions:
            fragment = sequence[region['start']:region['end']]
            fragment_scores = self.__get_frag_seq_prop_list(fragment)
            total_features += self.__get_feature_num_frag_seq_prop_list(fragment_scores)
        return total_features

    def __get_frag_seq_prop_list(self,frag_sequence):
        """Return one score per residue of ``frag_sequence``.

        A zero-weight residue contributes -0.1 and resets the running product;
        any other residue contributes the running product rounded to three
        decimals (the un-rounded product is what keeps accumulating).
        """
        running_product = 1
        scores = []
        for residue in frag_sequence:
            # Residues absent from the table (e.g. 'U', 'X') used to raise
            # KeyError and abort the whole run; treat them as weight 0 instead.
            weight = self.propensity_dict.get(residue, 0)
            if weight == 0:
                scores.append(-0.1)
                running_product = 1
            else:
                running_product = running_product * weight
                scores.append(round(running_product, 3))
        return scores

    def __get_feature_num_frag_seq_prop_list(self,frag_seq_prop_list):
        """Count runs of consecutive scores >= threshold that are long enough.

        "Long enough" means strictly longer than
        ``self.prop_score_filter_length``.
        """
        # Mark qualifying positions with 'p' and everything else with a blank,
        # so that whitespace splitting yields exactly the maximal runs.
        marks = ''.join(
            'p' if score >= self.prop_score_threshold else ' '
            for score in frag_seq_prop_list
        )
        runs = marks.split()
        return sum(1 for run in runs if len(run) > self.prop_score_filter_length)

# Parameters

In [None]:
# Residue weights consumed by Polya: only A, L, M and Q carry a non-zero
# weight, every other listed residue is given weight 0.
propensity_dict = {'A': 1.2, 'L': 1, 'M': 1, 'Q': 0.8}
propensity_dict.update(dict.fromkeys('DEFGHIKCYNPTRSVW', 0))

In [None]:
#####CHANGE HERE#####

# Polyalanine parameters.
# prop_score_filter_length: Polya counts a run of above-threshold positions
# only when the run is strictly longer than this value.
# NOTE(review): this was annotated "no length filter", but because the
# comparison in Polya is a strict `>`, a value of 1 discards runs of length 1.
# Use 0 if every run should be counted -- confirm the intended setting.
prop_score_filter_length = 1 # runs must be longer than this to be counted
# polya_thresholds: score cut-offs; each one produces its own 'cond_<threshold>' column.
polya_thresholds = [1.5,2.0,2.5]

# Order/disorder length-filter parameters, forwarded to
# SeqFilter.length_filter_by_od_ident(od_ident, disorder_filter_length, order_filter_length).
# Presumably expressed in residues -- TODO confirm against SeqFilter.
order_filter_length = 10
disorder_filter_length = 40

#####CHANGE HERE#####

# Run polyalanine for all proteins

In [None]:
seqfilter = SeqFilter()
polya = Polya(propensity_dict=propensity_dict)

# One output row per protein, keyed by its UniProt id.
polya_df = pd.DataFrame(human_df['uniprot_id'])

# Results are gathered per column by row position and attached to the frame
# once at the end. Building a full boolean mask over the frame for every single
# assignment made the loop quadratic in the number of proteins.
# Proteins that are skipped keep NaN in every result column.
column_by_threshold = {threshold: 'cond_{}'.format(threshold) for threshold in polya_thresholds}
values_by_column = {
    column_name: [np.nan] * polya_df.shape[0]
    for column_name in column_by_threshold.values()
}

for position, uniprot_id in enumerate(tqdm(polya_df['uniprot_id'], total=polya_df.shape[0])):

    # get sequence
    sequence = get_sequence(uniprot_id,"sequence")

    # get disorder info
    # get_sequence raises IndexError when the id has no order/disorder entry;
    # only that case is skipped, any other error is a real problem and surfaces.
    try:
        od_ident = get_sequence(uniprot_id,"od_ident")
    except IndexError:
        print("{}, no pondr order/disorder info".format(uniprot_id))
        continue

    od_ident = seqfilter.length_filter_by_od_ident(od_ident,disorder_filter_length,order_filter_length)

    for threshold, column_name in column_by_threshold.items():
        values_by_column[column_name][position] = polya.get_seq_prop(
            sequence=sequence,
            od_ident=od_ident,
            prop_score_threshold=threshold,
            prop_score_filter_length=prop_score_filter_length,
        )

# Attach the result columns as float, matching the NaN-initialised columns the
# per-row assignment used to create.
for column_name, values in values_by_column.items():
    polya_df[column_name] = np.asarray(values, dtype=float)

In [None]:
#save output to ./output
# polya_df.to_pickle("./output/polya.pkl")