In [1]:
import wget
import os.path as osp

# Check if disprot data is already downloaded:
if not osp.exists('disprot.tsv'):
    wget.download("https://disprot.org/api/search?release=2022_03&show_ambiguous=true&show_obsolete=false&format=tsv&namespace=all&get_consensus=false", "disprot.tsv")

In [23]:
import pandas as pd
from aaindex.aaindex import aaindex

disprot = pd.read_csv('disprot.tsv', sep='\t', header=0)
# Drop duplicates:
disprot = disprot.drop_duplicates(subset=['acc', 'start', 'end'])
disprot

Unnamed: 0,acc,name,organism,ncbi_taxon_id,disprot_id,region_id,start,end,term_namespace,term,ec,reference,region_sequence,confidence,obsolete
0,P03265,DNA-binding protein,Human adenovirus C serotype 5,28285,DP00003,DP00003r002,294,334,Structural state,IDPO:00076,ECO:0006220,pmid:8632448,EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT,,
1,P03265,DNA-binding protein,Human adenovirus C serotype 5,28285,DP00003,DP00003r004,454,464,Structural state,IDPO:00076,ECO:0006220,pmid:8632448,VYRNSRAQGGG,,
2,P49913,Cathelicidin antimicrobial peptide,Homo sapiens,9606,DP00004,DP00004r001,134,170,Structural state,IDPO:00076,ECO:0006206,pmid:9452503,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,,
5,P03045,Antitermination protein N,Escherichia phage lambda,10710,DP00005,DP00005r001,1,107,Structural state,IDPO:00076,ECO:0006165,pmid:9659923,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,,
14,P03045,Antitermination protein N,Escherichia phage lambda,10710,DP00005,DP00005r012,1,22,Structural transition,IDPO:00050,ECO:0006165,pmid:9659923,MDAQTRRRERRAEKQAQWKAAN,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10007,A0A2Z5UJ33,Nucleoprotein,Influenza A virus,382835,DP03573,DP03573r005,73,91,Structural state,IDPO:00076,ECO:0006220,pmid:17151603,ERRNKYLEEHPSAGKDPKK,,
10008,A0A2Z5UJ33,Nucleoprotein,Influenza A virus,382835,DP03573,DP03573r006,203,212,Structural state,IDPO:00076,ECO:0006220,pmid:17151603,DRNFWRGENG,,
10010,P03496,Non-structural protein 1,Influenza A virus (strain A/Puerto Rico/8/1934...,211044,DP03575,DP03575r001,204,230,Structural state,IDPO:00076,ECO:0006220,pmid:21464929,RSSNENGRPPLTPKQKREMAGTIRSEV,,
10011,P03496,Non-structural protein 1,Influenza A virus (strain A/Puerto Rico/8/1934...,211044,DP03575,DP03575r002,73,82,Structural state,IDPO:00076,ECO:0006220,pmid:20133840,SDEALKMTMA,,


In [24]:
import requests
import time

def full_seq_from_uniprot(uniprot_id):
    url = 'https://www.uniprot.org/uniprot/' + uniprot_id + '.fasta'
    fasta = requests.get(url).text
    #time.sleep(1)
    return "".join(fasta.split('\n')[1:])

asc2seq = {}
for acc in disprot['acc'].unique():
    asc2seq[acc] = full_seq_from_uniprot(acc)

len(asc2seq)

KeyboardInterrupt: 

In [25]:
def get_non_disordered_data(seq, regions):
    for region in sorted(regions, key=lambda x: x[0], reverse=True):
        seq = seq[:region[0]] + seq[region[1]:]
    return seq

asc2non_disordered_seq = {}
for acc in asc2seq:
    start_pos = disprot.loc[disprot['acc'] == acc, 'start'].values
    end_pos = disprot.loc[disprot['acc'] == acc, 'end'].values
    asc2non_disordered_seq[acc] = get_non_disordered_data(asc2seq[acc], list(zip(start_pos, end_pos)))

len(asc2non_disordered_seq)

1898

In [26]:
import numpy as np

aa_index_feats = aaindex.record_codes()

def get_avg_feats_per_sequence(seq):
    features = np.zeros(len(aa_index_feats))
    for i, feat in enumerate(aa_index_feats):
        feat_vals = aaindex[feat]['values']
        features[i] = np.average(np.array([feat_vals[aa] for aa in seq]))
    return features

prepared_data = []
for asc, ordered_seq in asc2non_disordered_seq.items():
    prepared_data.append([asc, 0, disprot.loc[disprot['acc'] == asc, 'ncbi_taxon_id'].values[0], len(ordered_seq)] + list(get_avg_feats_per_sequence(ordered_seq)))

for i, row in disprot.iterrows():
    prepared_data.append([row['acc'], 1, row['ncbi_taxon_id'], len(row['region_sequence'])] + list(get_avg_feats_per_sequence(row['region_sequence'])))

prepared_data = pd.DataFrame(prepared_data, columns=['asc', 'is_disordered', 'taxon', 'seq_length']+[f'{feat}_avg' for feat in aa_index_feats])

prepared_data.head()

KeyError: 'X'

In [None]:
prepared_data.to_csv('basic_aa_features.csv', index=False)

In [None]:
# TODO: Run linear models to see if any of the features are important
# Then run a linear model on the important ones to see how predictive that is overall