In [1]:
import pandas as pd
import numpy as np
import requests as rq
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import json

In [2]:
# Read the tab-separated epitope table into `df` and show it as the cell output.
df = pd.read_csv(
    filepath_or_buffer='/home/user503/Downloads/Telegram Desktop/tab_for_seq.txt',
    delimiter='\t',
)
df

Unnamed: 0,Parent,Antigen,Epitope(s),их кол-во,PMID
0,Severe acute respiratory syndrome-related coro...,Spike glycoprotein precursor,"FRAILTAFSPAQDIW,PDDFMGCVLAWNTRN,SVLNDILSR,ELCD...",277,"17565177,10706708,8943576,17239816,11433380,16..."
1,Hepacivirus C,Genome polyprotein,"ILAGYGAGV,YPYRLWHY,GCAWYELTPAETTVRLRAYMNTPGLPV...",193,"9002445,31154438,7604034,9446793,9438205,23555..."
2,Homo sapiens,Elongation factor 2,"GGGQIIPTAR,AEMYVAKF + OX(M3),GLHGWAFTL,ETVSEES...",264,"http://www.iedb.org/assay/11761195,http://www...."
3,Homo sapiens,Probable ATP-dependent RNA helicase DDX5,"IATPGRLIDF,STCIYGGAPK + METH(C3),YRRSKEITV,QEH...",139,"http://www.iedb.org/assay/11788448,http://www...."
4,Homo sapiens,Catenin beta-1,"GLQKMVALL + OX(M5),LTDPSQRL,GMQIPSTQF + OX(M2)...",133,"http://www.iedb.org/assay/11457595,http://www...."
...,...,...,...,...,...
293,Homo sapiens,Protein mono-ADP-ribosyltransferase PARP14,"AELIISEVF,TLQEVHFLL,YEAPRCVY + METH(C6),QLSPRL...",117,"http://www.iedb.org/assay/12040128,http://www...."
294,Homo sapiens,eIF-2-alpha kinase activator GCN1,"HVPGFCLPK + METH(C6),EVPKKLTEW,KVDPLFTEL,ILPEI...",117,"29557506,http://www.iedb.org/assay/12122083,ht..."
295,Homo sapiens,Nucleobindin-2,"KVQNIHPVESAK,DKTKVQNIHPVESAKIEPPD,NIHPVESAKIEP...",159,"http://www.iedb.org/assay/11721113,http://www...."
296,Homo sapiens,Clusterin,"DIHFHSPAFQHPPTE,IEKTNEERKTL,SGSGLVGR,EDQYYLRVT...",136,"http://www.iedb.org/assay/11928756,http://www...."


In [3]:
# Smallest number of comma-separated entries found in any cell of the PMID column.
(
    df.PMID
    .str.split(pat=',')
    .str.len()
    .min()
)

1

In [4]:
# Smallest number of comma-separated entries found in any cell of the Epitope(s) column.
(
    df.loc[:, 'Epitope(s)']
    .str.split(pat=',')
    .str.len()
    .min()
)

101

In [5]:
# Show every row whose (Antigen, Parent) pair occurs more than once, ordered by Antigen.
(
    df.loc[df.duplicated(subset=['Antigen', 'Parent'], keep=False)]
    .sort_values(by='Antigen')
)

Unnamed: 0,Parent,Antigen,Epitope(s),их кол-во,PMID


In [6]:
# Distinct values of the Parent column, in order of first appearance, as a plain list.
list(df['Parent'].unique())

['Severe acute respiratory syndrome-related coronavirus',
 'Hepacivirus C',
 'Homo sapiens',
 'Influenza A virus',
 'Mycobacterium tuberculosis',
 'Hepatitis B virus',
 'Measles morbillivirus',
 'Vaccinia virus',
 'Lymphocytic choriomeningitis mammarenavirus',
 'Argentinian mammarenavirus',
 'Lassa mammarenavirus',
 'Brazilian mammarenavirus',
 'Whitewater Arroyo mammarenavirus',
 'Yellow fever virus',
 'Machupo mammarenavirus',
 'West Nile virus',
 'Guanarito mammarenavirus',
 'Zaire ebolavirus',
 'Sudan ebolavirus',
 'Variola virus',
 'Human orthopneumovirus',
 'Phleum pratense',
 'Bordetella pertussis']

In [7]:
from ncbi.datasets.openapi.api.taxonomy_api import TaxonomyApi
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient

In [8]:
# Resolve each distinct name in df.Parent to a taxonomy id through the NCBI Datasets
# taxonomy name query and store it in df['tax_id'].
#
# The column is (re)created up front so that the cell is idempotent on re-run and so that
# the final `df.tax_id.unique()` works even when no name could be resolved.
df['tax_id'] = pd.Series(np.nan, index=df.index, dtype=object)

with DatasetsApiClient() as api_client:
    api = TaxonomyApi(api_client)
    for i in tqdm(df.Parent.unique()):
        # The query may come back without any candidate; treat that as an empty list
        # instead of indexing [0] blindly.
        candidates = api.tax_name_query(i).sci_name_and_ids or []
        # Accept only a candidate whose scientific name equals the queried name exactly;
        # scan all candidates rather than just the first one.
        exact = next((hit for hit in candidates if hit.sci_name == i), None)
        if exact is not None:
            df.loc[df.Parent == i, 'tax_id'] = exact.tax_id
        elif candidates:
            # No exact match: report the first suggestion next to the queried name.
            print(candidates[0].sci_name, '\t', i, '\n')
        else:
            print('<no taxonomy match>', '\t', i, '\n')
df.tax_id.unique()

  0%|          | 0/23 [00:00<?, ?it/s]

array(['694009', '11103', '9606', '11320', '1773', '10407', '11234',
       '10245', '11623', '2169991', '11620', '2169992', '46919', '11089',
       '11628', '11082', '45219', '186538', '186540', '10255', '11250',
       '15957', '520'], dtype=object)

In [9]:
def _token_after_marker(page: str, marker: str, window: int, strip_chars: str) -> str:
    """Return the first space-delimited token that follows ``marker`` in ``page``.

    Only the first ``window`` characters of the text after the first occurrence of
    ``marker`` are inspected; characters listed in ``strip_chars`` are removed from both
    ends of the token. Raises ``IndexError`` when ``marker`` does not occur in ``page``.
    """
    return page.split(marker)[1][:window].split(' ')[0].strip(strip_chars)


def retrieve_uniprot_id_from_iedb(url: str, timeout: float = 30.0):
    """Scrape the UniProt identifier referenced by an IEDB page.

    The page at ``url`` is downloaded and the token following the text ``IEDB_epitope``
    is taken as the epitope id. The page ``https://www.iedb.org/epitope/<epitope id>`` is
    then downloaded and the token following the text ``UniProt`` is returned.

    Parameters
    ----------
    url : str
        Address of the IEDB page to start from.
    timeout : float, default 30.0
        Seconds to wait for each HTTP request. Without a timeout a single stalled
        connection would block the caller indefinitely.

    Returns
    -------
    str
        The scraped token (may be empty if nothing usable follows the marker).

    Raises
    ------
    IndexError
        If either page does not contain its expected marker text.
    """
    assay_page = rq.get(url, timeout=timeout).text
    epitope_id = _token_after_marker(assay_page, 'IEDB_epitope', 20, ':"},')
    epitope_page = rq.get(f'https://www.iedb.org/epitope/{epitope_id}', timeout=timeout).text
    return _token_after_marker(epitope_page, 'UniProt', 10, ':)')

In [10]:
def get_uniprot_entry(acess: str, timeout: float = 30.0):
    """Fetch one entry from the EBI Proteins API and return the decoded JSON body.

    Parameters
    ----------
    acess : str
        Identifier appended to ``https://www.ebi.ac.uk/proteins/api/proteins/``.
    timeout : float, default 30.0
        Seconds to wait for the HTTP request. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    The object produced by decoding the response body as JSON. The HTTP status is not
    checked, so an error response is returned as-is if it is valid JSON.
    """
    requestURL = f'https://www.ebi.ac.uk/proteins/api/proteins/{acess}'
    response = rq.get(requestURL, headers={"Accept": "application/json"}, timeout=timeout)
    return response.json()

In [11]:
# Collect, for every IEDB link found in the PMID column, the scraped UniProt id and the
# sequence length reported for it.
#
# One comma-separated PMID cell is exploded into one element per reference; only the
# elements containing 'http' are kept (na=False drops missing cells instead of failing).
iedb = df.PMID.str.split(',').explode()
iedb = iedb[iedb.str.contains('http', na=False)]

# After explode() the index repeats the originating df row label once per reference, so
# writing results with `uniprot_df.loc[i] = ...` would keep only the last link of each row.
# Records are therefore accumulated in lists and the frame is built once at the end.
uniprot_rows = []           # [url, id, len] per resolved link
uniprot_index = []          # df row label each record came from (labels may repeat)
length_by_accession = {}    # avoids querying the same id again for every link
failed_urls = []            # links whose lookup failed for a reason other than a missing marker

try:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for i, url in tqdm(list(iedb.items())):
        try:
            uni_id = retrieve_uniprot_id_from_iedb(url)
            if uni_id not in length_by_accession:
                length_by_accession[uni_id] = get_uniprot_entry(uni_id)['sequence']['length']
        except IndexError:
            # Expected marker text not present on the page: nothing to record.
            continue
        except (KeyError, TypeError, ValueError, rq.exceptions.RequestException):
            # Response without a 'sequence' entry, undecodable body, or network failure:
            # remember the link and keep going instead of aborting the whole run.
            failed_urls.append(url)
            continue
        uniprot_rows.append([url, uni_id, length_by_accession[uni_id]])
        uniprot_index.append(i)
finally:
    # Built in `finally` so that results gathered so far survive an interrupted run.
    uniprot_df = pd.DataFrame(uniprot_rows, columns=['url', 'id', 'len'], index=uniprot_index)

uniprot_df

  0%|          | 0/43857 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Write the collected table, index included, to a tab-separated file in the working directory.
uniprot_df.to_csv(
    path_or_buf='iedb_uniprot.tsv',
    sep='\t',
)

In [None]:
# For every row, call get_seq_len with the antigen name and tax id; when that result is
# empty, call it again with False as third argument. Print the size of the final result.
for i, j in df.iterrows():
    antigen, tax_id = j.Antigen, j.tax_id
    r = get_seq_len(antigen, tax_id)
    if len(r) == 0:
        r = get_seq_len(antigen, tax_id, False)
    print(j.Parent, j.Antigen, '---------', len(r))

In [None]:
# All values of the Antigen column as a plain Python list.
list(df['Antigen'])

In [None]:
# Spot-check get_seq_len for a single antigen name and show the size of what it returns.
# NOTE(review): the taxon is passed as an int here, while the tax_id values shown earlier
# in this notebook are strings — confirm get_seq_len accepts both.
r = get_seq_len('Spectrin alpha chain, non-erythrocytic 1', 9606)
len(r)

In [None]:
# Gather the ['sequence']['length'] value of every element of responseBody.
tmp_len = [entry['sequence']['length'] for entry in responseBody]

In [None]:
# Display the collected sequence lengths.
tmp_len

In [None]:
# Split one row's PMID cell into individual references and keep the purely numeric ones.
# Selecting with a list of labels keeps the result a Series, so the .str accessor is
# available; df.loc[297].PMID would be a plain Python string, which has no .str attribute.
# NOTE(review): the frame displayed after loading ends at row label 296 — confirm that
# label 297 exists.
tmp = df.loc[[297], 'PMID'].str.split(',').explode()
tmp[tmp.str.isdigit()]

In [None]:
# Flag the rows whose entire PMID cell consists only of digits.
# The accessor method must be called; the bare attribute only displays the bound method.
df.PMID.str.isdigit()