In [1]:
import pandas as pd
from pysam import FastaFile
import aiohttp
import json
import logging

In [2]:
logger = logging.getLogger(__name__)

In [3]:
refseq_protein_fa = FastaFile('/Users/liang/Box/MyCPTAC/CPTAC_data_collection_v1/DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz')

In [4]:
refseq_protein_fa.nreferences

41734

In [5]:
refseq_df = pd.read_table('../custom_sources/cptac3_refseq_20180629/refseq_20180629_human_only_unique_loci.tsv.gz')
refseq_df.head(10)

Unnamed: 0,seqnames,start,end,width,strand,symbol,refseq_tx_id,refseq_prot_id,hgnc_id,entrez_id,ensembl_gene_id,multi_genomic_loci,num_exons,aa_len,tx_len,cds_len
0,chr1,69091,70008,918,+,OR4F5,NM_001005484,NP_001005484.1,HGNC:14825,79501,ENSG00000186092,False,1,305,918,918
1,chr1,450740,451678,939,-,OR4F29,NM_001005221,NP_001005221.2,HGNC:31275,729759,ENSG00000284733,True,1,312,939,939
2,chr1,925741,944581,18841,+,SAMD11,NM_152486,NP_689699.2,HGNC:28706,148398,ENSG00000187634,False,14,681,2554,2046
3,chr1,944203,959299,15097,-,NOC2L,NM_015658,NP_056473.2,HGNC:24517,26155,ENSG00000188976,False,19,749,2800,2250
4,chr1,960587,965719,5133,+,KLHL17,NM_198317,NP_938073.1,HGNC:24023,339451,ENSG00000187961,False,12,642,2564,1929
5,chr1,966497,975104,8608,+,PLEKHN1,NM_001160184,NP_001153656.1,HGNC:25284,84069,ENSG00000187583,False,15,576,2295,1731
6,chr1,966497,975104,8608,+,PLEKHN1,NM_032129,NP_115505.2,HGNC:25284,84069,ENSG00000187583,False,16,611,2400,1836
7,chr1,975199,982117,6919,-,PERM1,NM_001291366,NP_001278295.1,HGNC:28208,84808,ENSG00000187642,False,4,790,3417,2373
8,chr1,975199,982117,6919,-,PERM1,NM_001291367,NP_001278296.1,HGNC:28208,84808,ENSG00000187642,False,5,696,3064,2091
9,chr1,998962,1000172,1211,-,HES4,NM_021170,NP_066993.1,HGNC:24149,57801,ENSG00000188290,False,4,221,962,666


In [16]:
len(set(refseq_df.refseq_prot_id) & set(refseq_protein_fa.references))

41470

In [6]:
conn = aiohttp.TCPConnector(limit_per_host=10)
session = aiohttp.ClientSession(
    connector=conn,
    headers={
        'Accept': 'application/json',
        'Content-type': 'application/json',
    }
)

In [7]:
async def retry_post(session, max_retry=3, **kwargs):
    for retry in range(max_retry):
        try:
            resp = await session.post(**kwargs)
            resp.raise_for_status()
            return resp
        except aiohttp.ClientResponseError as e:
            logger.error(f'Request failed with response {e.status} {e.message} [retry={retry}]')
            logger.exception(e)
            if retry == max_retry - 1:
                raise ValueError('Reach maximal retries') from e

In [8]:
async def query_uniparc(prot_id, prot_seq):
    try:
        resp = await retry_post(
            session,
            url='https://www.ebi.ac.uk/proteins/api/uniparc/sequence',
            params={
                # 'rfActive': 'true',
                'rfTaxId': '9606',
            },
            json={'sequence': prot_seq}
        )
    except ValueError:
        logger.error(f'UniParc query of {prot_id} failed after maximal retries')
        return None
    t = await resp.text()
    # Write the raw JSON response to external file

    j = await resp.json()
    uniparc_id = j['accession']
    uniparc_checksum = j['sequence']['checksum']
    dbrefs = [d for d in j['dbReference'] if d['type'] not in ['JPO', 'EPO', 'VEGA', 'EMBL']]
    ref_ids = {}
    for db_type in ['Ensembl', 'UniProt', 'RefSeq']:
        ids = set(
            f"{d['id']}.{d['versionI']}" for d in dbrefs
            if d['type'].startswith(db_type)
        )
        ref_ids[db_type] = ";".join(ids) or None

    return {
        'original_prot_id': prot_id,
        'uniparc_id': uniparc_id,
        'uniparc_checksum': uniparc_checksum,
        'ensembl_prot_ids': ref_ids['Ensembl'],
        'uniprot_ids': ref_ids['UniProt'],
        'refseq_prot_ids': ref_ids['RefSeq'],
    }

In [9]:
prot_id = 'NP_001278296.1'
prot_seq = refseq_protein_fa.fetch(prot_id)

In [10]:
await query_uniparc(prot_id, prot_seq)

{'original_prot_id': 'NP_001278296.1',
 'uniparc_id': 'UPI0000418FB0',
 'uniparc_checksum': 'A90060DB855C2EAD',
 'ensembl_prot_ids': 'ENSP00000414022.1',
 'uniprot_ids': 'Q5SV97.3;Q5SV97-1.1',
 'refseq_prot_ids': 'NP_001278296.1'}

In [11]:
records = []
for prot_id in refseq_df.head(10)['refseq_prot_id']:
    prot_seq = refseq_protein_fa.fetch(prot_id)
    records.append(await query_uniparc(prot_id, prot_seq))

In [12]:
result_df = pd.DataFrame.from_records(records)
result_df

Unnamed: 0,original_prot_id,uniparc_id,uniparc_checksum,ensembl_prot_ids,uniprot_ids,refseq_prot_ids
0,NP_001005484.1,UPI0000041BC1,70E99D068AB2DD59,ENSP00000317482.1;ENSP00000334393.3;ENSP000004...,Q8NH21.1,NP_001005484.1
1,NP_001005221.2,UPI0000041D3C,08285B196DC638FB,ENSP00000329982.1;ENSP00000394282.1;ENSP000004...,Q6IEY1.1;A0A126GV92.1,XP_024305755.1;XP_016857900.1;NP_001005221.2;N...
2,NP_689699.2,UPI00005C3DB1,A9C65DE8AD561C63,,,NP_689699.2
3,NP_056473.2,UPI000013C171,825CD88441B5ACFF,,Q9Y3T9.3,NP_056473.2
4,NP_938073.1,UPI00001DFBF0,FE37BBCCD32131BF,ENSP00000343930.3,Q6TDP4.1,NP_938073.1
5,NP_001153656.1,UPI00005764FF,323FF81C7963CB97,ENSP00000368717.1,Q494U1-3.1,NP_001153656.1
6,NP_115505.2,UPI00001416D8,4163BA09DF9525C9,ENSP00000368720.1,Q494U1-1.2;Q494U1.3;Q494U1-2.1,NP_115505.2
7,NP_001278295.1,UPI0003E30FA7,EE1E33651D99CEA3,ENSP00000414022.2,Q5SV97-1.2;Q5SV97.4,NP_001356826.1;XP_011540609.1;NP_001278295.1;X...
8,NP_001278296.1,UPI0000418FB0,A90060DB855C2EAD,ENSP00000414022.1,Q5SV97.3;Q5SV97-1.1,NP_001278296.1
9,NP_066993.1,UPI000006EC19,ADFAE485C36EEBC4,ENSP00000304595.3,Q9HCC6.1,NP_066993.1


In [13]:
records[1]

{'original_prot_id': 'NP_001005221.2',
 'uniparc_id': 'UPI0000041D3C',
 'uniparc_checksum': '08285B196DC638FB',
 'ensembl_prot_ids': 'ENSP00000329982.1;ENSP00000394282.1;ENSP00000409316.1',
 'uniprot_ids': 'Q6IEY1.1;A0A126GV92.1',
 'refseq_prot_ids': 'XP_024305755.1;XP_016857900.1;NP_001005221.2;NP_001005277.1;XP_016857899.1;XP_024305761.1;XP_016857897.1;XP_016857898.1;XP_016864827.1;XP_024305760.1;NP_001005224.1'}