In [2]:
import json
import logging
from pathlib import Path
import pandas as pd
import xmlschema

In [11]:
# Build the schema object from the UniParc XSD at this URL; `xs` is used further
# down to decode a per-protein XML file into a Python dict.
# NOTE(review): presumably needs network access whenever this cell is re-run — confirm.
xs = xmlschema.XMLSchema('https://www.uniprot.org/docs/uniparc.xsd')

In [5]:
# Load the tab-separated table of protein IDs and their UniParc checksums
# (columns `original_protein_id` and `uniparc_checksum` are read further down),
# then preview the first rows.
checksums_tsv = '../../intermediates/refseq_20180629/ebi_api_missing/refseq_checksums.tsv'
df = pd.read_csv(checksums_tsv, sep='\t')
df.head()

Unnamed: 0,original_protein_id,uniparc_checksum
0,NP_000330.2,B6044AA3206A94E0
1,NP_001006933.2,6AFE830C9FC909A2
2,NP_001008738.2,6DF41100700F9C40
3,NP_001012415.2,0DBA3BDAB8DB71B8
4,NP_001013666.2,F10186546D4527EB


In [None]:
# Skeleton of the batch loop over every row of `df`.
# NOTE(review): presumably `records` is meant to collect one result dict per row,
# but in this cell the loop body only builds the XML path — nothing is parsed or
# appended yet. The cells below prototype the per-row steps on a single row.
records = []

for row in df.itertuples(index=False):
    # XML file named after this row's original protein ID.
    xml_pth = f'../../intermediates/refseq_20180629/ebi_api_missing/xmls/{row.original_protein_id}.xml'

In [41]:
# Take the first row of `df` (as a namedtuple, index excluded) to prototype on.
row = next(df.itertuples(index=False))

In [42]:
# XML file for the sample row, named after its original protein ID.
xml_pth = f'../../intermediates/refseq_20180629/ebi_api_missing/xmls/{row.original_protein_id}.xml'

In [44]:
# Decode the XML file into a dict using the schema. With validation='lax' the
# call yields a (data, errors) pair; `errors` holds the validation problems found.
# NOTE(review): `errors` is not inspected by any later cell shown here — confirm
# that ignoring them is intended.
data, errors = xs.to_dict(xml_pth, validation='lax')

[XMLSchemaChildrenValidationError(reason="Unexpected child with tag '{http://uniprot.org/uniparc}signatureSequenceMatch' at position 6.")]

In [45]:
# Warn when the decoded XML holds more than one UniParc entry, because only the
# first entry (data['entry'][0]) is used afterwards.
# `logger` and `prot_id` are bound here so this cell runs on a fresh kernel
# without relying on names left over from other cells.
logger = logging.getLogger(__name__)
prot_id = row.original_protein_id
if len(data['entry']) > 1:
    ids = [x['accession'] for x in data['entry']]
    # Lazy %-formatting: the message is only built if the warning is emitted.
    logger.warning('%s got multiple uniparc entries: %s', prot_id, " ".join(ids))

In [46]:
# Work with the first entry only; any further entries are ignored from here on.
entry = data['entry'][0]

In [47]:
# The entry's accession is kept as the UniParc identifier for the output record.
uniparc_id = entry['accession']

In [62]:
# Checksum attribute of the entry's sequence, shown for comparison with the
# `uniparc_checksum` column loaded into `df`.
entry['sequence']['@checksum']

'B6044AA3206A94E0'

In [48]:
# Inspect the cross-references attached to the entry (one dict per reference,
# with '@type', '@id' and version attributes).
entry['dbReference']

[{'@type': 'UniProtKB/Swiss-Prot protein isoforms',
  '@id': 'P55017-2',
  '@version_i': 1,
  '@active': 'N',
  '@created': '2009-02-10',
  '@last': '2010-11-02',
  'property': [{'@type': 'NCBI_taxonomy_id', '@value': '9606'}]},
 {'@type': 'RefSeq',
  '@id': 'NP_000330',
  '@version_i': 2,
  '@active': 'N',
  '@version': 2,
  '@created': '2008-04-29',
  '@last': '2019-06-02',
  'property': [{'@type': 'NCBI_GI', '@value': '186910315'},
   {'@type': 'NCBI_taxonomy_id', '@value': '9606'},
   {'@type': 'protein_name',
    '@value': 'solute carrier family 12 member 3 isoform 1'},
   {'@type': 'gene_name', '@value': 'SLC12A3'}]},
 {'@type': 'IPI',
  '@id': 'IPI00646907',
  '@version_i': 2,
  '@active': 'N',
  '@version': 2,
  '@created': '2009-02-12',
  '@last': '2010-11-05',
  'property': [{'@type': 'NCBI_taxonomy_id', '@value': '9606'}]}]

In [54]:
# Group the entry's cross-reference IDs by source database.
# For each database prefix, every dbReference whose '@type' starts with that
# prefix contributes "<id>.<version>". The result maps the prefix to a
# ';'-joined string of IDs, or None when there is no match.
ref_ids = {}
for db_type in ["Ensembl", "UniProt", "RefSeq"]:
    ids = set()
    for d in entry['dbReference']:
        if not d["@type"].startswith(db_type):
            continue
        # UniProt references take their version from '@version_i'; the other
        # databases use '@version'.
        version_key = '@version_i' if db_type == 'UniProt' else '@version'
        # Not every dbReference carries the version attribute, so fall back to
        # the bare ID instead of raising KeyError.
        version = d.get(version_key)
        id_str = d['@id'] if version is None else f"{d['@id']}.{version}"
        ids.add(id_str)
    # Sort before joining: set iteration order for strings can differ between
    # kernel sessions, which would make the joined value non-reproducible.
    ref_ids[db_type] = ";".join(sorted(ids)) or None

In [55]:
# Show the collected IDs per database.
ref_ids

{'Ensembl': None, 'UniProt': 'P55017-2.1', 'RefSeq': 'NP_000330.2'}

In [60]:
# Assemble the output row for the sample protein: its original ID and checksum
# from `df`, the UniParc accession, and the per-database ID strings collected in
# `ref_ids`. The bare `record` on the last line displays the result.
record = dict(
    original_prot_id=row.original_protein_id,
    uniparc_id=uniparc_id,
    uniparc_checksum=row.uniparc_checksum,
    ensembl_prot_ids=ref_ids["Ensembl"],
    uniprot_ids=ref_ids["UniProt"],
    refseq_prot_ids=ref_ids["RefSeq"],
)
record

{'original_prot_id': 'NP_000330.2',
 'uniparc_id': 'UPI000173ABE6',
 'uniparc_checksum': 'B6044AA3206A94E0',
 'ensembl_prot_ids': None,
 'uniprot_ids': 'P55017-2.1',
 'refseq_prot_ids': 'NP_000330.2'}