In [31]:
import pandas as pd
import numpy as np
import scipy
#import stats
import os
import sys
from Bio import Entrez
import json


In [11]:
# Identify ourselves to NCBI before any Entrez request is issued.
Entrez.email = "aglucaci@gmail.com"

# Relative path to the ortholog table: one directory up, inside data/.
data = os.path.join("..", "data", "Rem2_orthologs.csv")

print("Reading ...", data)

Reading ... ../data/Rem2_orthologs.csv


In [3]:
# Load the ortholog CSV located at `data` into a DataFrame.
df = pd.read_csv(data)

In [4]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Gene ID,Gene symbol,Description,Scientific name,Common name,RefSeq Transcript accessions,RefSeq Protein accessions,SPARCLE ID,Curation Status,RefseqSelect,Default sequence
0,64626,Rem2,RRAD and GEM like GTPase 2,Rattus norvegicus,Norway rat,NM_022685.2,NP_073176.2,10134947.0,curated,,default-seq
1,140743,Rem2,rad and gem related GTP binding protein 2,Mus musculus,house mouse,NM_080726.3,NP_542764.2,10134947.0,curated,RefseqSelect,default-seq
2,161253,REM2,RRAD and GEM like GTPase 2,Homo sapiens,human,NM_173527.3,NP_775798.2,10134947.0,curated,RefseqSelect,default-seq
3,467399,REM2,RRAD and GEM like GTPase 2,Pan troglodytes,chimpanzee,XM_009427474.3,XP_009425749.1,10134947.0,model,,default-seq
4,490603,REM2,RRAD and GEM like GTPase 2,Canis lupus familiaris,dog,XM_005623217.4,XP_005623274.2,,model,,default-seq
...,...,...,...,...,...,...,...,...,...,...,...
181,126958288,LOC126958288,RRAD and GEM like GTPase 2,Macaca thibetana thibetana,,XM_050796554.1,XP_050652511.1,,model,,default-seq
182,127187235,LOC127187235,RRAD and GEM like GTPase 2,Acomys russatus,golden spiny mouse,XM_051143407.1,XP_050999364.1,,model,,default-seq
183,127230422,LOC127230422,RRAD and GEM like GTPase 2,Phodopus roborovskii,desert hamster,XM_051195738.1,XP_051051695.1,,model,,default-seq
184,127551038,LOC127551038,RRAD and GEM like GTPase 2,Antechinus flavipes,yellow-footed antechinus,XM_051980438.1,XP_051836398.1,,model,,default-seq


In [19]:
def retrieve_annotation(id_list):

    """Retrieve NCBI protein document summaries for a batch of identifiers.

    The identifiers are submitted to NCBI with Bio.Entrez ``epost`` and the
    summaries are then pulled with ``esummary`` using the ``WebEnv`` and
    ``QueryKey`` values returned by the post.

    Parameters:
        id_list: iterable of protein identifiers (for example
            ``["NP_775798.2"]``). Items are converted with ``str``; a single
            bare string is treated as one identifier instead of being split
            into characters.

    Returns:
        The parsed ``esummary`` result (one entry per retrieved record).
        An empty list is returned, without contacting NCBI, when no
        identifiers are supplied.

    Raises:
        RuntimeError: if parsing the ``epost`` response fails with a
            RuntimeError (the original FIXME mentions invalid IDs as a cause).
    """

    # Normalise the input: a bare string would otherwise be joined character
    # by character, and non-string IDs would make str.join fail.
    if isinstance(id_list, str):
        id_list = [id_list]
    ids = [str(identifier) for identifier in id_list]

    # Nothing to look up: skip the network round trip entirely.
    if not ids:
        return []

    request = Entrez.epost("protein", id=",".join(ids))
    try:
        result = Entrez.read(request)
    except RuntimeError as e:
        # Propagate a regular exception rather than calling sys.exit(), so a
        # caller can catch it and decide how to handle bad identifiers.
        raise RuntimeError(
            "An error occurred while retrieving the annotations: %s" % e
        ) from e
    finally:
        # Always release the connection, whether or not parsing succeeded.
        request.close()
    #end try

    # Echo the history-server reference returned by epost.
    print([result])

    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]

    #Next step: pull the summaries for the posted identifiers.
    summary_handle = Entrez.esummary(db="protein", webenv=webEnv, query_key=queryKey)
    try:
        annotations = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    print("Retrieved %d annotations for %d genes" % (len(annotations), len(ids)))

    return annotations

In [20]:
# Try the lookup on a single RefSeq protein accession; the returned
# summaries are displayed as the cell output.
retrieve_annotation(["NP_775798.2"])

[{'QueryKey': '1', 'WebEnv': 'MCID_649613dc6126a2290935178b'}]
Retrieved 1 annotations for 1 genes


[{'Item': [], 'Id': '124248562', 'Caption': 'NP_775798', 'Title': 'GTP-binding protein REM 2 [Homo sapiens]', 'Extra': 'gi|124248562|ref|NP_775798.2|[124248562]', 'Gi': IntegerElement(124248562, attributes={}), 'CreateDate': '2003/01/14', 'UpdateDate': '2023/03/12', 'Flags': IntegerElement(512, attributes={}), 'TaxId': IntegerElement(9606, attributes={}), 'Length': IntegerElement(340, attributes={}), 'Status': 'live', 'ReplacedBy': '', 'Comment': '  ', 'AccessionVersion': 'NP_775798.2'}]

In [None]:
"""
handle = Entrez.efetch(db="nucleotide", id="186972394", rettype="gb", retmode="text")
print(handle.read())

"""

In [27]:
# Request record 124248562 from the protein database in GenBank format,
# delivered as XML. Only the response handle is kept here.
handle = Entrez.efetch(
    db="protein",
    id="124248562",
    rettype="gb",
    retmode="xml",
)
# print(handle.read())  # uncomment to dump the raw response

In [28]:
# Bare expression: shows the repr of the object returned by efetch,
# not the response payload.
handle

<http.client.HTTPResponse at 0x12f58de80>

In [30]:
# Fetch record 124248562 from the protein database (GenBank format, XML)
# and parse the response with Entrez.read.
handle = Entrez.efetch(db="protein", id="124248562", rettype="gb", retmode="xml")
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even if parsing raises.
    handle.close()

# First parsed entry, displayed as the cell output.
record[0]

{'GBSeq_locus': 'NP_775798', 'GBSeq_length': '340', 'GBSeq_moltype': 'AA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'PRI', 'GBSeq_update-date': '12-MAR-2023', 'GBSeq_create-date': '14-JAN-2003', 'GBSeq_definition': 'GTP-binding protein REM 2 [Homo sapiens]', 'GBSeq_primary-accession': 'NP_775798', 'GBSeq_accession-version': 'NP_775798.2', 'GBSeq_other-seqids': ['ref|NP_775798.2|', 'gi|124248562'], 'GBSeq_keywords': ['RefSeq', 'MANE Select'], 'GBSeq_source': 'Homo sapiens (human)', 'GBSeq_organism': 'Homo sapiens', 'GBSeq_taxonomy': 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo', 'GBSeq_references': [{'GBReference_reference': '1', 'GBReference_position': '1..340', 'GBReference_authors': ['Luck K', 'Kim DK', 'Lambourne L', 'Spirohn K', 'Begg BE', 'Bian W', 'Brignall R', 'Cafarelli T', 'Campos-Laborie FJ', 'Charloteaux B', 'Choi D', 'Cote AG', 'Daley M', 'Deimling S', 'Desbul

In [None]:
# Serialise the first parsed record to JSON and write it to disk.
# The JSON text gets its own name so the imported `json` module is not
# overwritten (which would break this cell on a second run).
record_json = json.dumps(record[0])

# NOTE(review): the original wrote to an undefined `file` object; the
# destination below (../data/<accession.version>.json) is an assumed
# default -- adjust as needed.
json_path = os.path.join("..", "data", record[0]["GBSeq_accession-version"] + ".json")

# The context manager guarantees the file is flushed and closed.
with open(json_path, "w") as out_file:
    out_file.write(record_json)