In [1]:
import os
import sys
# Root of the project relative to this notebook; used to build data paths.
home_dir = "../"

# Put the parent directory on the import path (once) so that project-local
# modules can be imported from inside the notebook.
module_path = os.path.abspath(os.path.join('..'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import time
import os
from Bio import Entrez
from urllib.error import HTTPError
import xml.etree.ElementTree as ET
import pandas as pd
import xmltodict

In [2]:
# NCBI asks every E-utilities client to identify itself with a contact e-mail.
# Set the NCBI_EMAIL environment variable to use your own address.
Entrez.email = os.environ.get("NCBI_EMAIL", "akabir0101@gmail.com")
# RECOMMENDED: apply for an API key from NCBI (https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/).
# 10 queries per second are allowed with a valid API key, otherwise 3 queries per second.
# The key is a credential, so it is read from the NCBI_API_KEY environment variable
# instead of being committed with the notebook; when the variable is unset the key
# is None and the lower, unauthenticated rate limit applies.
Entrez.api_key = os.environ.get("NCBI_API_KEY")
retmax = 500 # number of records requested per batch (original note: max=1000 -- TODO confirm for dbSNP)
# XML namespace of the dbSNP docsum documents, in ElementTree's "{uri}" prefix form.
namespace = "https://www.ncbi.nlm.nih.gov/SNP/docsum"
ns = u'{%s}' % namespace
nsl = len(ns) # length of the prefix, e.g. for stripping it from tag names

In [3]:
# dbSNP supported query terms (https://www.ncbi.nlm.nih.gov/snp/docs/entrez_help/) can be built and tested online
# using the web query builder (https://www.ncbi.nlm.nih.gov/snp/advanced).
# esearch handle
eShandle = Entrez.esearch(db="snp",  # search dbSNP
                          term='"homo sapiens"[Organism] AND "missense variant"[Function Class] AND "by alfa"[Validation Status]',
                          usehistory="y", # cache result on server for download in batches
                          retmax=retmax
                         )

# Parse the esearch result, then always release the HTTP handle:
# Entrez.read() does not close a handle that was passed in.
try:
    eSresult = Entrez.read(eShandle)
finally:
    eShandle.close()

# WebEnv + QueryKey identify the server-side cached result set; later efetch
# calls pass them instead of re-running the search.
webenv = eSresult["WebEnv"]
query_key = eSresult["QueryKey"]
total_count = int(eSresult["Count"])
# "Fetch count" is the number of retmax-sized batches needed to cover all hits.
print(f"Query result count:: {total_count}, Fetch count: {len(range(0, total_count, retmax))}")


Query result count:: 6866032, Fetch count: 13733


In [14]:
def download_batch(start, retmax):
    """Download one batch of dbSNP records and return it as decoded XML text.

    The batch is fetched from the server-side result set identified by the
    module-level ``webenv`` and ``query_key`` (set by the esearch cell).

    Parameters
    ----------
    start : int
        0-based offset of the first record to fetch (passed as ``retstart``).
    retmax : int
        Number of records to request in this batch.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    HTTPError
        If the server answers with a status code outside 400-599.
    KeyboardInterrupt, SystemExit
        Propagated immediately so the download can be interrupted.

    Notes
    -----
    The function keeps retrying until it succeeds: up to 3 quick attempts
    (10 s apart) to open the request, and, if that or the subsequent read
    fails, a 30 s pause before starting over. Retrying is iterative, so a
    long outage cannot exhaust the recursion limit.
    """
    while True:
        fetch_handle = None
        for attempt in range(1, 4):
            try:
                fetch_handle = Entrez.efetch(db="snp", retmode="xml", retstart=start, retmax=retmax, webenv=webenv, query_key=query_key)
                break
            except HTTPError as err:
                if 400 <= err.code <= 599:
                    print("Received error from server %s" % err)
                    print("Attempt %i of 3" % attempt)
                    time.sleep(10)
                else:
                    raise

        if fetch_handle is not None:
            # Data can be incomplete or lost in transit, so a failed read is
            # retried. Only ordinary errors are caught: KeyboardInterrupt and
            # SystemExit must still be able to stop the notebook.
            try:
                try:
                    return fetch_handle.read().decode()
                finally:
                    # Release the connection whether or not the read succeeded.
                    fetch_handle.close()
            except Exception as err:
                print(f"Read failed: {err!r}")

        # Reached when every open attempt failed or the read failed.
        print(f"Error. Downloading again record {start} to {start+retmax-1}")
        time.sleep(30)


In [15]:
# Sample code adapted with modifications from http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139.
filepath = home_dir+"data/dbsnp/search_results/1.txt"
# Append mode creates the file when it does not exist yet and otherwise keeps
# what earlier (partial) runs already downloaded, so one mode covers both cases.
out_file_mode = "a"

# Manual resume point: edit these two values before re-running after an interruption.
# fetch_count is the 1-based batch number shown in the progress line;
# start_idx is the 0-based record offset passed to efetch as retstart.
fetch_count = 8014+1 # 1-indexed
start_idx =  4005499+1# 0-indexed

# The context manager closes the file even if a download raises or the cell is interrupted.
with open(filepath, out_file_mode) as out_file_handle:
    for start in range(start_idx, total_count, retmax):
        data = download_batch(start, retmax)
        out_file_handle.write(data)
        # Flush before reporting progress so the printed range is really on
        # disk; the progress line is what a manual resume is based on.
        out_file_handle.flush()

        end = min(total_count, start+retmax)-1
        print(f"Current downloads fetch_count:start_idx-end: {fetch_count}:{start}-{end}")
        # To resume after this batch (whether or not an error occurred):
        # fetch_count=fetch_count+1, start_idx=end+1
        # Smoke-test guard: only triggers when fetch_count is started at 1 or 2,
        # stopping the run after the second batch.
        if fetch_count==2: break
        fetch_count += 1

Current downloads fetch_count:start_idx-end: 8015:4005500-4005999
Current downloads fetch_count:start_idx-end: 8016:4006000-4006499
Current downloads fetch_count:start_idx-end: 8017:4006500-4006999
Current downloads fetch_count:start_idx-end: 8018:4007000-4007499
Current downloads fetch_count:start_idx-end: 8019:4007500-4007999
Current downloads fetch_count:start_idx-end: 8020:4008000-4008499
Current downloads fetch_count:start_idx-end: 8021:4008500-4008999
Current downloads fetch_count:start_idx-end: 8022:4009000-4009499
Current downloads fetch_count:start_idx-end: 8023:4009500-4009999
Current downloads fetch_count:start_idx-end: 8024:4010000-4010499
Current downloads fetch_count:start_idx-end: 8025:4010500-4010999
Current downloads fetch_count:start_idx-end: 8026:4011000-4011499
Current downloads fetch_count:start_idx-end: 8027:4011500-4011999
Current downloads fetch_count:start_idx-end: 8028:4012000-4012499
Current downloads fetch_count:start_idx-end: 8029:4012500-4012999
Current do