# Use UCSC's DAS server to look up sequences by genome position.

In [1]:
import re
import os
import urllib
import sqlite3
import glob
from Bio import SeqIO
from xml.dom import minidom

In [2]:
def get_sequence_from_das(my_db,chromosome,start,end):
    """
    Query the UCSC DAS server for the DNA sequence of a genomic segment.

    Builds the DAS ``dna`` request URL from the arguments, prints it,
    downloads the XML reply and returns the text of its first ``<DNA>``
    element with the line breaks removed.

    Args:
        my_db: the database name (can be hg38, etc.)
        chromosome: given as "chr1," "chr2," etc.
        start: the raw start position from the beginning of the chromosome
        end: The raw end position

    Returns:
        The sequence as a single string, or an empty string when the reply
        contains no ``<DNA>`` element or that element is empty.

    Raises:
        urllib.error.URLError: if the server cannot be reached or answers
            with an HTTP error status.
        xml.parsers.expat.ExpatError: if the reply is not well-formed XML.

    See:
        http://stackoverflow.com/questions/22328319/python-fetch-sequence-from-das-by-coordinates
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly here instead of relying on another module having
    # done so already.
    import urllib.request

    base = "http://genome.ucsc.edu/cgi-bin/das/%s/dna?segment="%my_db
    url = base + chromosome + ':' + str(start) + ',' + str(end)
    print(url)

    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        xml = response.read()

    xmldoc = minidom.parseString(xml)
    dna_nodes = xmldoc.getElementsByTagName('DNA')
    # Honour the documented contract: no sequence in the reply -> ''.
    if not dna_nodes or dna_nodes[0].firstChild is None:
        return ''

    # The server wraps the sequence over several lines; join them back up.
    return ''.join(dna_nodes[0].firstChild.nodeValue.split('\n'))

In [3]:
# Demo: fetch positions 15000000-15000300 of chr1 from the hg38 database.
get_sequence_from_das(
    my_db='hg38',
    chromosome='chr1',
    start=15000000,
    end=15000300,
)

http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr1:15000000,15000300


'tggatatattatatggcaagagggactttgcagctgtgacggaggatcttgagatgggaagatgatctggattgtctgggtgggccctgggtaatcgtggggtacttatagaggaaggttggcagatcagagagagggaaggagatgtgagactggaaaaaggtcagagtcagagagatctgaaggtgccctgttgctgactttcaggctagaggacagggccagcagccatgggatgcaggcggcctctaaaggctgggaaaggcaaggaaacaaattctcccctatagcctccggtggg'

In [None]:
# Example DAS query URL (the database name replaces %s in the path, not in the segment):
# http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr1:15000000,15000300