# AVE DCD Mapping - Initial Analysis of Metadata
## Given scoreset metadata available from the public API, determine how well human target sequences map to RefSeq sequences

In [24]:
import requests
from biocommons.seqrepo import SeqRepo
from bs4 import BeautifulSoup

In [25]:
### Extract scoreset target sequences from API

In [26]:
def get_target_sequence_data():
    """Fetch the target sequences of all human score sets from MaveDB.

    Requests the score set listing from the public MaveDB API and keeps the
    target reference sequence of every score set whose first reference map
    names 'Homo sapiens' as its organism.

    Returns:
        list: The target sequences, in the order the API lists the score
        sets.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond in time.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get('https://www.mavedb.org/api/scoresets/', timeout=60)
    # Fail here on an HTTP error instead of on a confusing lookup error below.
    response.raise_for_status()
    return [
        scoreset['target']['reference_sequence']['sequence']
        for scoreset in response.json()
        if scoreset['target']['reference_maps'][0]['genome']['organism_name'] == 'Homo sapiens'
    ]

# Human target sequences; compared against the RefSeq-derived sequences by
# determine_matches() further down.
tarl = get_target_sequence_data()

In [27]:
### Extract RefSeq metadata from API 

In [28]:
def get_full_refseq(url):
    """Return the identifier shown in the 'itemid' paragraph of a web page.

    Downloads `url`, locates the first ``<p class="itemid">`` element and
    returns the text between its first and second colon, stripped of
    surrounding whitespace. get_refseq() uses this to replace identifiers
    that contain no '.' (presumably ones lacking a version suffix).

    Args:
        url: Address of the record page to download.

    Returns:
        str: The identifier text extracted from the page.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond in time.
        ValueError: If the page has no 'itemid' paragraph, or that paragraph
            contains no colon.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    item = soup.find('p', class_='itemid')
    if item is None:
        raise ValueError(f"no <p class='itemid'> element found at {url}")
    parts = item.get_text().split(':')
    if len(parts) < 2:
        raise ValueError(f"unexpected 'itemid' text at {url}: {item.get_text()!r}")
    return parts[1].strip()

In [29]:
def get_refseq():
    """Collect the RefSeq metadata of all human score sets from MaveDB.

    Requests the score set listing from the public MaveDB API and, for every
    score set whose first reference map names 'Homo sapiens', records the
    target's 'refseq' entry (which may be None). When an entry's identifier
    contains no '.', the identifier is replaced in place by the value
    get_full_refseq() extracts from the entry's URL.

    The set of distinct identifiers found is printed as a side effect.

    Returns:
        list: One item per human score set, in API order; each item is the
        'refseq' dictionary of the target, or None when the target has none.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond in time.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get('https://www.mavedb.org/api/scoresets/', timeout=60)
    # Fail here on an HTTP error instead of on a confusing lookup error below.
    response.raise_for_status()

    ref_seqid = []
    for scoreset in response.json():
        target = scoreset['target']
        if target['reference_maps'][0]['genome']['organism_name'] != 'Homo sapiens':
            continue
        refseq = target['refseq']
        # An identifier without a '.' is looked up on the linked page.
        if refseq is not None and '.' not in refseq['identifier']:
            refseq['identifier'] = get_full_refseq(refseq['url'])
        # None entries are kept so positions line up with the human targets.
        ref_seqid.append(refseq)

    print({refseq['identifier'] for refseq in ref_seqid if refseq is not None})
    return ref_seqid

# Run the extraction: the function prints the set of distinct identifiers and
# the notebook then displays the returned list of refseq entries.
get_refseq()

{'NM_005957.5', 'NM_001005781.1', 'NM_003345.5', 'NP_000242.1', 'NM_022445.3', 'NP_031401.1', 'NM_007294.3', 'NM_001363670.1'}


[None,
 None,
 None,
 None,
 None,
 {'offset': 230,
  'identifier': 'NM_005957.5',
  'url': 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=NM_005957',
  'dbversion': None,
  'dbname': 'RefSeq'},
 None,
 None,
 None,
 None,
 {'offset': 833,
  'identifier': 'NM_001363670.1',
  'url': 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=NM_001363670.1',
  'dbversion': None,
  'dbname': 'RefSeq'},
 {'offset': 230,
  'identifier': 'NM_005957.5',
  'url': 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=NM_005957',
  'dbversion': None,
  'dbname': 'RefSeq'},
 {'offset': 159,
  'identifier': 'NM_003345.5',
  'url': 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=NM_003345',
  'dbversion': None,
  'dbname': 'RefSeq'},
 {'offset': 0,
  'identifier': 'NP_000242.1',
  'url': 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=NP_000242.1',
  'dbversion': None,
  'dbname': 'RefSeq'},
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 {'offset': 230,
  'identifier': '

In [30]:
### Get corresponding reference sequences using SeqRepo

In [None]:
def get_template_sequences(seqrepo_path="/usr/local/share/seqrepo/latest"):
    """Fetch, for each human score set, the matching stretch of its RefSeq.

    Retrieves the human target sequences and their RefSeq metadata, then
    reads from a local SeqRepo instance a slice of each referenced sequence
    that is as long as the corresponding target sequence.

    Args:
        seqrepo_path: Directory of the SeqRepo snapshot to read from.
            Defaults to the location originally hard-coded in this notebook.

    Returns:
        list: One entry per item returned by get_refseq(); '' where the
        score set has no RefSeq entry, otherwise the slice of the reference
        sequence.
    """
    sr = SeqRepo(seqrepo_path)
    target_list = get_target_sequence_data()
    template_list = get_refseq()
    sequences_list = []
    for index, refseq in enumerate(template_list):
        if refseq is None:
            sequences_list.append('')
            continue
        offset = refseq['offset']
        # Offsets 0 and 1 both start at the first residue.
        # NOTE(review): confirm which indexing convention the API offsets use.
        start = 0 if offset == 0 else offset - 1
        length = len(target_list[index])
        sequences_list.append(sr[refseq['identifier']][start:start + length])
    return sequences_list

# RefSeq-derived sequences, positionally paired with the target sequences in
# `tarl` when passed to determine_matches() below.
templ = get_template_sequences()

In [None]:
### Quantify how well target sequences map to RefSeq sequences

In [None]:
def match_percent(str1, str2):
    """Return the percentage of positions in `str1` that equal `str2`.

    Each position of `str1` is compared with the same position of `str2`;
    characters of `str2` beyond the length of `str1` are ignored.

    Args:
        str1: Sequence whose positions are scored.
        str2: Sequence to compare against; must be at least as long as
            `str1`.

    Returns:
        float: 100 * (number of matching positions) / len(str1), or 0.0 when
        `str1` is empty (there are no positions that could match).

    Raises:
        IndexError: If `str2` is shorter than `str1`.
    """
    if not str1:
        # Avoid dividing by zero for an empty sequence.
        return 0.0
    if len(str2) < len(str1):
        raise IndexError('str2 is shorter than str1; cannot compare every position of str1')
    matches = sum(left == right for left, right in zip(str1, str2))
    return 100 * matches / len(str1)

In [None]:
def determine_matches(tarl, templ):
    """Score every target sequence against its template sequence.

    Args:
        tarl: Target sequences.
        templ: Template sequences, paired with `tarl` by position.

    Returns:
        list: For each target, the result of match_percent() when the
        template is non-empty and has the same length as the target,
        otherwise None.
    """
    def _score(target, template):
        # Only equally long, non-empty pairs can be compared position-wise.
        comparable = template != '' and len(target) == len(template)
        return match_percent(target, template) if comparable else None

    return [_score(target, templ[position]) for position, target in enumerate(tarl)]

# Display the per-score-set match percentages (None where no comparison was
# possible).
determine_matches(tarl, templ)