# Get ancestral information from Ensembl Compara

Try to collect ancestral alleles from Ensembl Compara.

In [None]:
from urllib.parse import urljoin
from tqdm.notebook import tqdm
import pandas as pd

from ensemblrest import EnsemblRest
from tskitetude.smarterapi import VariantsEndpoint
from tskitetude.ensembl import ComparaSheepSNP

To recover coordinates on the old `OAR3` assembly, I need to connect to the
Ensembl Compara archive:

In [None]:
# Point the client at the "nov2020" Ensembl REST archive instead of the
# current release, since coordinates on the old OAR3 assembly are needed.
base_url = 'https://nov2020.rest.ensembl.org'
ensRest = EnsemblRest(base_url=base_url)
# Keep a handle on the client's session so that raw requests can be sent
# through it as well.
session = ensRest.session

Get information on the assembly. Pay attention to the *top-level coordinates*,
which are the assembled chromosomes.

In [None]:
# Fetch the assembly description and keep only the top-level regions whose
# coordinate system is "chromosome".
data = ensRest.getInfoAssembly(species="ovis_aries")
chromosome_records = [
    entry
    for entry in data["top_level_region"]
    if entry["coord_system"] == "chromosome"
]
chromosomes = pd.DataFrame(chromosome_records)
chromosomes.head()

A simple test to collect alignments:

In [None]:
# Build the alignment endpoint URL for a small test region on chromosome 1.
url = urljoin(base_url, "alignment/region/ovis_aries/1:649900..650000")

# Parameters are a list of pairs (not a dict) because "display_species_set"
# has to be repeated once per species to display.
species_to_display = ("ovis_aries", "capra_hircus")
params = [("method", "EPO"), ("species_set_group", "mammals")]
params += [("display_species_set", name) for name in species_to_display]

response = session.get(url, params=params)
response.json()

OK, now try to collect some variants of the 50K chip from the SMARTER API endpoint:

In [None]:
# Download every page of variants for one chip and one region from the
# SMARTER API and assemble them into a single flat dataframe.
variant_api = VariantsEndpoint(species="Sheep", assembly="OAR3")

region = "1:1-5000000"
chip_name = "IlluminaOvineSNP50"

# First page: the response carries the records ("items"), the current page
# number ("page") and a "next" entry which is None on the last page.
data = variant_api.get_variants(chip_name=chip_name, region=region)
page = data["page"]

# Collect one dataframe per page and concatenate once at the end: calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
page_frames = [pd.json_normalize(data["items"])]

while data["next"] is not None:
    data = variant_api.get_variants(chip_name=chip_name, region=region, page=page + 1)
    page = data["page"]
    page_frames.append(pd.json_normalize(data["items"]))

variants = pd.concat(page_frames, ignore_index=True)

variants.info()

Get a sample of the positions of the selected variants:

In [None]:
# Preview identifier, location and alleles for the first few variants.
preview_columns = ["rs_id", "locations.chrom", "locations.position", "locations.alleles"]
variants[preview_columns].head()

Get a helper object to query Ensembl Compara:

In [None]:
# Helper used to query Compara; it is given the same archive REST base URL
# as the Ensembl client.
compara = ComparaSheepSNP(base_url=base_url)

Now try to collect ancestor alleles from alignments:

In [None]:
# Ask Compara for the ancestor at each variant position, one request per
# row. A plain loop is kept so that the results gathered so far survive if a
# request fails part-way through.
n_variants = len(variants)
results = []
for _, row in tqdm(variants.iterrows(), total=n_variants):
    results.append(
        compara.get_ancestor(
            chrom=row["locations.chrom"],
            position=row["locations.position"],
        )
    )

Keep the ancestor allele only when an alignment is found:

In [None]:
# Take the "seq" value of each result; variants without a result get None.
ancestor_alleles = [
    hit["seq"] if hit is not None else None
    for hit in results
]

Add ancestor allele to the variant dataframe:

In [None]:
# Attach the collected alleles as a new column, then keep only the variants
# for which an ancestor allele is available.
variants["ancestor_alleles"] = ancestor_alleles
has_ancestor = variants["ancestor_alleles"].notna()
filtered_variants = variants[has_ancestor]

report_columns = [
    "rs_id",
    "locations.chrom",
    "locations.position",
    "locations.alleles",
    "ancestor_alleles",
]
filtered_variants[report_columns].head()

Please note that the ancestral allele I collect may not be among the variant's own alleles.