# Get ancestral allele information from Ensembl Compara

Try to collect ancestral alleles from Ensembl Compara.

In [1]:
from urllib.parse import urljoin
from tqdm.notebook import tqdm
import pandas as pd

from ensemblrest import EnsemblRest
from tskitetude.smarterapi import VariantsEndpoint
from tskitetude.ensembl import ComparaSheepSNP

To recover coordinates on the older `OAR3` assembly, I need to connect to the
Ensembl Compara archive:

In [2]:
# Archive REST host (the "nov2020" Ensembl archive); reused by later cells.
base_url = "https://nov2020.rest.ensembl.org"
ensRest = EnsemblRest(base_url=base_url)

# Keep a handle on the client's session so later cells can issue raw requests.
session = ensRest.session

Get information on the assembly. Pay attention to the *top level regions*: only
those whose coordinate system is `chromosome` are the assembled chromosomes.

In [3]:
data = ensRest.getInfoAssembly(species="ovis_aries")

# Keep only the top-level regions whose coordinate system is "chromosome".
chromosomes = pd.DataFrame(
    [
        entry
        for entry in data["top_level_region"]
        if entry["coord_system"] == "chromosome"
    ]
)
chromosomes.head()

Unnamed: 0,name,coord_system,length
0,1,chromosome,275612895
1,10,chromosome,86447213
2,11,chromosome,62248096
3,12,chromosome,79100223
4,13,chromosome,83079144


A simple test to collect alignments:

In [4]:
# Small region on sheep chromosome 1 used to try out the alignment endpoint.
url = urljoin(base_url, "alignment/region/ovis_aries/1:649900..650000")

# `display_species_set` is repeated once per species, so the query is a list
# of (key, value) pairs rather than a dict.
params = [("method", "EPO"), ("species_set_group", "mammals")]
params += [
    ("display_species_set", species)
    for species in ("ovis_aries", "capra_hircus")
]

response = session.get(url, params=params)
response.json()

[{'alignments': [{'strand': 1,
    'start': 818768,
    'description': '',
    'seq_region': '3',
    'species': 'capra_hircus',
    'end': 818868,
    'seq': 'TGGAAGCAGCAGACGCAAGCCATGCCTG--AAGCT-CTGCAGCCTCGTGGAGGGAAAGAG--A-AAGCGCCAGACCGCAGAGCCAGCTCGAG---GTGA-GGGGTCTGCTA'},
   {'start': 37294,
    'strand': -1,
    'seq_region': 'Ancestor_1888_968487',
    'description': '',
    'seq': 'TGGAAGCAGCAGACGCAAGCCATGCCTG--AAGCT-CTGCAGCCTCGTGGAGGGAAAGAG--A-ACGCGCCAGACCGCAGAGCCAGCTCGAG---GTGA-GGGGTCTGCTA',
    'species': 'Chir-Oari[2]',
    'end': 37394},
   {'start': 649900,
    'strand': 1,
    'description': '',
    'seq_region': '1',
    'seq': 'TGGAAGCAGCAGACGCAAGCCATGCCTG--AAGCT-CTGCAGCCTCGTGGAGGGAAAGAG--A-GCGCGCCAGACCGCAGAGCCAGCTCGAG---GTGA-GGGGTCTGCTA',
    'end': 650000,
    'species': 'ovis_aries'}],
  'tree': '(capra_hircus_3_818768_818868[+]:0.01130222,ovis_aries_1_649900_650000[+]:0.0111331)Chir-Oari[2]:0.0169365;'}]

Ok, now try to collect some variants of the 50K chip from the SMARTER API endpoint:

In [5]:
variant_api = VariantsEndpoint(species="Sheep", assembly="OAR3")

region = "1:1-5000000"
chip_name = "IlluminaOvineSNP50"

# Fetch the first page, then keep requesting the following page for as long as
# the response carries a non-null "next" entry.
data = variant_api.get_variants(chip_name=chip_name, region=region)
page = data["page"]
pages = [pd.json_normalize(data["items"])]

while data["next"] is not None:
    data = variant_api.get_variants(chip_name=chip_name, region=region, page=page + 1)
    page = data["page"]
    pages.append(pd.json_normalize(data["items"]))

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# the rows collected so far on every iteration.
variants = pd.concat(pages, ignore_index=True)

variants.info()

2024-09-05 12:40:38,378 - tskitetude.smarterapi - INFO - Initialized VariantsEndpoint with URL: https://webserver.ibba.cnr.it/smarter-api/variants/sheep/OAR3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   affy_snp_id                       88 non-null     object
 1   chip_name                         90 non-null     object
 2   cust_id                           69 non-null     object
 3   name                              90 non-null     object
 4   probesets                         88 non-null     object
 5   rs_id                             90 non-null     object
 6   _id.$oid                          90 non-null     object
 7   locations.alleles                 90 non-null     object
 8   locations.chrom                   90 non-null     object
 9   locations.illumina                90 non-null     object
 10  locations.illumina_forward        90 non-null     object
 11  locations.illumina_strand         86 non-null     object
 12  locations.illumina_top  

Get a sample of the positions of the selected variants:

In [6]:
# Preview only the identifier, coordinates and alleles of the first variants.
variants.loc[
    :,
    ["rs_id", "locations.chrom", "locations.position", "locations.alleles"],
].head()

Unnamed: 0,rs_id,locations.chrom,locations.position,locations.alleles
0,[rs430360910],1,52854,A/G
1,[rs402427398],1,81978,C/G
2,[rs413624639],1,120098,C/T
3,[rs424810706],1,204694,A/G
4,[rs55630911],1,315497,A/G


Get a helper object to query Ensembl Compara:

In [7]:
# Point the helper at the same REST host (`base_url`) used for the requests above.
compara = ComparaSheepSNP(base_url=base_url)

2024-09-05 12:40:38,584 - tskitetude.ensembl - INFO - Using base URL: https://nov2020.rest.ensembl.org


Now try to collect ancestor alleles from alignments:

In [8]:
# One lookup per variant; `results` stays index-aligned with `variants`.
results = []
coordinates = zip(variants["locations.chrom"], variants["locations.position"])
for chrom, position in tqdm(coordinates, total=len(variants)):
    results.append(compara.get_ancestor(chrom=chrom, position=position))

  0%|          | 0/90 [00:00<?, ?it/s]

Keep the ancestor allele only when an alignment is found:

In [9]:
# A lookup result of None is kept as None; otherwise take its "seq" entry.
ancestor_alleles = [
    result["seq"] if result is not None else None
    for result in results
]

Add ancestor allele to the variant dataframe:

In [10]:
# Attach the looked-up alleles as a new column (same order as the rows).
variants["ancestor_alleles"] = ancestor_alleles

# Keep only the variants for which an ancestor allele is available.
filtered_variants = variants.loc[variants["ancestor_alleles"].notna()]
filtered_variants.loc[
    :,
    [
        "rs_id",
        "locations.chrom",
        "locations.position",
        "locations.alleles",
        "ancestor_alleles",
    ],
].head()

Unnamed: 0,rs_id,locations.chrom,locations.position,locations.alleles,ancestor_alleles
16,[rs399425964],1,1193562,C/T,C
17,[rs404997262],1,1245222,A/G,T
18,[rs430138341],1,1390945,A/C,T
19,[rs401390747],1,1406764,C/G,G
20,[rs398738941],1,1513820,A/G,T


Please note that I can collect an ancestral allele which is not present among the variant alleles themselves (e.g. ancestor `T` for an `A/G` SNP)