# Infer genealogies with tsinfer on RosEl with MY3 samples

In [1]:
import json
import cyvcf2
import tsinfer
import tsdate
import tskit
import numpy as np
import pandas as pd
import tqdm
from tskit import MISSING_DATA

In [2]:
## Define function to add sites into samples for tsinfer
def add_diploid_sites(vcf, samples):
    """
    Read the sites in the vcf and add them to the samples object.

    Parameters
    ----------
    vcf : iterable of variants
        Each variant must expose ``POS``, ``REF``, ``ALT`` and ``genotypes``
        (assumed to be cyvcf2-style rows whose first two entries are the
        allele indexes of a diploid individual -- confirm against the reader
        in use).
    samples : object with an ``add_site`` method
        Receives one ``add_site(pos, genotypes, alleles, ancestral_allele=...)``
        call per accepted variant.

    Notes
    -----
    * A variant with any allele containing characters outside ``ATGC*`` is
      reported and skipped entirely (it is not added to ``samples``).
    * The reference allele (index 0) is always used as the ancestral allele.
      If the VCF carries an ``AA`` INFO field, the ancestral state could be
      looked up from it instead.
    """
    # You may want to change the following line, e.g. here we allow
    # "*" (a spanning deletion) to be a valid allele state.
    allele_chars = set("ATGCatgc*")

    for variant in vcf:  # Loop over variants, each assumed at a unique site
        pos = variant.POS
        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]

        # Check we have ATCG alleles. The skip must apply to the variant loop,
        # so find the first offending allele and `continue` at this level.
        bad_allele = next((a for a in alleles if set(a) - allele_chars), None)
        if bad_allele is not None:
            print(f"Ignoring site at pos {pos}: allele {bad_allele} not in {allele_chars}")
            continue

        # REF is the first entry of `alleles`, so index 0 marks it as ancestral.
        ancestral_allele = 0

        # Flatten per-individual rows into one genotype per haplotype, keeping
        # only the first two entries of each row.
        genotypes = [g for row in variant.genotypes for g in row[0:2]]

        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)

In [3]:
##  Load samples to tsinfer
# Population labels and the number of individuals assigned to each, in the
# order the individuals are expected to appear in the VCF header.
demes=['YF3','MF3']
demeSize = [30, 26]
popList = np.repeat(demes, demeSize)
vcf_path="/nfs/scistore18/bartogrp/apal/snap_hap/ARGs/VCFs/Am_MY3_stitchRun1_Chr6-20000000-20400000.noRef.statphased.vcf.gz"

vcf = cyvcf2.VCF(vcf_path)

# zip() below stops at the shorter of its inputs, so a mismatch between the
# VCF sample count and sum(demeSize) would otherwise go unnoticed and could
# assign individuals to the wrong population. Report it rather than abort.
if len(vcf.samples) != len(popList):
    print(
        f"WARNING: VCF has {len(vcf.samples)} samples but demeSize sums to "
        f"{len(popList)}; population labels are assigned by position and "
        "may be wrong. Check demes/demeSize against the VCF sample order.",
        flush=True,
    )

with tsinfer.SampleData(path = '/nfs/scistore18/bartogrp/apal/snap_hap/ARGs/samples/samples_MY3.list') as sample_data:
    ## Define populations (one per deme, so population IDs match demes.index())
    for deme in demes:
        sample_data.add_population(metadata={"name": deme})

    ## Define individuals
    for sampleName, pop in zip(vcf.samples, popList):
        popIndex = demes.index(pop)
        sample_data.add_individual(ploidy=2, population=popIndex, metadata={"names":sampleName})

    ## Add sites and genotypes
    add_diploid_sites(vcf, sample_data)

In [4]:
# Report how many haploid samples, individuals and sites ended up in the
# sample file; the message is assembled from three formatted fragments.
summary_fragments = (
    "Sample file created for {} samples ".format(sample_data.num_samples),
    "({} individuals) ".format(sample_data.num_individuals),
    "with {} variable sites.".format(sample_data.num_sites),
)
print("".join(summary_fragments), flush=True)

Sample file created for 106 samples (53 individuals) with 16313 variable sites.


In [5]:
## Run tsinfer, simplify the result, then date it with tsdate
# Dating parameters passed to tsdate below (Ne is presumably the effective
# population size and the rate is per site per generation -- confirm).
dating_Ne = 354349
dating_mutation_rate = 1e-8

# Infer the tree sequence from the sample data.
ts_neu_MY3 = tsinfer.infer(sample_data)

# Simplify, dropping unary nodes, before handing the result to tsdate.
simple_ts_neu_MY3 = ts_neu_MY3.simplify(keep_unary=False)

# Assign node times.
dated_ts_neu_MY3 = tsdate.date(
    simple_ts_neu_MY3,
    Ne=dating_Ne,
    mutation_rate=dating_mutation_rate,
)

In [6]:
# Bare expression: the notebook renders the dated tree sequence's summary
# tables as the cell output.
dated_ts_neu_MY3

Tree Sequence,Unnamed: 1
Trees,5873
Sequence Length,20399987.0
Time Units,generations
Sample Nodes,106
Total Size,3.2 MiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,42188,1.3 MiB,
Individuals,53,4.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,12519,452.4 KiB,
Nodes,7316,557.4 KiB,✅
Populations,2,54 Bytes,✅
Provenances,3,1.7 KiB,
Sites,12519,634.4 KiB,✅


In [7]:
# Collect, for every tree whose interval starts after position 0, its
# genomic interval, span and Newick string, then write them as a
# tab-separated table.
treeStart, treeEnd, treeSpan, treeList = [], [], [], []

for tree in dated_ts_neu_MY3.trees():
    # Skip any tree whose interval begins at coordinate 0.
    if not (tree.interval.left > 0):
        continue
    treeStart.append(tree.interval.left)
    treeEnd.append(tree.interval.right)
    treeSpan.append(tree.span)
    # tree.root is passed explicitly as the root of the Newick string.
    treeList.append(tree.as_newick(root=tree.root))

# Column order in the output file follows the insertion order of this dict.
newick_table_columns = {
    'treeStart': treeStart,
    'treeEnd': treeEnd,
    'treeSpan': treeSpan,
    'tree': treeList,
}
tsDF = pd.DataFrame(newick_table_columns)

newick_table_path = '/nfs/scistore18/bartogrp/apal/snap_hap/ARGs/ts_run1-2024May_RosEl-MY3/RosEl.MY3/ts.neu.trList.newick.txt'
tsDF.to_csv(
    newick_table_path,
    sep="\t",
    quoting=None,
    index=False,
    encoding='utf-8',
)