# Infer genealogies with tsinfer and tsdate for FR & Ye samples from Flank3

In [1]:
import tsinfer
import tsdate
import tskit
import cyvcf2
import numpy as np
import os

In [2]:
# Working directory against which every relative path below (./VCFs, ./ts_run2-...)
# is resolved. Defaults to the cluster location; set the SNAP_HAP_ARGS_DIR
# environment variable to run elsewhere, e.g. on the local machine:
#   SNAP_HAP_ARGS_DIR='/Users/apal/Phd/Projects/2021-snap_hap/ARGs'
os.chdir(os.environ.get('SNAP_HAP_ARGS_DIR', '/nfs/scistore18/bartogrp/apal/snap_hap/ARGs'))
os.getcwd()

'/nfs/scistore18/bartogrp/apal/snap_hap/ARGs'

In [3]:
## Define function to add sites into samples for tsinfer
def add_diploid_sites(vcf, samples):
    """
    Read the sites in the vcf and add them to the samples object.

    Each variant's REF/ALT alleles are upper-cased and REF (allele index 0)
    is used as the ancestral allele. A variant is skipped entirely, with a
    printed message per offending allele, if any of its alleles contains a
    character outside ``ATGCatgc*``. Every other variant is passed to
    ``samples.add_site``.

    Parameters
    ----------
    vcf : iterable of variants
        Yields objects exposing ``POS``, ``REF``, ``ALT`` and ``genotypes``
        (the only attributes read here), e.g. an open ``cyvcf2.VCF``.
    samples : object with an ``add_site`` method
        Receives ``add_site(pos, genotypes, alleles, ancestral_allele=...)``
        once per retained variant, e.g. a ``tsinfer.SampleData`` being built.

    Returns
    -------
    None
    """
    # You may want to change the following line, e.g. here we allow
    # "*" (a spanning deletion) to be a valid allele state
    allele_chars = set("ATGCatgc*")

    for variant in vcf:  # Loop over variants, each assumed at a unique site
        pos = variant.POS
        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]

        # No ancestral-state annotation is read: REF is treated as ancestral.
        # (An INFO field such as "AA" could be used here instead if the VCF has one.)
        ancestral_allele = 0

        # Check we have ATCG alleles. The whole variant must be skipped when any
        # allele is invalid, so the `continue` has to act on the variant loop,
        # not on a nested per-allele loop.
        bad_alleles = [a for a in alleles if set(a) - allele_chars]
        if bad_alleles:
            for a in bad_alleles:
                print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}")
            continue

        # Flatten the first two entries of every genotype row into one list
        # (two allele indexes per individual, i.e. diploid).
        # NOTE(review): assumes each row is [allele_a, allele_b, phased] as
        # produced by cyvcf2 — confirm for other readers.
        genotypes = [g for row in variant.genotypes for g in row[0:2]]

        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)

In [4]:
##  Load samples to tsinfer
# Deme labels and the number of diploid individuals in each. popList holds one
# label per individual and is matched to the VCF samples by position.
# NOTE(review): this assumes the VCF lists all mFR samples first, then all
# yYe samples — confirm against the VCF header.
demes=['mFR','yYe']
demeSize = [22, 20]
popList = np.repeat(demes, demeSize)
vcf_path="./VCFs/Am_mFRyYe_stitchRun1_Chr6-30000000-30400000.refScaf.statphased.vcf.gz"
vcf = cyvcf2.VCF(vcf_path)

# Fail fast on a size mismatch: the samples are later paired with popList via
# zip(), which would silently drop the unmatched tail instead of raising.
if len(vcf.samples) != len(popList):
    raise ValueError(
        f"VCF has {len(vcf.samples)} samples but demeSize sums to {len(popList)}"
    )

In [5]:
# Re-open the VCF so this cell always iterates from the first record. A
# cyvcf2.VCF reader is consumed by iteration, so re-running this cell with an
# already-used reader would add no sites to the sample file.
vcf = cyvcf2.VCF(vcf_path)

with tsinfer.SampleData(path = './ts_run2-20240522_FrYe/ts.neu.mFRyYe.samples') as sample_data:

    ## Define populations
    # Added in the order of `demes`, because individuals below refer to their
    # population by `demes.index(pop)`.
    for deme in demes:
        sample_data.add_population(metadata={"name": deme})

    ## Define individuals
    for sampleName, pop in zip(vcf.samples, popList):
        popIndex = demes.index(pop)
        sample_data.add_individual(ploidy=2, population=popIndex, metadata={"names":sampleName})

    ## Add sites and genotypes
    add_diploid_sites(vcf, sample_data)

In [6]:
## Load the sample data from the file written by the cell above
## (lets you skip rebuilding it from the VCF if that cell was run before)
sample_data = tsinfer.load('./ts_run2-20240522_FrYe/ts.neu.mFRyYe.samples')

In [7]:
# Number of samples in the sample file (individuals were added with ploidy=2,
# so this is twice the number of individuals)
sample_data.num_samples

84

In [8]:
# One-line summary of the sample file: samples, individuals and sites
print(
    f"Sample file created for {sample_data.num_samples} samples "
    f"({sample_data.num_individuals} individuals) "
    f"with {sample_data.num_sites} variable sites.",
    flush=True,
)

Sample file created for 84 samples (42 individuals) with 13359 variable sites.


In [11]:
# Infer a tree sequence from the sample data with tsinfer (default settings)
ts_neu_mFRyYe = tsinfer.infer(sample_data)

In [12]:
# type(ts_neu_mFRyYe)

In [13]:
# Simplify the inferred tree sequence without keeping unary nodes, then date it
tsSimp_neu_mFRyYe = ts_neu_mFRyYe.simplify(keep_unary=False)
# Ne and mutation_rate are hard-coded here.
# NOTE(review): record where these values come from; the rate is presumably
# per bp per generation (the resulting tree sequence reports time in generations) — confirm.
tsD_neu_mFRyYe = tsdate.date(tsSimp_neu_mFRyYe, Ne = 354349, mutation_rate = 7e-9)

In [14]:
tsD_neu_mFRyYe

Tree Sequence,Unnamed: 1
Trees,5371
Sequence Length,30399988.0
Time Units,generations
Sample Nodes,84
Total Size,2.9 MiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,41692,1.3 MiB,
Individuals,42,3.1 KiB,✅
Migrations,0,8 Bytes,
Mutations,9973,360.4 KiB,
Nodes,6773,517.0 KiB,✅
Populations,2,54 Bytes,✅
Provenances,3,1.7 KiB,
Sites,9973,505.9 KiB,✅


In [15]:
## Save the dated tree sequence
tsD_neu_mFRyYe.dump('./ts_run2-20240522_FrYe/ts.neu.mFRyYe.ts')

## Load the saved tree sequence (uncomment to reuse a previous run)
# tsD_neu_mFRyYe = tskit.load('./ts_run2-20240522_FrYe/ts.neu.mFRyYe.ts')