# ts genealogies
Infer genealogies with pure-magenta vs pure-yellow high-coverage samples from MF and YF

In [2]:
import tsinfer
import tsdate
import tskit
import cyvcf2
import numpy as np
import os
import pandas as pd

In [3]:
# Work from the project's ARGs directory: the later cells use paths relative
# to it ('./VCFs/...', './ts_run3-20240603_pmFRpyYe-hCov/...').
# NOTE(review): this is an absolute, machine-specific path; the commented-out
# line below looks like the equivalent location on another machine — confirm.
# os.chdir('/Users/apal/Phd/Projects/2021-snap_hap/ARGs')
os.chdir('/nfs/scistore18/bartogrp/apal/snap_hap/ARGs')
os.getcwd()

'/nfs/scistore18/bartogrp/apal/snap_hap/ARGs'

In [4]:
## Define function to add sites into samples for tsinfer
def add_diploid_sites(vcf, samples):
    """
    Read the sites in the vcf and add them to the samples object.

    For every variant, the REF and ALT alleles are upper-cased and the REF
    allele (index 0) is passed as the ancestral allele. A variant is skipped
    entirely, with one message printed per offending allele, when any of its
    alleles contains a character outside ``ATGCatgc*``.

    Parameters
    ----------
    vcf : iterable
        Iterable of variant records exposing ``POS``, ``REF``, ``ALT`` and
        ``genotypes`` (e.g. a ``cyvcf2.VCF``). Each variant is assumed to be
        at a unique site.
    samples : object
        Object providing ``add_site(position, genotypes, alleles,
        ancestral_allele=...)`` (e.g. a ``tsinfer.SampleData``).

    Returns
    -------
    None
    """
    # You may want to change the following line, e.g. here we allow
    # "*" (a spanning deletion) to be a valid allele state
    allele_chars = set("ATGCatgc*")

    for variant in vcf:  # Loop over variants, each assumed at a unique site
        pos = variant.POS
        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]

        # No ancestral-state annotation is read from the VCF: the REF allele
        # (index 0 of `alleles`) is used as the ancestral allele.
        ancestral_allele = 0

        # Check we have ATCG alleles. The site must be skipped as a whole, so
        # the `continue` has to act on the variant loop, not on a loop over
        # the alleles.
        invalid_alleles = [a for a in alleles if set(a) - allele_chars]
        if invalid_alleles:
            for a in invalid_alleles:
                print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}")
            continue

        # Flatten the per-individual genotype rows into one list, keeping the
        # first two entries of each row (the two allele indexes of a diploid).
        genotypes = [g for row in variant.genotypes for g in row[0:2]]

        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)

In [5]:
##  Load samples to tsinfer
# Deme labels and the number of individuals assigned to each deme.
demes=['pmFR','pyYe']
demeSize = [45, 50]
# One deme label per individual: each entry of `demes` repeated by the
# matching entry of `demeSize`.
# NOTE(review): pairing these labels with the VCF samples by position assumes
# the VCF lists all pmFR samples first, then all pyYe samples — confirm.
popList = np.repeat(demes, demeSize)
# Input VCF, relative to the working directory.
# NOTE(review): the file name suggests phased genotypes for
# Chr6:52800000-53200000 — confirm against the pipeline that produced it.
vcf_path="./VCFs/Am_pmFRpyYe-hCov_stitchRun1_Chr6-52800000-53200000.refScaf.statphased.vcf.gz"
# NOTE(review): presumably this reader can only be iterated once, so this cell
# would need re-running before the sites are read again — confirm.
vcf = cyvcf2.VCF(vcf_path)

In [6]:
## Check the VCF samples and the deme labels line up one-to-one.
## zip() below stops at the shorter of the two, so a mismatch would otherwise
## silently drop individuals or leave population labels shifted.
if len(vcf.samples) != len(popList):
    raise ValueError(
        f"VCF has {len(vcf.samples)} samples but demeSize assigns {len(popList)} individuals"
    )

with tsinfer.SampleData(path = './ts_run3-20240603_pmFRpyYe-hCov/ts.RosEl.pmFRpyYe.samples') as sample_data:

    ## Define populations
    ## Added in the order of `demes`, so that the population index used for
    ## each individual (demes.index(pop)) refers to the matching population.
    for deme in demes:
        sample_data.add_population(metadata={"name": deme})

    ## Define individuals
    ## Each VCF sample becomes one diploid individual, labelled with the deme
    ## at the same position in popList.
    for sampleName, pop in zip(vcf.samples, popList):
        popIndex = demes.index(pop)
        sample_data.add_individual(ploidy=2, population=popIndex, metadata={"names":sampleName})

    ## Add sites and genotypes
    add_diploid_sites(vcf, sample_data)

In [7]:
## Load sample data if run before
# Reads the sample file from the same path the SampleData cell writes to, and
# rebinds `sample_data` to the loaded object.
sample_data = tsinfer.load('./ts_run3-20240603_pmFRpyYe-hCov/ts.RosEl.pmFRpyYe.samples')

In [8]:
# Display the number of samples held in the sample data. Individuals were
# added with ploidy=2, so this is expected to be twice the individual count.
sample_data.num_samples

190

In [9]:
# One-line summary of the sample data: samples, individuals and sites.
# The three fragments are printed back to back (sep="") to form one sentence.
print(
    "Sample file created for {} samples ".format(sample_data.num_samples),
    "({} individuals) ".format(sample_data.num_individuals),
    "with {} variable sites.".format(sample_data.num_sites),
    sep="",
    flush=True,
)

Sample file created for 190 samples (95 individuals) with 23529 variable sites.


In [11]:
# Step 1: infer a tree sequence from the sample data.
ts_rosel_pmFRpyYe = tsinfer.infer(sample_data)
# Step 2: simplify the inferred tree sequence with keep_unary=False; the
# simplified copy is what gets dated below.
tsSimp_rosel_pmFRpyYe = ts_rosel_pmFRpyYe.simplify(keep_unary=False)
# Step 3: date the simplified tree sequence with a fixed Ne and mutation rate.
# NOTE(review): the provenance of Ne=354349 and mutation_rate=7e-9 (presumably
# per site per generation) is not recorded in this notebook — confirm.
tsD_rosel_pmFRpyYe = tsdate.date(tsSimp_rosel_pmFRpyYe, Ne=354349, mutation_rate=7e-9)

In [12]:
tsD_rosel_pmFRpyYe

Tree Sequence,Unnamed: 1
Trees,13993
Sequence Length,53199989.0
Time Units,generations
Sample Nodes,190
Total Size,8.3 MiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,132057,4.0 MiB,
Individuals,95,7.1 KiB,✅
Migrations,0,8 Bytes,
Mutations,21207,766.3 KiB,
Nodes,19208,1.4 MiB,✅
Populations,2,56 Bytes,✅
Provenances,3,1.7 KiB,
Sites,21207,1.0 MiB,✅


In [14]:
## Save the tree sequence
# Writes the dated tree sequence into the same run directory as the sample file.
tsD_rosel_pmFRpyYe.dump('./ts_run3-20240603_pmFRpyYe-hCov/ts.RosEl.pmFRpyYe.trees')

## Load tree sequence
# NOTE(review): the commented-out line below uses a different variable name
# (tsD_rosel_mFRyYe) and a different run directory and file name than the dump
# above — it would need updating before use.
# tsD_rosel_mFRyYe = tskit.load('./ts_run2-20240522_FrYe/ts.RosEl.mFRyYe.ts')

In [15]:
# Parallel per-tree columns: left edge, right edge, span and Newick string.
treeStart, treeEnd, treeSpan, treeList = [], [], [], []

for tree in tsD_rosel_pmFRpyYe.trees():
    # Only trees whose left coordinate is above 0 are exported.
    if not (tree.interval.left > 0):
        continue
    treeStart.append(tree.interval.left)
    treeEnd.append(tree.interval.right)
    treeSpan.append(tree.span)
    # NOTE(review): tree.root presumably requires a single-rooted tree — confirm
    # that every exported tree has exactly one root.
    treeList.append(tree.as_newick(root=tree.root))

# One row per exported tree, written as a tab-separated table with a header.
tsDF = pd.DataFrame(
    {
        'treeStart': treeStart,
        'treeEnd': treeEnd,
        'treeSpan': treeSpan,
        'tree': treeList,
    }
)
tsDF.to_csv(
    './ts_run3-20240603_pmFRpyYe-hCov/ts.RosEl.trList.newick.txt',
    sep="\t",
    quoting=None,
    index=False,
    encoding='utf-8',
)