# Explore threads metadata

Threads metadata has been added to the threads TreeSequence with the `annotate_tree`
script. Let's explore the data and check that everything is fine.

In [None]:
import itertools

import cyvcf2
import tszip
import pandas as pd
import numpy as np

from tskitetude import get_data_dir, get_project_dir

Let's load the information on sample names:

In [None]:
# Tab-separated file without a header row: the two columns are named
# "population" and "individual" on load.
sample_names_file = get_data_dir() / "toInfer/tsm100M300I.sample_names.txt"
sample_info = pd.read_csv(
    sample_names_file, sep="\t", header=None, names=["population", "individual"]
)
sample_info.head()

Now open the threads object:

In [None]:
# Load the compressed tree sequence produced by the threads pipeline.
threads_trees_file = get_project_dir() / "results-threads/toInfer/threads/ts300I2k.1.trees.tsz"
ts_threads = tszip.load(threads_trees_file)
ts_threads

Explore the metadata of the first 5 nodes:

In [None]:
# Preview only the first 5 nodes: resolve the population and the individual
# referenced by each node before printing them.
first_nodes = itertools.islice(ts_threads.nodes(), 5)
for node in first_nodes:
    node_population = ts_threads.population(node.population)
    node_individual = ts_threads.individual(node.individual)
    print(f"Node {node.id}: population={node_population}, individual={node_individual}")

OK, it seems that the metadata follows the sample order in the VCF file. Let's verify it
by checking the sample names in `ts_threads.individuals()`:

In [None]:
print(f"Num. samples: {ts_threads.num_samples}")
print(f"Num. individuals: {ts_threads.num_individuals}")

# map each sample name (taken from the individual metadata) to its node IDs
sample2nodes = {}

for i, individual in enumerate(ts_threads.individuals()):
    # Bind the lookups to names first: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    sample_id = individual.metadata["sample_id"]
    node_ids = individual.nodes.tolist()

    # preview only the first 5 individuals
    if i < 5:
        print(f"Individual {individual.id}: sample='{sample_id}', nodes={node_ids}")

    sample2nodes[sample_id] = node_ids

Let's discover individual breeds using their node information:

In [None]:
# For every sample, gather the set of breeds found in the population metadata
# of its nodes. Samples without nodes get no entry at all.
sample2breed = {
    sample_id: {
        ts_threads.population(ts_threads.node(node_id).population).metadata["breed"]
        for node_id in node_ids
    }
    for sample_id, node_ids in sample2nodes.items()
    if node_ids
}


Test that sample and breed mapping is correct:

In [None]:
# Expected breed of each individual according to the sample names file,
# built once instead of scanning the DataFrame for every sample
# (on duplicated individual names the first row wins).
expected_breed = (
    sample_info.drop_duplicates("individual")
    .set_index("individual")["population"]
    .to_dict()
)

# Collect every sample whose expected breed is not among the breeds of its
# nodes. A sample that is absent from the sample names file has no expected
# breed (None) and is reported as a mismatch instead of raising IndexError.
breed_mismatches = {
    sample_id: (expected_breed.get(sample_id), sorted(map(str, breed_set)))
    for sample_id, breed_set in sample2breed.items()
    if expected_breed.get(sample_id) not in breed_set
}

assert not breed_mismatches, (
    "Sample to breed mapping is incorrect! "
    f"Mismatches as sample: (expected, found): {breed_mismatches}"
)


Now test that the genotypes are consistent with sample names: using VCF as a reference,
read the `focal` sample generated through the pipeline:

In [None]:
# VCF used as the reference for the genotype comparison
vcf_file = get_project_dir() / "results-threads/toInfer/focal/ts300I2k.1.vcf.gz"

# read the sample names while the reader is open, print them afterwards
with cyvcf2.VCF(vcf_file) as vcf_reader:
    first_vcf_samples = vcf_reader.samples[:5]
print(first_vcf_samples)  # show first 5 sample names in VCF

Check that the number of variants is the same:

In [None]:
# count number of variants in VCF file; the context manager closes the
# reader once all the records have been consumed
with cyvcf2.VCF(vcf_file) as vcf_reader:
    num_variants = sum(1 for _ in vcf_reader)
print(f"N. of variants in VCF: {num_variants}")

# count number of variants in TS file
num_ts_variants = ts_threads.num_sites
print(f"N. of variants TreeSequence: {num_ts_variants}")

# check variant size
if num_variants == num_ts_variants:
    print("✓ N. of variants matches (between TS and VCF)!")
else:
    print(f"✗ Warning: VCF has {num_variants} variants, TreeSequence has {num_ts_variants}")

OK, the positions are not the same: from what I saw, the first position in `ts_threads`
is 0, so let's read the first position in the VCF file and determine the offset of the
*TS* object:

In [None]:
# read positions from VCF in a single pass (the file is parsed only once)
with cyvcf2.VCF(vcf_file) as vcf_reader:
    vcf_positions = np.array([variant.POS for variant in vcf_reader])

if vcf_positions.size == 0:
    raise ValueError(f"No variants found in {vcf_file}: cannot determine the offset")

# The first VCF position is used as the offset of the TS coordinates.
# NOTE(review): this is only correct if the first TS site is at position 0
# and corresponds to the first VCF record - confirm on new datasets.
offset = int(vcf_positions[0])

# translate the TS positions
ts_positions = np.array([int(site.position) + offset for site in ts_threads.sites()], dtype=np.int64)
print(ts_positions[:10])  # show first 10 positions
print(vcf_positions[:10])  # show first 10 positions

In [None]:
# Find which variants are missing from the TreeSequence
# Get positions from both VCF and TreeSequence
print(f"TreeSequence positions: {len(ts_positions)} variants")
print(f"VCF positions: {len(vcf_positions)} variants")

# Find missing positions (in VCF but not in TreeSequence)
missing_in_ts = np.setdiff1d(vcf_positions, ts_positions)
# plain Python set: constant-time membership test for every VCF record,
# instead of scanning the numpy array each time
missing_positions = set(missing_in_ts.tolist())

print("\nExtracting variant positions from VCF...")

# Single pass over the VCF: collect the site information of every variant
# and, only for the variants missing from the TreeSequence, genotype statistics
vcf_variants_info = []
genotype_stats = []

with cyvcf2.VCF(vcf_file) as vcf_reader:
    for variant in vcf_reader:
        vcf_variants_info.append({
            'chrom': variant.CHROM,
            'pos': variant.POS,
            'ref': variant.REF,
            'alt': ','.join(variant.ALT) if variant.ALT else '',
            'qual': variant.QUAL,
            'filter': variant.FILTER if variant.FILTER else 'PASS',
        })

        if variant.POS not in missing_positions:
            continue

        # keep the first two entries of each genotype
        # (presumably the two allele indexes of a diploid call - TODO confirm)
        gts = np.array([gt[:2] for gt in variant.genotypes])
        unique_alleles = np.unique(gts[gts >= 0])  # exclude missing calls (-1)

        genotype_stats.append({
            'pos': variant.POS,
            'unique_alleles': len(unique_alleles),
            'is_monomorphic': len(unique_alleles) == 1,
            'missing_gts': np.sum(gts == -1),
            'total_gts': gts.size,
            'allele_counts': dict(zip(*np.unique(gts.flatten(), return_counts=True)))
        })

# create a dataframe from variant information
vcf_variants_df = pd.DataFrame(vcf_variants_info)

print(f"\nVariants in VCF but missing in TreeSequence: {len(missing_in_ts)}")

if len(missing_in_ts) > 0:
    # filter out missing variants
    missing_variants_df = vcf_variants_df[vcf_variants_df['pos'].isin(missing_in_ts)].copy()

    genotype_stats_df = pd.DataFrame(genotype_stats)

    # join information
    missing_variants_full = missing_variants_df.merge(genotype_stats_df, on='pos')

    # uncomment to save data
    # missing_variants_full.to_csv('missing_variants.csv', index=False)

else:
    print("\n✓ All VCF variants are present in TreeSequence")

    # Always define the table (empty, same columns) so that the cells that
    # display it or read its 'pos' column do not fail with a NameError
    missing_variants_full = pd.DataFrame(columns=[
        'chrom', 'pos', 'ref', 'alt', 'qual', 'filter',
        'unique_alleles', 'is_monomorphic', 'missing_gts', 'total_gts',
        'allele_counts',
    ])

# Also check if TreeSequence has any variants not in VCF (unlikely but possible)
extra_in_ts = np.setdiff1d(ts_positions, vcf_positions)
# defined unconditionally (possibly empty) so it is always available afterwards
extra_df = pd.DataFrame({'pos': extra_in_ts})
if len(extra_in_ts) > 0:
    print(f"\n✗ Warning: {len(extra_in_ts)} TreeSequence positions are not in the VCF")

In [None]:
# display the VCF variants that are missing from the TreeSequence,
# joined with their genotype statistics
missing_variants_full

So the missing variants are one monomorphic SNP (no alternates) at position `45384357` and
one biallelic SNP at position `63257441` (filtered out by the `BCFTOOLS_BIALLELIC` step).
Let's track the positions to skip:

In [None]:
# positions of the VCF variants missing from the TreeSequence, as a plain list
skip_positions = missing_variants_full.loc[:, "pos"].to_list()

## Extract TreeSequence Genotypes

Collect the full genotype matrix from TS object:

In [None]:
# Full genotype matrix of the tree sequence: one row per site,
# one column per sample.
genotype_matrix = ts_threads.genotype_matrix()
gt_rows, gt_cols = genotype_matrix.shape[0], genotype_matrix.shape[1]

print(f"Genotype matrix from TS: {genotype_matrix.shape}")
print(f"Number of sites: {gt_rows}")
print(f"Number of samples: {gt_cols}")

# show only the top-left 5x5 corner of the matrix
print("\nSubsetting first 5 objects in both dimensions")
print(genotype_matrix[:5, :5])