# Explore threads metadata

Threads metadata has been added to the threads TreeSequence with the `annotate_tree`
script. Let's explore data and check if everything is fine.

In [1]:
import itertools

import cyvcf2
import tszip
import pandas as pd
import numpy as np

from tskitetude import get_data_dir, get_project_dir

Let's load the sample names and their populations:

In [2]:
# Headerless tab-separated file: first column is read as the population,
# second column as the individual (sample) name
sample_names_path = get_data_dir() / "toInfer/tsm100M300I.sample_names.txt"
sample_info = pd.read_table(
    sample_names_path,
    header=None,
    names=["population", "individual"],
)
sample_info.head()

Unnamed: 0,population,individual
0,A,tsk_486
1,II,tsk_276
2,A,tsk_506
3,II,tsk_115
4,D,tsk_1388


Now open the threads object:

:::{warning}
We are opening the fitted version of the trees, since they contain variant information
:::

In [3]:
# Load the compressed tree sequence and display its summary
threads_ts_path = get_project_dir() / "results-threads/toInfer-fit/threads/ts300I2k.1.trees.tsz"
ts_threads = tszip.load(threads_ts_path)
ts_threads

Tree Sequence,Unnamed: 1
Trees,1 999
Sequence Length,99 981 261
Time Units,unknown
Sample Nodes,3 200
Total Size,85.3 MiB
Metadata,dict  chromosome: 1 offset: 8568

Table,Rows,Size,Has Metadata
Edges,1 979 642,60.4 MiB,
Individuals,1 600,80.8 KiB,✅
Migrations,0,8 Bytes,
Mutations,1 998,72.2 KiB,
Nodes,358 846,9.6 MiB,
Populations,8,271 Bytes,✅
Provenances,1,384 Bytes,
Sites,1 998,48.8 KiB,

Provenance Timestamp,Software Name,Version,Command,Full record
"09 December, 2025 at 12:13:23 PM",threads,0.2.1,Unknown,Details  dict  software:  dict  name: threads version: 0.2.1  parameters:  dict  input_file: ts300I2k.1.biallelic.vcf.gz metadata_added: True populations_added: 8 individuals_added: 1600  timestamp: 2025-12- 09T12:13:23.094293+00:00 description: Tree sequence annotated with population and individual metadata


Explore the metadata of the first 5 nodes:

In [4]:
# Resolve the population and individual ids of the first five nodes into
# their full records before printing them
for node in itertools.islice(ts_threads.nodes(), 5):
    node_population = ts_threads.population(node.population)
    node_individual = ts_threads.individual(node.individual)
    print(f"Node {node.id}: population={node_population}, individual={node_individual}")

Node 0: population=Population(id=7, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 1: population=Population(id=7, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 2: population=Population(id=7, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 3: population=Population(id=7, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 4: population=Population(id=7, metadata={'breed': '

OK, it seems that the metadata follow the sample order of the VCF file. Let's verify this by checking
the sample names in `ts_threads.individuals()`:

In [5]:
print(f"Num. samples: {ts_threads.num_samples}")
print(f"Num. individuals: {ts_threads.num_individuals}")

# sample name -> list of node ids belonging to that individual
sample2nodes = {}

for i, individual in enumerate(ts_threads.individuals()):
    # Hoisted out of the f-string below: reusing the outer quote character
    # inside a replacement field is a SyntaxError before Python 3.12
    sample_id = individual.metadata["sample_id"]
    node_ids = individual.nodes.tolist()

    # preview only the first five individuals
    if i < 5:
        print(f"Individual {individual.id}: sample='{sample_id}', nodes={node_ids}")

    sample2nodes[sample_id] = node_ids

Num. samples: 3200
Num. individuals: 1600
Individual 0: sample='tsk_105', nodes=[0, 1]
Individual 1: sample='tsk_106', nodes=[2, 3]
Individual 2: sample='tsk_107', nodes=[4, 5]
Individual 3: sample='tsk_108', nodes=[6, 7]
Individual 4: sample='tsk_109', nodes=[8, 9]


Let's discover individual breeds using their node information:

In [6]:
# For every sample, collect the set of breeds found in the population
# metadata of its nodes (samples without nodes get no entry)
sample2breed = {
    sample_id: {
        ts_threads.population(ts_threads.node(node_id).population).metadata["breed"]
        for node_id in node_ids
    }
    for sample_id, node_ids in sample2nodes.items()
    if node_ids
}


Test that sample and breed mapping is correct:

In [7]:
# test that sample and breed mapping is correct:
# index the expected population by individual once (first occurrence wins)
# instead of scanning the whole table for every sample
expected_breed = (
    sample_info.drop_duplicates("individual")
    .set_index("individual")["population"]
    .to_dict()
)

# a sample is wrong when its expected population is not among the breeds
# found on its nodes; samples unknown to sample_info are reported as well
mismatched_breed_samples = [
    sample_id
    for sample_id, breed_set in sample2breed.items()
    if expected_breed.get(sample_id) not in breed_set
]

assert not mismatched_breed_samples, (
    "Sample to breed mapping is incorrect! "
    f"{len(mismatched_breed_samples)} sample(s) affected, e.g. {mismatched_breed_samples[:5]}"
)


Now test that the genotypes are consistent with sample names: using VCF as a reference,
read the `focal` sample generated through the pipeline:

In [8]:
# Reference VCF used for all the comparisons below
vcf_file = get_project_dir() / "results-threads/toInfer/focal/ts300I2k.1.vcf.gz"

with cyvcf2.VCF(vcf_file) as vcf_reader:
    first_vcf_samples = vcf_reader.samples[:5]
    # show first 5 sample names in VCF
    print(first_vcf_samples)

['tsk_105', 'tsk_106', 'tsk_107', 'tsk_108', 'tsk_109']


Check that the number of variants is the same:

In [9]:
# count number of variants in VCF file; the context manager closes the
# reader as soon as the count is done instead of leaving the handle open
with cyvcf2.VCF(vcf_file) as vcf_reader:
    num_variants = sum(1 for _ in vcf_reader)
print(f"N. of variants in VCF: {num_variants}")

# count number of variants in TS file
num_ts_variants = ts_threads.num_sites
print(f"N. of variants TreeSequence: {num_ts_variants}")

# check variant size
if num_variants == num_ts_variants:
    print("✓ N. of variants matches (between TS and VCF)!")
else:
    print(f"✗ Warning: VCF has {num_variants} variants, TreeSequence has {num_ts_variants}")

N. of variants in VCF: 2000
N. of variants TreeSequence: 1998


OK, the number of variants is not the same, so let's compare positions. From what I saw, the first position in `ts_threads` is 0,
so let's read the first position in the VCF file and use it as the offset of the *TS* object:

In [10]:
# The position of the first VCF record is used as the offset of the TS object
with cyvcf2.VCF(vcf_file) as vcf_reader:
    first_record = next(vcf_reader)
    offset = first_record.POS

# shift every TS site position by the offset
ts_positions = np.fromiter(
    (int(site.position) + offset for site in ts_threads.sites()),
    dtype=np.int64,
)
print(ts_positions[:10])  # show first 10 positions

# gather the position of every record stored in the VCF
vcf_position_list = []
with cyvcf2.VCF(vcf_file) as vcf_reader:
    for record in vcf_reader:
        vcf_position_list.append(record.POS)
vcf_positions = np.array(vcf_position_list)
print(vcf_positions[:10])  # show first 10 positions

[  8568  11982  54502  99207  99302 110491 132927 136059 144274 177369]
[  8568  11982  54502  99207  99302 110491 132927 136059 144274 177369]
[  8568  11982  54502  99207  99302 110491 132927 136059 144274 177369]


In [11]:
# Find which variants are missing from the TreeSequence
# Get positions from both VCF and TreeSequence
print(f"TreeSequence positions: {len(ts_positions)} variants")
print(f"VCF positions: {len(vcf_positions)} variants")

# Positions in VCF but not in TreeSequence. A plain set gives constant-time
# membership tests while scanning the VCF records below.
missing_in_ts = np.setdiff1d(vcf_positions, ts_positions)
missing_positions = set(missing_in_ts.tolist())

print("\nExtracting variant positions from VCF...")

variant_columns = ['chrom', 'pos', 'ref', 'alt', 'qual', 'filter']
genotype_columns = [
    'pos', 'unique_alleles', 'is_monomorphic',
    'missing_gts', 'total_gts', 'allele_counts',
]

# collect information on variants (and, in the same pass over the file,
# genotype statistics for the variants missing from the TreeSequence)
vcf_variants_info = []
genotype_stats = []

with cyvcf2.VCF(vcf_file) as vcf_reader:
    for variant in vcf_reader:
        vcf_variants_info.append({
            'chrom': variant.CHROM,
            'pos': variant.POS,
            'ref': variant.REF,
            'alt': ','.join(variant.ALT) if variant.ALT else '',
            'qual': variant.QUAL,
            'filter': variant.FILTER if variant.FILTER else 'PASS',
        })

        if variant.POS not in missing_positions:
            continue

        # keep only the two allele calls of each genotype entry
        gts = np.array([gt[:2] for gt in variant.genotypes])
        unique_alleles = np.unique(gts[gts >= 0])  # exclude missing calls (-1)
        alleles, counts = np.unique(gts, return_counts=True)

        genotype_stats.append({
            'pos': variant.POS,
            'unique_alleles': len(unique_alleles),
            'is_monomorphic': len(unique_alleles) == 1,
            'missing_gts': int(np.sum(gts == -1)),
            'total_gts': gts.size,
            # plain ints keep the dict readable when displayed or exported
            'allele_counts': {int(a): int(c) for a, c in zip(alleles, counts)},
        })

# create a dataframe from variant information
vcf_variants_df = pd.DataFrame(vcf_variants_info)

print(f"\nVariants in VCF but missing in TreeSequence: {len(missing_in_ts)}")

if len(missing_in_ts) > 0:
    # filter out missing variants
    missing_variants_df = vcf_variants_df[vcf_variants_df['pos'].isin(missing_in_ts)].copy()
    genotype_stats_df = pd.DataFrame(genotype_stats)

    # join information
    missing_variants_full = missing_variants_df.merge(genotype_stats_df, on='pos')

    # uncomment to save data
    # missing_variants_full.to_csv('missing_variants.csv', index=False)

else:
    print("\n✓ All VCF variants are present in TreeSequence")

    # keep the name defined (as an empty table with the usual columns) so
    # that the following cells still run when nothing is missing
    missing_variants_full = pd.DataFrame(columns=variant_columns + genotype_columns[1:])

# Also check if TreeSequence has any variants not in VCF (unlikely but possible)
extra_in_ts = np.setdiff1d(ts_positions, vcf_positions)
if len(extra_in_ts) > 0:
    extra_df = pd.DataFrame({'pos': extra_in_ts})
    print(
        f"\n✗ Warning: {len(extra_in_ts)} TreeSequence positions are not in VCF "
        f"(first 10: {extra_in_ts[:10].tolist()})"
    )

TreeSequence positions: 1998 variants
VCF positions: 2000 variants

Extracting variant positions from VCF...

Variants in VCF but missing in TreeSequence: 2

Variants in VCF but missing in TreeSequence: 2


In [12]:
# display the VCF variants whose position is absent from the TreeSequence
missing_variants_full

Unnamed: 0,chrom,pos,ref,alt,qual,filter,unique_alleles,is_monomorphic,missing_gts,total_gts,allele_counts
0,1,45384357,C,T,,PASS,1,True,0,3200,{0: 3200}
1,1,63257441,C,"T,G",,PASS,3,False,0,3200,"{0: 3145, 1: 7, 2: 48}"


So the missing variants are one monomorphic SNP (no alternate allele observed in any sample) in position `45384357` and
one multiallelic SNP (two alternate alleles, `T` and `G`) in position `63257441` (filtered out by the `BCFTOOLS_BIALLELIC` step).
Let's track the positions to skip:

In [13]:
# positions (as a plain Python list) of the variants listed in missing_variants_full
skip_positions = missing_variants_full["pos"].tolist()

## Extract TreeSequences Genotypes

Collect the full genotype matrix from TS object:

In [14]:
# genotype_matrix() yields one row per site and one column per sample node
ts_genotype_matrix = ts_threads.genotype_matrix()
ts_matrix_shape = ts_genotype_matrix.shape

print(f"Genotype matrix from TS: {ts_matrix_shape}")
print(f"Number of sites: {ts_matrix_shape[0]}")
print(f"Number of samples: {ts_matrix_shape[1]}")

# preview the top-left 5 x 5 corner of the matrix
print("\nSubsetting first 5 objects in both dimensions")
print(ts_genotype_matrix[:5, :5])

Genotype matrix from TS: (1998, 3200)
Number of sites: 1998
Number of samples: 3200

Subsetting first 5 objects in both dimensions
[[0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 1]
 [0 0 0 1 0]
 [0 0 0 1 0]]


## Extract VCF genotypes

Collect the full genotype matrix from VCF (remember to skip filtered variants):

In [15]:
with cyvcf2.VCF(vcf_file) as vcf_reader:
    print("\nExtracting genotypes from VCF...")

    allele_rows = []

    for n_read, variant in enumerate(vcf_reader, start=1):
        # variants listed in skip_positions are missing in TS: leave them out
        if variant.POS not in skip_positions:
            # each entry of variant.genotypes begins with the two allele
            # calls; keep those and flatten them into a single row, so a
            # variant contributes two consecutive columns per VCF sample
            allele_pairs = np.array([gt[:2] for gt in variant.genotypes], dtype=np.int8)
            allele_rows.append(allele_pairs.ravel())

            if n_read % 10000 == 0:
                print(f"  Processed {n_read} variants...")

# Stack the rows into a 2D array: one row per kept variant,
# two columns per VCF sample
vcf_genotype_matrix = np.array(allele_rows, dtype=np.int8)
vcf_matrix_shape = vcf_genotype_matrix.shape

print("\nVCF genotype matrix created:")
print(f"  Shape: {vcf_matrix_shape}")
print(f"  Dtype: {vcf_genotype_matrix.dtype}")
print(f"  Number of variants: {vcf_matrix_shape[0]}")
print(f"  Number of samples: {vcf_matrix_shape[1]}")

# preview the top-left 5 x 5 corner of the matrix
print("\nExample - first 5 genotypes of the first sample:")
print(vcf_genotype_matrix[:5, :5])


Extracting genotypes from VCF...

VCF genotype matrix created:
  Shape: (1998, 3200)
  Dtype: int8
  Number of variants: 1998
  Number of samples: 3200

Example - first 5 genotypes of the first sample:
[[0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 1]
 [0 0 0 1 0]
 [0 0 0 1 0]]

VCF genotype matrix created:
  Shape: (1998, 3200)
  Dtype: int8
  Number of variants: 1998
  Number of samples: 3200

Example - first 5 genotypes of the first sample:
[[0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 1]
 [0 0 0 1 0]
 [0 0 0 1 0]]


## Compare genotype matrices

Compare the two genotype matrices:

In [16]:
# Check matching between VCF and TreeSequence genotype matrices: test for shape and values
if vcf_genotype_matrix.shape == ts_genotype_matrix.shape:
    print(f"✓ Matrices have the same shape: {vcf_genotype_matrix.shape}")

    # Both matrices are 2D: one row per site, one column per sample node
    # (each diploid individual contributes two columns).
    # Element-wise comparison, reused for the per-site and per-sample checks
    equal_calls = vcf_genotype_matrix == ts_genotype_matrix

    # test for site-wise matches (every column must agree on a site)
    matches = np.all(equal_calls, axis=1)
    num_matches = np.sum(matches)
    num_total = len(matches)

    print("\nGlobal result:")
    print(f"  Sites with matching genotypes: {num_matches}/{num_total} ({100*num_matches/num_total:.2f}%)")

    if num_matches != num_total:
        # Find sites with mismatches
        mismatches = np.where(~matches)[0]
        print(f"\n  Number of sites with mismatches: {len(mismatches)}")
        print(f"  First 5 sites with mismatches: {mismatches[:5]}")

        # Analyze an example of mismatch
        if len(mismatches) > 0:
            site_idx = mismatches[0]
            print(f"\n  Example - Site {site_idx}:")

            # Find which samples have mismatches. A single site is a 1-D row,
            # so the element-wise comparison is already per sample; reducing
            # it over axis=1 (as for a 3D layout) would raise an AxisError
            site_matches = equal_calls[site_idx]
            discordant_samples = np.where(~site_matches)[0]

            print(f"    Samples with mismatches: {len(discordant_samples)}")
            if len(discordant_samples) > 0:
                sample_idx = discordant_samples[0]
                print(f"    Sample {sample_idx}:")
                print(f"      VCF:          {vcf_genotype_matrix[site_idx, sample_idx]}")
                print(f"      TreeSequence: {ts_genotype_matrix[site_idx, sample_idx]}")
    else:
        print("\n✓ All matrices match perfectly!")

    # Statistics per sample
    # Per-sample match stats (all sites must match for a sample to be "perfect")
    sample_matches = np.all(equal_calls, axis=0)
    num_perfect_samples = int(np.sum(sample_matches))

    print(f"\nSamples with all matching genotypes: {num_perfect_samples}/{ts_threads.num_samples}")

    if num_perfect_samples != ts_threads.num_samples:
        mismatched_samples = np.where(~sample_matches)[0]
        print(f"  Number of samples with mismatches: {len(mismatched_samples)}")
        print(f"  First 10 samples with mismatches: {mismatched_samples[:10].tolist()}")
else:
    print("✗ Matrices have different shapes!")
    print(f"  VCF:          {vcf_genotype_matrix.shape}")
    print(f"  TreeSequence: {ts_genotype_matrix.shape}")

✓ Matrices have the same shape: (1998, 3200)

Global result:
  Sites with matching genotypes: 1998/1998 (100.00%)

✓ All matrices match perfectly!

Samples with all matching genotypes: 3200/3200
