# Add metadata to threads TreeSequence

Import the libraries needed to open the data generated with threads:

In [None]:
import json
import tskit
import tszip
import pandas as pd
import datetime

from tskitetude import get_data_dir

Define the metadata schemas used to add information to the tskit tables:

In [None]:
def _required_string_schema(field_name):
    """Return a JSON-codec metadata schema with one required string field."""
    return tskit.MetadataSchema(
        {
            "codec": "json",
            "type": "object",
            "properties": {field_name: {"type": "string"}},
            "required": [field_name],
        }
    )


# Each population row carries the name of its breed.
population_metadata_schema = _required_string_schema("breed")

# Each individual row carries the identifier of its sample.
individual_metadata_schema = _required_string_schema("sample_id")

Let's load the sample names (a population column and an individual column):

In [None]:
# Read both columns as text. Without an explicit dtype pandas would turn
# numeric-looking names (e.g. "0012") into integers, dropping leading zeros
# and producing values that do not match the string-typed metadata schemas.
sample_info = pd.read_csv(
    get_data_dir() / "toInfer/tsm100M300I.sample_names.txt",
    sep="\t",
    header=None,
    names=["population", "individual"],
    dtype=str,
)
sample_info.head()

In [None]:
# Tree sequence produced with threads, stored as a tszip-compressed file.
threads_ts_path = "results-threads/toInfer/threads/ts300I2k.1.tsz"
ts_threads = tszip.load(threads_ts_path)
ts_threads

Get an editable copy of the tables:

In [None]:
# Work on a copy of the tables: the following cells edit `tables`, while
# `ts_threads` itself is not modified.
tables = ts_threads.dump_tables()

Collect the unique populations:

In [None]:
# Distinct population labels, in order of first appearance in the file.
population_column = sample_info["population"]
unique_pops = population_column.unique()
unique_pops

Create a mapping from population names to IDs:

In [None]:
# Attach the schema before adding rows so that the metadata dictionaries
# passed to add_row() are handled according to it.
tables.populations.metadata_schema = population_metadata_schema

# One population row per breed; add_row() returns the ID of the new row.
breed_to_id = {}
for breed in unique_pops:
    breed_to_id[breed] = tables.populations.add_row(metadata={"breed": breed})

breed_to_id

In [None]:
tables.individuals.metadata_schema = individual_metadata_schema

# Maps each sample name to the ID of its row in the individuals table.
individual_to_id = {}

for sample_id in sample_info["individual"]:
    # A name seen before already has its row: keep the first ID.
    if sample_id in individual_to_id:
        continue
    individual_to_id[sample_id] = tables.individuals.add_row(
        flags=0,
        metadata={"sample_id": sample_id},
    )

Let's update the existing nodes to link them to their populations and individuals:

In [None]:
# Each row of sample_info describes one diploid individual, whose two
# haploid genomes are assumed to be the contiguous nodes 2*i and 2*i + 1,
# i.e. sample_info is assumed to list individuals in the same order as the
# nodes of the tree sequence.
NODES_PER_INDIVIDUAL = 2

# Iterate over the two columns directly instead of indexing the DataFrame
# row by row with iloc.
for i, (population, individual) in enumerate(
    zip(sample_info["population"], sample_info["individual"])
):
    pop_id = breed_to_id[population]
    ind_id = individual_to_id[individual]

    # update both nodes for the diploid individual
    for j in range(NODES_PER_INDIVIDUAL):
        node_id = i * NODES_PER_INDIVIDUAL + j
        node = tables.nodes[node_id]

        # Build an updated copy of the row and write it back in place
        tables.nodes[node_id] = node.replace(
            population=pop_id,
            individual=ind_id,
        )

Let's check that the nodes have been updated:

In [None]:
# Preview at most five nodes (fewer when sample_info has under five rows).
preview_count = min(5, len(sample_info))
for i in range(preview_count):
    node = tables.nodes[i]
    population, individual = node.population, node.individual
    print(f"Node {i}: population={population}, individual={individual}")

Add provenance information to document how this TreeSequence was generated:

In [None]:
# Create a provenance record following the tskit provenance schema, which
# requires "schema_version", "software", "parameters" and "environment".
# The timestamp is timezone-aware (UTC) so that it is unambiguous.
provenance_timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

provenance_record = {
    "schema_version": "1.0.0",
    "software": {
        "name": "threads",
        "version": "v0.2.1"
    },
    "parameters": {
        "input_file": "data/toInfer/threads/ts300I2k.vcf.gz",
        "metadata_added": True,
        "populations_added": len(breed_to_id),
        "individuals_added": len(individual_to_id)
    },
    # OS and library versions of the machine running this notebook
    "environment": tskit.provenance.get_environment(),
    "timestamp": provenance_timestamp,
    "description": "TreeSequence generated with threads and metadata added for populations and individuals"
}

# Fail early if the record does not conform to the provenance schema
tskit.validate_provenance(provenance_record)

# Add provenance to tables
tables.provenances.add_row(
    timestamp=provenance_timestamp,
    record=json.dumps(provenance_record)
)

Now, create a new TreeSequence with the updated tables:

In [None]:
# Build a TreeSequence from the edited tables.
new_ts = tables.tree_sequence()

# Summary counts of the new tree sequence.
n_populations = new_ts.num_populations
n_individuals = new_ts.num_individuals
n_nodes = new_ts.num_nodes
print(f"Num of populations: {n_populations}")
print(f"Num of individuals: {n_individuals}")
print(f"Num of nodes: {n_nodes}")

new_ts