In [None]:
from datetime import date

import altair as alt
import polars as pl

# Pedigree/demographics export that drives the rest of the notebook.
pedigree_file = "/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-15_13-26-54.tsv"
# Stand-in death date for animals that are still alive, and the date shown in plot titles.
date_of_pedigree = date(year=2024, month=4, day=15)  # TODO: Change

# Tab-separated file; infer_schema_length=None lets polars inspect every row
# before settling on column dtypes.
data = pl.read_csv(
    pedigree_file,
    separator="\t",
    infer_schema_length=None,
)

In [None]:
# Build one row per animal per calendar year in which that animal was alive.
demographics = (
    data.drop_nulls("Date of Birth")
    .with_columns(  # Parse the string dates
        pl.col("Date of Birth").str.to_date("%m-%d-%Y"),
        pl.col("Date of Death").str.to_date("%m-%d-%Y"),
    )
    .with_columns(  # Set death date of living animals to date of pedigree
        pl.col("Date of Death").fill_null(date_of_pedigree),
    )
    .with_columns(  # Create column with list of calendar years alive
        # Enumerate calendar years directly instead of stepping one year at a
        # time from the birth date: anniversary steps skip the final calendar
        # year whenever the birth month/day falls after the end date.
        pl.int_ranges(
            pl.col("Date of Birth").dt.year(),
            pl.col("Date of Death").dt.year() + 1,  # upper bound is exclusive
        ).alias("Year"),
    )
    .explode("Year")  # Create row for every year alive for every animal
    .with_columns(  # Same integer dtype that `.dt.year()` yields
        pl.col("Year").cast(pl.Int32),
    )
)

## Notes:
# - Animals without a birth date are removed.
# - An animal is counted in every calendar year from its birth year through its
#   death year (or the pedigree year if it has no death date), inclusive.
In [None]:
# Ancestry table produced by RFMix (the first line of the file is skipped)
Q_file = "/master/abagwell/variant-analysis/results/rhesus_prev/admixture/RFMix.chr10.rfmix.Q"
Q = (
    pl.read_csv(Q_file, separator="\t", skip_rows=1)
    # Keep only sample names starting with "W"; this removes the reference samples
    .filter(pl.col("#sample").str.starts_with("W"))
    # Strip the sequencing-type prefix from the sample name
    .with_columns(
        pl.col("#sample").str.strip_prefix("WES").str.strip_prefix("WGS"),
    )
)

In [None]:
# Table assigning animals to colonies (tab-separated; dtypes inferred from all rows)
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
)

In [None]:
# Attach the colony assignment to every animal-year row.
colony_demographics = demographics.join(colonies, on="Id", how="left")

# Per year and colony: number of animals and mean ancestry proportions.
merged = (
    colony_demographics.join(Q, left_on="Id", right_on="#sample", how="left")
    .group_by("Year", "Colony")
    .agg(
        pl.col("Id").count().alias("Count"),
        pl.col("Indian").mean(),
        pl.col("Chinese").mean(),
    )
    # Discard groups with a null in any column (e.g. no colony assignment
    # or no ancestry values to average).
    .drop_nulls()
)

In [None]:
# Line chart of u42 colony sizes by year.
# Counts are small because the colony designations file only includes sequenced animals.

alt.Chart(merged).mark_line(line=True).encode(
    x=alt.X("Year").title("Year"),
    y=alt.Y("Count").title("Population Size"),
    color=alt.Color("Colony:O").scale(scheme='dark2'),
).properties(
    width=340,
    title=f"Rhesus Population over Time\nas of {date_of_pedigree}",
)

In [None]:
# Read runs file: one "batch/run" entry per line, no header
runs_file = "/master/abagwell/variant-analysis/resources/rhesus/samples/runs.all.list"

# Reusable pieces of the "batch/run" string
path_parts = pl.col("batch/run").str.split("/")
name_parts = path_parts.list.get(1).str.split("_")
# Enum so that sequencing types sort in this fixed order later on
seq_type = pl.Enum(["unsequenced", "LRS", "WGS", "WES", "GBS"])

runs = (
    pl.read_csv(runs_file, separator="\t", has_header=False, new_columns=["batch/run"])
    .with_columns(
        path_parts.list.get(0).alias("batch"),
        name_parts.list.get(0).alias("sample"),
        name_parts.list.get(1).alias("library"),
    )
    # First three characters of the sample are the seq type, the rest is the individual
    .with_columns(
        pl.col("sample").str.slice(0, 3).cast(seq_type).alias("seq"),
        pl.col("sample").str.slice(3).alias("indiv"),
    )
    # Collapse to one row per batch + indiv (so several runs from one batch
    # are not double counted) ...
    .group_by("batch", "indiv")
    .agg(pl.col("seq").first())
    # ... then gather every seq type seen for each indiv into a list
    .group_by("indiv")
    .agg(pl.col("seq"))
    # One of two ways to filter seq. Change this depending on which method is
    # preferred. The first doesn't currently work though.
    .with_columns(
        # Keep only one of each sequencing type for each animal
        #pl.col("seq").list.unique()
        # Keep only the seq type that sorts first in the Enum order for each animal
        pl.col("seq").list.sort().list.first()
    )
)


In [None]:
# Count animals per year and sequencing type; animals with no matching run
# are labelled "unsequenced".
colony_demographics_runs = (
    colony_demographics.join(runs, left_on="Id", right_on="indiv", how="left")
    .with_columns(pl.col("seq").fill_null("unsequenced"))
    .group_by("Year", "seq")
    .agg(pl.col("Id").count().alias("Count"))
    .sort("Year", "seq", descending=True)
)

In [None]:
# For testing: display the per-year, per-sequencing-type counts

colony_demographics_runs
# Alternative check (disabled): sort the table and look at a single year
#colony_demographics_runs.sort("Year", "seq").filter(pl.col("Year") == 2021)

In [None]:
# The total area shows the number of animals overall, subdivided by type of sequencing (or if unsequenced).
# In order to keep the sum of sequencing types equal to the true total, sequencing types are given priority as follows: unsequenced > LRS > WGS > WES > GBS

alt.data_transformers.disable_max_rows()
alt.Chart(colony_demographics_runs).transform_calculate(
    # Numeric rank of each sequencing type, used only to fix the stacking order.
    # The calculated field must carry the name the `order` channel reads
    # (`seq_order`) and must be looked up from the existing `seq` column.
    seq_order="{'unsequenced': 0, 'LRS': 1, 'WGS': 2, 'WES': 3, 'GBS': 4}[datum.seq]"
).mark_area().encode(
    alt.X("Year", title="Year"),
    alt.Y("Count:Q", title="Population Size"),
    # Legend/color order matches the stacking order
    color=alt.Color("seq:O", sort=["unsequenced", "LRS", "WGS", "WES", "GBS"]).scale(scheme='dark2'),
    order=alt.Order('seq_order:O', sort='ascending'),
).properties(
    title=f"Rhesus Population over Time\nas of {date_of_pedigree}",
    #height=2000,
    width=340,
)
