In [1]:
from datetime import date

import altair as alt
import polars as pl

# # Rhesus
pedigree_file = "/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-17_10-28-20.tsv"
date_of_pedigree = date(2024, 4, 17)

# Marmoset
# pedigree_file = "/master/abagwell/variant-analysis/resources/marmoset/pedigree/Demographics_2024-05-02_12-34-33.tsv"
# date_of_pedigree = date(2024, 4, 17)

data = pl.read_csv(pedigree_file, separator="\t", infer_schema_length=None).with_columns(
    pl.col("Account Description").str.contains("eserved for breeding").alias("is_reserved_for_breeding"),
    pl.col("Account Description").str.contains("reeder").alias("is_breeder"),
).drop_nulls("Date of Birth").with_columns( # Change str to date type
    pl.col("Date of Birth").str.to_date("%m-%d-%Y"),
    pl.col("Date of Death").str.to_date("%m-%d-%Y"),
).with_columns(  # Set death date of living animals to date of pedigree
    pl.col("Date of Death").fill_null(date_of_pedigree),
)


#.group_by("Account Description").agg().sort("Account Description")#.filter(pl.col("Account Description") == True)

# Find individuals who lived at least one year
older_offspring = data.filter(pl.col("Earliest Acq Category") == "Birth"
    ).filter((pl.col("Date of Death") - pl.col("Date of Birth")) >= pl.duration(days=0)
)
# .with_columns(
#     (pl.col("Date of Death") - pl.col("Date of Birth")).alias("age") #>= pl.duration(days=0)
# )


reserved_for_breeding = data.filter((pl.col("is_reserved_for_breeding") == True)).filter(pl.col("Date of Death").is_null()).select("Id").with_columns(pl.lit(0).cast(pl.UInt32).alias("Num of offspring"))

# Only count offspring if at least 1 year old
#year_old_offspring = data.filter()

sires = data.join(older_offspring, left_on="Id", right_on="Sire").group_by("Id").agg(
    pl.len().alias("Num of offspring"), pl.col("Date of Birth").first(), pl.col("Date of Death").first(), pl.col("is_breeder").first()
)
dam = data.join(older_offspring, left_on="Id", right_on="Dam").group_by("Id").agg(
    pl.len().alias("Num of offspring"), pl.col("Date of Birth").first(), pl.col("Date of Death").first(), pl.col("is_breeder").first()
)

breeders = pl.concat([sires, dam]).sort("Id")

#pl.concat([breeders, reserved_for_breeding]).sort("Id")

In [2]:
breeders#.filter(pl.col("is_breeder") == True)

Id,Num of offspring,Date of Birth,Date of Death,is_breeder
str,u32,date,date,bool
"""10235""",3,1991-03-22,2002-04-22,false
"""10244""",4,1991-03-29,2007-01-26,false
"""10250""",2,1991-04-08,2003-06-24,false
"""10338""",4,1991-07-02,2013-08-19,false
"""10383""",3,1991-07-31,2008-02-15,false
…,…,…,…,…
"""9140""",2,1983-06-03,1996-09-10,false
"""9590""",4,1990-04-05,2002-07-19,false
"""9710""",6,1990-06-09,2002-10-16,false
"""9793""",4,1990-07-27,2007-01-24,false


In [3]:
#data["Earliest Acq Category"].unique()

In [4]:
#data.join(older_offspring, on="Id", how="anti").filter(pl.col("Earliest Acq Category") != "Acquisition")
#older_offspring

# Count non-"breeders" or "breeders" number of offspring and grand offspring
def descendants(is_breeder):
    """Find offspring and grand offspring"""
    sire_offspring = data.join(data.filter(pl.col("is_breeder") == is_breeder), left_on="Sire", right_on="Id").select("Id", "Sire", "Dam", "Earliest Acq Category")
    dam_offspring = data.join(data.filter(pl.col("is_breeder") == is_breeder), left_on="Dam", right_on="Id").select("Id", "Sire", "Dam", "Earliest Acq Category")

    offspring = pl.concat([sire_offspring, dam_offspring]).unique().filter(pl.col("Earliest Acq Category") == "Birth")

    sire_offspring = offspring.join(offspring, left_on="Sire", right_on="Id").select("Id", "Sire", "Dam", "Earliest Acq Category")
    dam_offspring = offspring.join(offspring, left_on="Dam", right_on="Id").select("Id", "Sire", "Dam", "Earliest Acq Category")

    grand_offspring = pl.concat([sire_offspring, dam_offspring]).unique().unique().filter(pl.col("Earliest Acq Category") == "Birth")
    return offspring, grand_offspring

breeder_offspring, breeder_grand_offspring = descendants(is_breeder=True)
nonbreeder_offspring, nonbreeder_grand_offspring = descendants(is_breeder=False)


In [5]:
#nonbreeder_grand_offspring

In [6]:
#breeder_grand_offspring.join(nonbreeder_grand_offspring, on="Id", how="anti")

In [7]:
#breeders#.with_columns(pl.col("Id").sort())
#data

In [8]:
demographics = breeders.with_columns(  # Crate column with list of years alive
    pl.date_ranges(pl.col("Date of Birth"), pl.col("Date of Death"), interval="1y").alias("Year"),
).explode("Year"  # Create row for every year alive for every animal
).with_columns(  # Convert date to year
    pl.col("Year").dt.round("1y")#.dt.year()  # .cast(pl.Date),
)
# .filter(
#     pl.col("is_breeder") == True
# )

## Notes:
# - Animals without a birth date are removed.
# - Depending on date of birth, not all animals show up in last year due to the year interval

In [9]:
demographics.group_by("Id").agg()

Id
str
"""30125"""
"""28110"""
"""38244"""
"""33926"""
"""16649"""
…
"""14396"""
"""8961"""
"""33960"""
"""31961"""


In [10]:
# Using output from RFMix
Q_file = "/master/abagwell/variant-analysis/results/rhesus_prev/admixture/RFMix.chr10.rfmix.Q"
Q = pl.read_csv(Q_file, separator="\t", skip_rows=1).filter( # Remove reference samples
    pl.col("#sample").str.starts_with("W")
).with_columns( # Remove seq prefixes
    pl.col("#sample").str.strip_prefix("WES").str.strip_prefix("WGS")
)

In [11]:
# # Colony assignments
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(colonies_file, separator="\t", infer_schema_length=None)

In [12]:
colony_demographics = demographics.join(colonies, how="left", on="Id")

merged = colony_demographics.join(Q, how="left", left_on="Id", right_on="#sample"
).group_by("Year", "Colony").agg(pl.count("Id").alias("Count"), pl.mean("Indian"), pl.mean("Chinese")).drop_nulls()

In [13]:
# Plot of u42 colony sizes over time. Counts are small because this colony designations file only includes animals that are sequenced.

alt.Chart(demographics).mark_line(line=True).encode(
    alt.X("Year", title="Year"),
    alt.Y("Count", title="Population Size"),
    #color=alt.Color("Colony:O").scale(scheme='dark2'),
).properties(
    title=f"Rhesus Population over Time\nas of {date_of_pedigree}",
    width=340
)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000).

Try enabling the VegaFusion data transformer which raises this limit by pre-evaluating data
transformations in Python.
    >> import altair as alt
    >> alt.data_transformers.enable("vegafusion")

Or, see https://altair-viz.github.io/user_guide/large_datasets.html for additional information
on how to plot large datasets.

alt.Chart(...)

In [14]:
# Read runs file
runs_file = "/master/abagwell/variant-analysis/resources/rhesus/samples/runs.all.list"
runs = pl.read_csv(runs_file, separator="\t", has_header=False, new_columns=["batch/run"]).with_columns(
    pl.col("batch/run").str.split("/").list.get(0).alias("batch"),
    pl.col("batch/run").str.split("/").list.get(1).str.split("_").list.get(0).alias("sample"),
    pl.col("batch/run").str.split("/").list.get(1).str.split("_").list.get(1).alias("library"),
# Pull out seq type and indiv. Also made seq types an Enum, which can be sorted later
).with_columns(
    pl.col("sample").str.slice(0, 3).cast(pl.Enum(["unsequenced", "LRS", "WGS", "WES", "GBS"])).alias("seq"),
    pl.col("sample").str.slice(3).alias("indiv"),
# Group by batch + indiv (to not double count multiple runs from same batch) and then by indiv
).group_by("batch", "indiv").agg(pl.first("seq")).group_by("indiv").agg("seq"
# One of two ways to filter seq. Change this depending on which method is preferred. The first doesn't current work though.
).with_columns(
    # Keep only one of each sequencing type for each animal
    #pl.col("seq").list.unique()
    # Keep only largest sequencing type for each animal
    pl.col("seq").list.sort().list.first()
# Keep only certain seq types
)
# .filter(
#     (pl.col("seq") == "WGS") | (pl.col("seq") == "WES")
# )


In [15]:
runs

indiv,seq
str,enum
"""35038""","""GBS"""
"""31414""","""GBS"""
"""36932""","""GBS"""
"""27783""","""WES"""
"""36805""","""GBS"""
…,…
"""28403""","""WES"""
"""34722""","""GBS"""
"""17591""","""WGS"""
"""44527""","""WGS"""


In [16]:
colony_demographics_runs = colony_demographics.join(runs, how="left", left_on="Id", right_on="indiv").with_columns(
    pl.col("seq").fill_null("unsequenced")

).group_by("Year", "seq").agg(pl.count("Id").alias("Count")).sort("Year", "seq", descending=True)

In [17]:
# For testing

colony_demographics_runs
#colony_demographics_runs.sort("Year", "seq").filter(pl.col("Year") == 2021)

Year,seq,Count
date,enum,u32
2024-01-01,"""GBS""",30
2024-01-01,"""WES""",73
2024-01-01,"""WGS""",58
2024-01-01,"""LRS""",1
2024-01-01,"""unsequenced""",14
…,…,…
1980-01-01,"""unsequenced""",7
1979-01-01,"""unsequenced""",7
1978-01-01,"""unsequenced""",6
1977-01-01,"""unsequenced""",3


In [18]:
# The total area shows the number of animals overall, subdivided by type of sequencing (or if unsequenced).
# In order to keep the sum of sequencing types equal to the true total, sequencing types are given priority as follows: unsequenced > LRS > WGS > WES > GBS

alt.data_transformers.disable_max_rows()
alt.Chart(colony_demographics_runs).mark_area().encode(
    alt.X("Year", title="Year", scale=alt.Scale(domainMax=alt.DateTime(year=2023, month=1, day=1), clamp=True)),#.scale(domainMax=2023, clamp=True),
    alt.Y("Count:Q", title="Population size"),
    color=alt.Color("seq:O", title="Sequencing method", sort=["unsequenced", "LRS", "WGS", "WES", "GBS"]).scale(scheme='dark2'), #, sort=["unsequenced", "LRS", "WGS", "WES"]
    order=alt.Order('seq_order:O', sort='ascending')
).properties(
    title=["Rhesus Breeder Population over Time"], #\nas of {date_of_pedigree} #, "With at Least One Offspring ≥ 3 Years Old"
    #height=2000,
    width=340,
).transform_calculate(
    order="{'unsequenced': 0, 'LRS': 1, 'WGS': 2, 'WES': 3, 'GBS': 4}[datum.seq_order]"  
)


In [19]:
# Plot by portion sequenced per year (instead of counts)


In [20]:
colony_demographics_runs.group_by("Year").agg("seq", "Count", pl.sum("Count").alias("Sum")).with_columns(
    pl.col("seq").list.set_union(["unsequenced", "LRS", "WGS", "WES", "GBS"]).alias("all_seq")
    #pl.List("unsequenced", "LRS", "WGS", "WES", "GBS").alias("list")
).with_columns(
    pl.col("all_seq").list.set_difference(pl.col("seq")).alias("missing_seq")
).with_columns(
    # Add empty rows
    pl.col("seq").list.concat("missing_seq"),
    #pl.col("Count").list.concat(pl.col("missing_seq").list.len())
    pl.col("missing_seq").list.len().alias("len"),
    #pl.arange(0, 3).alias("arange")
)
# .with_columns(
#     pl.col("len")
# )

Year,seq,Count,Sum,all_seq,missing_seq,len
date,list[enum],list[u32],u32,list[enum],list[enum],u32
2024-01-01,"[""GBS"", ""WES"", … ""unsequenced""]","[30, 73, … 14]",176,"[""GBS"", ""WES"", … ""unsequenced""]",[],0
2023-01-01,"[""GBS"", ""WES"", … ""unsequenced""]","[62, 153, … 25]",349,"[""GBS"", ""WES"", … ""unsequenced""]",[],0
2022-01-01,"[""GBS"", ""WES"", … ""unsequenced""]","[71, 167, … 25]",382,"[""GBS"", ""WES"", … ""unsequenced""]",[],0
2021-01-01,"[""GBS"", ""WES"", … ""unsequenced""]","[74, 181, … 29]",408,"[""GBS"", ""WES"", … ""unsequenced""]",[],0
2020-01-01,"[""GBS"", ""WES"", … ""unsequenced""]","[78, 195, … 32]",440,"[""GBS"", ""WES"", … ""unsequenced""]",[],0
…,…,…,…,…,…,…
1980-01-01,"[""WES"", ""unsequenced"", … ""LRS""]","[1, 7]",8,"[""WES"", ""unsequenced"", … ""GBS""]","[""GBS"", ""WGS"", ""LRS""]",3
1979-01-01,"[""unsequenced"", ""GBS"", … ""WES""]",[7],7,"[""unsequenced"", ""LRS"", … ""GBS""]","[""GBS"", ""LRS"", … ""WES""]",4
1978-01-01,"[""unsequenced"", ""GBS"", … ""WES""]",[6],6,"[""unsequenced"", ""LRS"", … ""GBS""]","[""GBS"", ""LRS"", … ""WES""]",4
1977-01-01,"[""unsequenced"", ""GBS"", … ""WES""]",[3],3,"[""unsequenced"", ""LRS"", … ""GBS""]","[""GBS"", ""LRS"", … ""WES""]",4


In [21]:
by_fraction = colony_demographics_runs.group_by("Year").agg("seq", "Count", pl.sum("Count").alias("Sum")).explode("seq", "Count").with_columns(
    pl.col("Count").truediv("Sum").alias("Portion")
)
by_fraction

Year,seq,Count,Sum,Portion
date,enum,u32,u32,f64
2024-01-01,"""GBS""",30,176,0.170455
2024-01-01,"""WES""",73,176,0.414773
2024-01-01,"""WGS""",58,176,0.329545
2024-01-01,"""LRS""",1,176,0.005682
2024-01-01,"""unsequenced""",14,176,0.079545
…,…,…,…,…
1980-01-01,"""unsequenced""",7,8,0.875
1979-01-01,"""unsequenced""",7,7,1.0
1978-01-01,"""unsequenced""",6,6,1.0
1977-01-01,"""unsequenced""",3,3,1.0


In [22]:
by_fraction

Year,seq,Count,Sum,Portion
date,enum,u32,u32,f64
2024-01-01,"""GBS""",30,176,0.170455
2024-01-01,"""WES""",73,176,0.414773
2024-01-01,"""WGS""",58,176,0.329545
2024-01-01,"""LRS""",1,176,0.005682
2024-01-01,"""unsequenced""",14,176,0.079545
…,…,…,…,…
1980-01-01,"""unsequenced""",7,8,0.875
1979-01-01,"""unsequenced""",7,7,1.0
1978-01-01,"""unsequenced""",6,6,1.0
1977-01-01,"""unsequenced""",3,3,1.0


In [23]:
alt.data_transformers.disable_max_rows()
alt.Chart(by_fraction).mark_area().encode(
    alt.X("Year", title="Year", scale=alt.Scale(domainMax=alt.DateTime(year=2023, month=1, day=1), clamp=True)),#.scale(domainMax=2023, clamp=True),
    alt.Y("Portion:Q", title="Population size"),
    color=alt.Color("seq:O", title="Sequencing method", sort=["unsequenced", "LRS", "WGS", "WES", "GBS"]).scale(scheme='dark2'), #, sort=["unsequenced", "LRS", "WGS", "WES"]
    order=alt.Order('seq_order:O', sort='ascending')
).properties(
    title=["Rhesus Breeder Population over Time"], #\nas of {date_of_pedigree} #, "With at Least One Offspring ≥ 3 Years Old"
    #height=2000,
    width=340,
).transform_calculate(
    order="{'unsequenced': 0, 'LRS': 1, 'WGS': 2, 'WES': 3, 'GBS': 4}[datum.seq_order]"  
)
