In [38]:
# Count the sequence data

import polars as pl


seq_data = pl.read_csv("/master/abagwell/workspace/batch_seq/seq_data.tsv", comment_prefix="#", separator="\t"
).with_columns(
    pl.col("run").str.split("_").alias("run_split")
).with_columns(
    pl.col("run_split").list.get(0).alias("sample"),
    pl.col("run_split").list.get(1).alias("library"),
    pl.col("run_split").list.get(2).alias("flowcell"),
    pl.col("run_split").list.get(3).alias("lane"),
).with_columns(
    # Set library as "1" if doesn't exist. This is needed for later to prevent null values that would leave out these rows.
    pl.col("library").fill_null("1")
).with_columns(
    pl.concat_str("sample", "library", separator="_").alias("sample_library"),
    pl.col("sample").str.slice(3).alias("animal"),
    pl.col("sample").str.slice(0, length=3).alias("seq"),
).drop("run", "run_split", "sample")

batch_data = pl.read_csv("/master/abagwell/workspace/batch_seq/batch_data.tsv", comment_prefix="#", separator="\t"
).with_columns(
    pl.col("date").str.strptime(pl.Date, format="%Y-%m") #.str.to_date()
)

animal_data = pl.read_csv("/master/abagwell/workspace/batch_seq/demographics.tsv", comment_prefix="#", separator="\t"
).with_columns(
    # Reokace all species names that start with "P" as "Papio".
    # This is because there are three subspecies plus hybrids in this species group
    pl.col("species").map_elements(lambda x: "Papio spp." if "P" in x else x)
)
    

In [39]:
# Join first two tables
joined = seq_data.join(batch_data, on="batch").filter(
    # Remove batches that are from other institutes
    #pl.col("batch") != "marmosets_from_BCM"
    pl.col("grant") != "other_institute"
).group_by(
    # Consider sequences of the same animal from the same batch the same
    #"batch", "seq", "animal"
    # Consider sequences of the same animal from any batches as the same
    "seq", "animal",
).first(
    # Unnest aggregate lists
)

# Join third tables
joined2 = joined.join(animal_data, on="animal").sort("date"
).group_by_dynamic("date", every="1y", offset="4mo", closed="both", by=["species", "seq", "grant"]
).agg(pl.count("sample_library")
).sort("date", "species", "seq", "grant"
).rename({"date": "year",
          "sample_library": "sample_library_count",
          }).with_columns(
    pl.cum_sum("sample_library_count").over("species", "seq", "grant").alias("sample_library_cumulative_count")
)

In [45]:
#joined.write_csv("/master/abagwell/workspace/batch_seq/summary_stats.ignore_duplicates.tsv", separator="\t", date_format="%Y-%m-%d")
#by_batch = joined.group_by("batch").agg(pl.count("sample_library"))


In [49]:
joined2

species,seq,grant,year,sample_library_count,sample_library_cumulative_count
str,str,str,date,u32,u32
"""Macaca mulatta…","""GBS""","""U42""",2017-05-01,382,382
"""Macaca mulatta…","""GBS""","""U42""",2018-05-01,14,396
"""Macaca mulatta…","""GBS""","""U42""",2019-05-01,278,674
"""Callithrix spp…","""WGS""","""P51""",2020-05-01,3,3
"""Callithrix spp…","""WES""","""P51""",2021-05-01,6,6
"""Macaca mulatta…","""WES""","""U42""",2021-05-01,142,142
"""Macaca mulatta…","""WGS""","""U42""",2021-05-01,24,24
"""Callithrix spp…","""WES""","""P51""",2022-05-01,28,34
"""Macaca mulatta…","""WES""","""U42""",2022-05-01,306,448
"""Macaca mulatta…","""WGS""","""P51 supplement…",2022-05-01,111,111


In [47]:
#by_batch.glimpse()

Rows: 23
Columns: 2
$ batch          <str> 'marmoset_GBS', 'Rhesus_MHC_Barcode_List', 'X202SC23020672-Z01-F001', 'X202SC23114612-Z01-F001', 'X202SC21012362-Z01-F001', 'X202SC23102082-Z01-F001', 'X202SC22041999-Z01-F001', 'X202SC22011652-Z01-F001', 'X202SC22042002-Z01-F001', 'X202SC22111092-Z01-F001'
$ sample_library <u32> 85, 382, 52, 50, 27, 8, 24, 17, 6, 111



In [9]:
# Combine runs from same animal within same batch

# joined.filter(
#     # Remove batches that are from other institutes
#     pl.col("batch") != "marmosets_from_BCM"
# ).group_by("batch", "animal").agg(pl.col("seq").first"*")

# joined.group_by("animal", "seq").agg(
#     pl.all().sort_by('date').first()
# )

In [212]:
#joined.group_by("batch", "species").agg(pl.col("animal").count()).sort("batch")

In [52]:
# joined.with_columns(
#     pl.col("date").fill_null(strategy="min")
# ).sort("date").group_by_dynamic("date", every="1y").agg("batch")

In [213]:
# General stats
# joined.filter(
#     (pl.col("species").str.contains("Call"))
#     & (pl.col("seq") == "WES")
#     & (pl.col("date").is_between(pl.datetime(1990, 5, 1), pl.datetime(2025, 4, 30), closed="both"))
# )
# joined.group_by("seq", "species").agg(
#     "*"
#     #pl.all().sort_by('date').first()
# )

In [50]:
# Filter by desired

# If group_by + agg comes first, we exc

# joined.filter(
#     (pl.col("species").str.contains("Call"))
#     & (pl.col("seq") == "WES")
#     & (pl.col("date").is_between(pl.datetime(1990, 5, 1), pl.datetime(2025, 4, 30), closed="both"))
# ).group_by("animal", "seq").agg(
#     "*"
# )


# joined.filter(
#     (pl.col("species").str.contains("Call"))
#     & (pl.col("seq") == "WES")
#     & (pl.col("date").is_between(pl.datetime(1990, 5, 1), pl.datetime(2025, 4, 30), closed="both"))
# ).group_by("seq", "species").agg(
#     "*"
#     #pl.all().sort_by('date').first()
# )


In [51]:
# joined.filter(
#     (pl.col("species").str.contains("Call"))
#     & (pl.col("seq") == "WES")
#     & (pl.col("date").is_between(pl.datetime(1990, 5, 1), pl.datetime(2025, 4, 30), closed="both"))
# ).group_by("animal", "seq").agg(
#     pl.all().sort_by('date').first()
# )

In [144]:
# # Filter by desired
# #species = "Macaca mulatta"
# keep_only_first = False

# my_agg = pl.DataFrame.group_by


# joined.filter(
#     (pl.col("species").str.contains("Macaca"))
#     & (pl.col("seq") == "WES")
#     & (pl.col("date").is_between(pl.datetime(2020, 5, 1), pl.datetime(2024, 4, 30), closed="both"))
# ).group_by("animal").agg(
#     pl.all().sort_by('date').first()
# )
# #.group_by("animal", "seq").agg('*')