In [None]:
# Merge ROH files and filter
# Altair. Using output from `scikit-allel`
import altair as alt
import pandas as pd
import polars as pl

from datetime import date

# pedigree_file = "/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-17_10-28-20.tsv"
# date_of_pedigree = date(2024, 4, 17)
# dates = pl.read_csv(pedigree_file, separator="\t", infer_schema_length=None, columns=["Id", "Most Recent Acq"]).with_columns(
#     #pl.col("Account Description").str.contains("eserved for breeding").alias("is_reserved_for_breeding"),
#     #pl.col("Account Description").str.contains("reeder").alias("is_breeder"),
# ).with_columns( # Change str to date type
#     date = pl.col("Most Recent Acq").str.to_datetime("%m-%d-%Y %H:%M"),
# ).drop("Most Recent Acq").with_columns(  # Set death date of living animals to date of pedigree
#     #pl.col("Date of Death").fill_null(date_of_pedigree),
# )



# Read colony info
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(colonies_file, separator="\t", infer_schema_length=None)

# Chromosome names "1".."20" and their lengths in bp, listed in the same order
chromosomes = [str(number) for number in range(1, 21)]
chromosome_lengths = [
    223_616_942, 196_197_964, 185_288_947, 169_963_040, 187_317_192,
    179_085_566, 169_868_564, 145_679_320, 134_124_166, 99_517_758,
    133_066_086, 130_043_856, 108_737_130, 128_056_306, 113_283_604,
    79_627_064, 95_433_459, 74_474_043, 58_315_233, 77_137_495,
]  # For Mmul_10
# Two-chromosome subset for quick runs:
#chromosomes = ["1", "2"]
#chromosome_lengths = [223_616_942, 196_197_964]  # For Mmul_10


# Load one ROH table per chromosome, then stack them into a single frame.
# NOTE: this stays a plain `for` loop on purpose; `chrom` remains defined after
# the loop and a later cell reads it.
roh_dfs = []
for chrom in chromosomes:
    roh_file = f"/master/abagwell/variant-analysis/results/rhesus/roh/scikit-allel/with_is_accessible/U42_WGS_WES.SNP.chr{chrom}.roh_poisson.tsv"
    chrom_calls = pl.read_csv(
        roh_file,
        separator="\t",
        has_header=True,
        comment_prefix="#",
        dtypes=[pl.String, pl.String, pl.Int32, pl.Int32, pl.Int32, pl.Boolean],
    )

    # Keep only large ROHs
    large_calls = chrom_calls.filter(pl.col("length") > 1_000_000)

    # Animal ID = text before the first "_" of the sample name, minus its first 3 characters
    with_animal = large_calls.with_columns(
        animal=pl.col("sample").str.split("_").list.get(0).str.slice(3)
    )

    # Attach colony info by animal ID
    roh_df = with_animal.join(colonies, how="left", left_on="animal", right_on="Id").drop("tag")
    roh_dfs.append(roh_df)

roh_df = pl.concat(roh_dfs)


In [None]:
# Read sample names from fROH file
# The fROH files are per chromosome; only one is read here because it is used
# solely as a list of sample names (assumes every per-chromosome file lists the
# same samples — TODO confirm).
# Pick the chromosome explicitly instead of relying on the `chrom` loop variable
# left over from the ROH-loading cell; this is the same file that loop ended on.
froh_chrom = chromosomes[-1]
froh_file = f"/master/abagwell/variant-analysis/results/rhesus/roh/scikit-allel/with_is_accessible/U42_WGS_WES.SNP.chr{froh_chrom}.froh_poisson.tsv"
froh_df = pl.read_csv(froh_file, comment_prefix="#", has_header=True, separator="\t",
    new_columns=["sample", "chrom", "froh"],
    dtypes=[pl.String, pl.String, pl.Float64]
).with_columns(
    # Animal ID = text before the first "_" of the sample name, minus its first 3 characters
    pl.col("sample").str.split("_").list.get(0).str.slice(3).alias("animal")
)

In [None]:


# All distinct sample names in the fROH table, sorted
sample_names = list(froh_df.unique("sample").sort("sample")["sample"])

# Create dummy lines: one zero-length row per (sample, chromosome) pair, so that
# every sample has at least one row on every chromosome.
dummies = pl.DataFrame([
    pl.Series("sample", sample_names, dtype=pl.String),
]).join(pl.DataFrame([pl.Series("chrom", chromosomes, dtype=pl.String),]), how="cross").with_columns(
    pl.lit(0).alias("start"),
    pl.lit(0).alias("stop"),
    pl.lit(0).alias("length"),
    pl.lit(False).alias("is_marginal"),
).with_columns(
    pl.col("sample").str.split("_").list.get(0).str.slice(3).alias("animal")
).join(colonies, how="left", left_on="animal", right_on="Id")

# Stack the per-chromosome ROH frames with the dummy rows.
# This is built from `roh_dfs` rather than `roh_df` because `roh_df` is
# reassigned at the bottom of this cell; reading it here would make a re-run of
# the cell operate on its own previous output.
roh_df_with_dummy = pl.concat([*roh_dfs, dummies])

# Keep only 1 instance of each animal
no_duplicates = roh_df_with_dummy.group_by("animal").agg("sample").with_columns(
    # Sort each animal's sample names in descending order and keep the first.
    # Per the original note this picks the WGS sample over the WES one
    # (presumably due to how sample names are suffixed — TODO confirm).
    pl.col("sample").list.sort(descending=True).list.first(),
)
roh_df_no_duplicates = roh_df_with_dummy.join(no_duplicates, on="sample", how="semi"
).with_columns(
    # Keep only animal name
    sample = pl.col("animal")
)

# Alternatively, keep only duplicates: animals that have more than one distinct sample
only_duplicates = roh_df_with_dummy.group_by("animal").agg("sample").with_columns(
    pl.col("sample").list.unique()
).filter(
    pl.col("sample").list.len() > 1
).explode("sample")
# Full sample names are kept here (not replaced by the animal name)
roh_df_only_duplicates = roh_df_with_dummy.join(only_duplicates, on="sample", how="semi")

# Set which df to use: roh_df_no_duplicates or roh_df_only_duplicates
#roh_df = roh_df_only_duplicates
roh_df = roh_df_no_duplicates

In [None]:
# Altair plotting. The ROH table assembled in the cells above is read from `scikit-allel` output.

# Turn off Altair's maximum-rows check so large tables can be passed to charts.
alt.data_transformers.disable_max_rows()

def roh_plot_colony(roh_df, colony, color):
    """Build the ROH overview figure for a single colony.

    For every chromosome the figure has two stacked panels: the ROH segments of
    each sample drawn as horizontal bars, and a histogram of how many ROH
    segments overlap each 1 Mb bin. A final panel on the right shows the total
    ROH length per sample.

    Relies on the module-level `chromosomes` and `chromosome_lengths` lists,
    which must be in the same order.

    Args:
        roh_df: polars DataFrame with (at least) the columns "sample", "chrom",
            "start", "stop", "length" and "Colony". Rows with length 0 are
            treated as placeholders and excluded from the histogram.
        colony: value matched against the "Colony" column to select rows.
        color: colour applied to every mark (passed to `alt.value`).

    Returns:
        A horizontally concatenated Altair chart.
    """
    # Width (bp) of the histogram bins
    bin_size = 1_000_000

    # Largest per-sample total ROH length (Mb), computed BEFORE the colony filter
    # so that plots of different colonies share the same x-axis range.
    max_froh = roh_df.group_by("sample").agg(pl.sum("length"))["length"].max() / 1_000_000

    # Keep only members of specified colony
    roh_df = roh_df.filter(pl.col("Colony") == colony)

    # Count ROH segments per fixed 1 Mb bin.
    # Bin starts are floored to a multiple of `bin_size`; generating positions
    # from each segment's own start would give every segment its own private
    # positions, so overlapping segments would almost never be counted together.
    bar_df = roh_df.filter(
        # Remove dummy (zero-length) rows
        pl.col("length") > 0
    ).with_columns(
        pl.int_ranges(
            (pl.col("start") // bin_size) * bin_size,
            pl.col("stop"),
            bin_size,
        ).alias("position")
    ).explode("position").group_by("chrom", "position", "Colony").agg(pl.len())

    # Shared y-axis maximum for all histograms; fall back to 1 when the colony
    # has no ROH at all (max() of an empty column is None).
    max_bar_height = bar_df["len"].max() or 1

    columns = []
    for idx, chrom in enumerate(chromosomes):
        # Look the length up by position in the list rather than by int(chrom),
        # so subsets and non-numeric chromosome names also work.
        chrom_length = chromosome_lengths[idx]
        chrom_length_mb = chrom_length / 1_000_000
        panel_width = chrom_length / 1500000

        # Only keep y-axis labels on leftmost plot
        if idx == 0:
            axis = alt.Axis()
        else:
            axis = None

        # ROH segments on this chromosome
        chrom_roh_df = roh_df.filter(pl.col("chrom") == chrom)

        roh_plot = alt.Chart(chrom_roh_df).mark_bar().encode(
            alt.X("mb_start_pos:Q").title(["Position", "(Mb)"]).scale(domainMin=0, domainMax=chrom_length_mb, clamp=True),
            alt.X2("mb_stop_pos:Q"),
            alt.Y("sample:O", axis=axis, title="Sample"),
            color = alt.value(color),
            tooltip=[
                alt.Tooltip("chrom", title="Chr"),
                alt.Tooltip("start", title="Start (bp)"),
                alt.Tooltip("stop", title="End (bp)"),
                alt.Tooltip("length", title="Length"),
            ]
        ).properties(
            width=panel_width,
            height=alt.Step(10),
            title=["RoH", f"chr{chrom}"]
        ).transform_calculate(
            mb_start_pos = 'datum.start / 1000000',
            mb_stop_pos = 'datum.stop / 1000000',
        )

        chrom_bar_df = bar_df.filter(pl.col("chrom") == chrom)

        # Histogram: full axis on the leftmost plot, unlabeled axis elsewhere
        if idx == 0:
            axis = alt.Axis()
        else:
            axis = alt.Axis(labels=False, title=None)
        bar_plot = alt.Chart(chrom_bar_df).mark_bar(size=1).encode(
            alt.X("mb_pos:Q").title(["Position", "(Mb)"]).scale(domainMin=0, domainMax=chrom_length_mb, clamp=True),
            alt.Y("len", axis=axis).title("Count").scale(domainMin=0, domainMax=max_bar_height),
            color=alt.value(color)
        ).properties(
            width=panel_width,
            height=70,
            title=["RoH", f"chr{chrom}"]
        ).transform_calculate(
            mb_pos = 'datum.position / 1000000',
        )

        # Segment panel on top, histogram underneath
        columns.append(alt.vconcat(roh_plot, bar_plot))

    # Total ROH length per sample within the colony
    total_len_roh = roh_df.group_by("sample").agg(pl.sum("length"))

    total_roh = alt.Chart(total_len_roh).mark_bar().encode(
        alt.X("mb_length:Q", title="Total length (Mb)").scale(domainMin=0, domainMax=max_froh, clamp=True),
        alt.Y("sample:O", title="Sample"),
        color=alt.value(color),
        tooltip=[
            alt.Tooltip("length", title="Length"),
        ]
    ).properties(
        height=alt.Step(10),
        width=100,
        title="Total ROH Length",
    ).transform_calculate(
        mb_length = 'datum.length / 1000000',
    )
    columns.append(total_roh)

    return alt.hconcat(*columns)

In [None]:
# ROH figure for colony 1
roh_plot_colony(roh_df, colony=1, color="teal")

In [None]:
# ROH figure for colony 2
roh_plot_colony(roh_df, colony=2, color="orange")