In [None]:
import altair as alt
import polars as pl
import yaml

# Path to the project's YAML config. Users should point this at their own copy.
configfile = "/master/abagwell/workspace/github_project/variant-analysis/config/marmoset.yaml"
with open(configfile, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

# Cohort/color table (tab-separated, '#' lines skipped); "Cohort" is forced to a string
# so that purely numeric cohort labels are not parsed as numbers.
colors = pl.read_csv(
    config["colors"],
    separator="\t",
    comment_prefix='#',
    schema_overrides={"Cohort": pl.String},
)

# Autosome names, one per line
autosomes_path = config["resources"] + "ref_fna/autosomes.list"
with open(autosomes_path) as autosomes_handle:
    chromosomes = autosomes_handle.read().splitlines()

In [None]:
# Load per-animal cohort metadata and the reference chromosome sizes

# Colony/cohort table. The whole file is scanned for schema inference, and the
# identifier-like columns are always read as strings.
colonies_file = config["cohorts"]
string_columns = ["Dam", "Sire", "Id", "Cohort"]
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
    schema_overrides=dict.fromkeys(string_columns, pl.String),
)

# Chromosome names and lengths
chromosome_df = pl.read_csv(
    config["resources"] + "ref_fna/chromosomes.tsv",
    separator='\t',
    comment_prefix='#',
)
# NOTE: this rebinds `chromosomes`, replacing the autosome list read in the first cell
chromosomes = chromosome_df["Chromosome"]
chromosome_lengths = chromosome_df["Length"].to_list()
# Denominator used for fROH: the summed length of every chromosome in the table
genome_length = sum(chromosome_lengths)

In [None]:
## For when using `bcftools roh`
# Column names are changed to match scikit-allel

# Variables to adjust
min_ROH_length = 1_000_000  # use 0 to keep every ROH regardless of size

# Output column names paired with the dtype each is read as, in file order
roh_columns = {
    "RG": pl.String,
    "sample": pl.String,
    "chrom": pl.String,
    "start": pl.Int32,
    "stop": pl.Int32,
    "length": pl.Int64,
    "Number of markers": pl.Int32,
    "Quality (average fwd-bwd phred score)": pl.Float32,
}


def _read_chrom_roh(chrom):
    """Read one chromosome's `bcftools roh` table, keeping only ROHs longer than `min_ROH_length`."""
    roh_file = f"{config['results']}roh/bcftools/{config['dataset']}.all.chr{chrom}.RG.roh"
    # Optional: chain `.join(colonies, how="left", left_on="sample", right_on="Id")`
    # after the filter to attach cohort metadata to every ROH.
    return pl.read_csv(
        roh_file,
        comment_prefix="$",
        separator="\t",
        skip_lines=3,
        schema_overrides=list(roh_columns.values()),
        new_columns=list(roh_columns),
    ).filter(
        # Keep only large ROHs
        pl.col("length") > min_ROH_length
    )


# One table per chromosome, stacked into a single per-segment ROH table
roh_dfs = [_read_chrom_roh(chrom) for chrom in chromosomes]
roh_df = pl.concat(roh_dfs)


In [None]:
# Per-animal summary: fraction of the genome in ROHs (fROH) and the number of ROHs
# To keep only certain cohorts, add a filter before the group_by, e.g.
#     .filter(~pl.col("Cohort").is_in(["Conventional source", "Brooks source", "NEPRC source"]))
roh_segments = roh_df.select("sample", "chrom", "start", "stop", "length")
per_indiv_df = roh_segments.group_by("sample").agg(
    froh=pl.col("length").sum() / genome_length,
    number_of_ROHs=pl.col("length").count(),
)

In [None]:
# NOTE: Every animal listed in `colonies` must actually have been run through the fROH analysis.
# Animals missing from `per_indiv_df` are given 0 ROHs below, which is wrong for animals
# that simply were not analysed.

# Cohort order taken from the colors table (TODO: Does the Enum cast actually do anything?)
cohort_order = pl.Enum(colors["Cohort"])

all_indiv_df = (
    colonies
    .join(per_indiv_df, how="left", left_on="Id", right_on="sample")
    .with_columns(
        pl.col("Cohort").cast(cohort_order),
        # Assign 0 to the samples not found to have ROHs
        pl.col("froh", "number_of_ROHs").fill_null(0),
    )
)

In [None]:
# Scatter of fROH against ROH count: one point per animal, colored by cohort
cohort_palette = alt.Scale(
    domain=list(colors["Cohort"]),
    range=list(colors["Color"]),
)

roh_scatter = alt.Chart(all_indiv_df).mark_circle(opacity=0.9).encode(
    x=alt.X("number_of_ROHs", title="Number of ROHs"),
    y=alt.Y("froh", title="fROH"),
    color=alt.Color("Cohort", title="Cohorts", scale=cohort_palette),
).properties(
    title="Individuals by ROH"
)

roh_scatter  # to export: .save("/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.by_indiv.min1Mb.html")

In [None]:
# Build the per-animal table used by the cohort plots below and add a `color_idx` column.
#
# NOTE: this REBINDS `roh_df`. Before this cell it is the per-segment `bcftools roh` table;
# afterwards it holds the rows of `all_indiv_df` (the `colonies` columns plus `froh` and
# `number_of_ROHs`), one row per animal. Code that expects the per-segment table
# (e.g. `plot_trio_roh`, `plot_all_trio_roh`) must run before this cell.
#
# Earlier variant started from the per-segment table instead:
# roh_df = roh_df.drop_nulls("Cohort"
roh_df = all_indiv_df.drop_nulls("Cohort"
# Collapse to one row per cohort; every other column becomes a list of that cohort's values
).group_by("Cohort").agg("*").with_columns(
# Index populations so that year ranges have their own color
    pl.col("Cohort").cast(pl.Enum(
        colors["Cohort"]
    )),
# 1-based index per cohort row; 0 is kept free for the cohorts zeroed out below
).with_row_index("pop_idx", offset=1).with_columns(
    # Find which are year ranges.
    # NOTE: despite its name, `is_year` is 1 when the cohort name has NO "-" and 0 when it
    # contains one (names containing "-" are the ones treated as year ranges).
    is_year = pl.col("Cohort").cast(pl.String).str.contains("-").not_().cast(pl.Int8)
).with_columns(
    # Set the index of year ranges to 0, so every cohort with "-" in its name shares pop_idx 0
    pl.col("pop_idx").mul("is_year")
# Sort by the Enum-cast Cohort column so the grouping below sees cohorts in a fixed order
).drop("is_year").sort("Cohort"#.explode(pl.exclude("pop_idx"))
).group_by(
    # Create color index: groups are numbered in order of first appearance after the sort,
    # and all cohorts sharing pop_idx 0 end up with the same color_idx
    "pop_idx", maintain_order=True
# The first explode undoes this second grouping (back to one row per cohort);
# the second explode undoes the first grouping (back to one row per animal).
).agg('*').with_row_index("color_idx").drop("pop_idx").explode(pl.exclude("color_idx")).explode(pl.exclude("Cohort", "color_idx"))

In [None]:
# # Plot ROH lengths by group

# # mean_roh_df = roh_df.group_by("Cohort", "color_idx").agg(pl.mean("length"))
# mean_roh_df = roh_df.group_by("Cohort").agg(pl.mean("length"))

# # # Index populations so that year ranges have their own color
# # ).with_columns(
# #     pl.col("Cohort").cast(pl.Enum(
# #         colors["Cohort"]
# #     )),
# # ).with_row_index("pop_idx", offset=1).with_columns(
# #     # Find which are year ranges
# #     is_year = pl.col("Cohort").cast(pl.String).str.contains("-").not_().cast(pl.Int8)
# # ).with_columns(
# #     # Set the index of year ranges to 0
# #     pl.col("pop_idx").mul("is_year")
# # ).drop("is_year").sort("Cohort").group_by(
# #     # Create color index
# #     "pop_idx", maintain_order=True
# # ).agg('*').with_row_index("color_idx").drop("pop_idx"
# # ).explode("Cohort", "length")

# mean_roh_df




# alt.Chart(mean_roh_df).mark_bar().encode(
#     alt.X("Cohort", title="Cohort"),
#     alt.Y("length", title="Mean Length"),
#     alt.Color("Cohort:N", title=None, legend=None).scale(
#         domain = list(colors["Cohort"]),
#         range = list(colors["Color"])
#     ),
# ).properties(
#     title="ROH Across Cohorts"
# )#.save("/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.mean_roh.min1Mb.html")

In [None]:
def plot_trio_roh(df):
    """Plot the ROHs across all chromosomes of one trio.

    Args:
        df: polars DataFrame of ROH records with exactly the columns
            "sample", "chrom", "start", "stop", "length", "Colony" (in that order),
            so that it can be concatenated with the placeholder rows built here.

    Returns:
        An Altair chart made of one bar-chart panel per chromosome, concatenated
        horizontally. Only the leftmost panel shows the sample axis.

    Notes:
        Reads the notebook-level `roh_df`, `chromosome_lengths` and `colors`.
        `roh_df` must still be the per-segment ROH table (with a "chrom" column) when this
        is called; a later cell rebinds `roh_df` to a per-animal table.
        Chromosome names are assumed to be 1-based integers (as strings) that index
        `chromosome_lengths` in order.
    """

    # Create dummy records. That is, an ROH of length 0 for every combination of sample
    # and chromosome so that they all appear on the plot
    chromosomes = roh_df.select("chrom").unique()
    sequenced_animals = df.select("sample", "Colony").unique()
    dummy_records = sequenced_animals.join(chromosomes, how="cross").with_columns(
        start = pl.lit(0).cast(pl.Int32),
        stop = pl.lit(0).cast(pl.Int32),
        length = pl.lit(0).cast(pl.Int64),
    ).select(  # Reorder
        "sample", "chrom", "start", "stop", "length", "Colony"
    )

    # Concatenate ROHs with dummy records
    df = pl.concat([df, dummy_records]).sort("sample")  # TODO: Improve sorting for when needs to be numeric or animal ID has "X" in it

    # Iterate over the chromosome *values*. Iterating a polars DataFrame directly yields
    # its columns (Series), not its rows, and `.unique()` has no defined order, so pull
    # the names out as a list and sort them numerically to get chr1 as the leftmost panel.
    chrom_names = sorted(chromosomes["chrom"].to_list(), key=int)

    cohort_domain = list(colors["Cohort"])
    cohort_range = list(colors["Color"])

    columns = []
    for idx, chrom in enumerate(chrom_names):
        # Only keep y-axis labels on leftmost plot
        axis = alt.Axis() if idx == 0 else None
        chrom_length = chromosome_lengths[int(chrom) - 1]

        # Records (real and dummy) for this chromosome only
        chrom_df = df.filter(pl.col("chrom") == chrom)

        roh_plot = alt.Chart(chrom_df).mark_bar().encode(
            alt.X("mb_start_pos:Q", title=["Position", "(Mb)"]).scale(
                domainMin=0, domainMax=chrom_length/1_000_000, clamp=True
            ),
            alt.X2("mb_stop_pos:Q"),
            # NOTE(review): no "animal" column is created in this function; confirm the
            # intended sort field.
            alt.Y("sample:O", axis=axis, title="Sample", sort=alt.EncodingSortField(field='animal', order='ascending')),
            color = alt.Color("Colony:N").scale(
                domain = cohort_domain,
                range = cohort_range
            ),
            tooltip=[
                alt.Tooltip("chrom", title="Chr"),
                alt.Tooltip("start", title="Start (bp)"),
                alt.Tooltip("stop", title="End (bp)"),
                alt.Tooltip("length", title="Length"),
            ]
        ).properties(
            # Panel width is proportional to chromosome length
            width=chrom_length/1500000,
            height=alt.Step(10),
            #width=300, # For when only displaying one chromosome
            #title=["RoH", f"chr{chrom}"]
        ).transform_calculate(
            # Positions in Mb for the x axis
            mb_start_pos = 'datum.start / 1000000',
            mb_stop_pos = 'datum.stop / 1000000',
        )

        columns.append(roh_plot)

    return alt.hconcat(*columns)

In [None]:
def plot_all_trio_roh(df, first_trio=50, last_trio=75):
    """Plot ROHs for a window of fully sequenced trios, one row of panels per trio.

    A trio is a row of `colonies` whose "Id", "Sire" and "Dam" all appear in
    `df["sample"]`. Trios are ordered by "Id".

    Args:
        df: polars DataFrame with a "sample" column listing the sequenced animals.
        first_trio: 0-based index of the first trio to plot (inclusive, non-negative).
        last_trio: index one past the last trio to plot (exclusive). The window is
            clamped to the trios that exist, so asking for more trios than are
            available no longer raises.

    Returns:
        An Altair chart with the per-trio charts from `plot_trio_roh` stacked vertically.

    Notes:
        The ROH records themselves are taken from the notebook-level `roh_df` (which
        must be the per-segment table and carry a "Colony" column), not from `df`.
    """
    # Find sequenced trios
    sequenced_animals = df["sample"].unique()
    trios = colonies.filter(
        pl.col("Id").is_in(sequenced_animals) & pl.col("Sire").is_in(sequenced_animals) & pl.col("Dam").is_in(sequenced_animals)
    ).select("Id", "Sire", "Dam").sort("Id")

    # Requested window of trios; `slice` stops at the end of the table on its own
    selected_trios = trios.slice(first_trio, max(last_trio - first_trio, 0))

    rows = []
    for trio in selected_trios.iter_rows():
        # `trio` is an (Id, Sire, Dam) tuple
        trio_df = roh_df.filter(
            pl.col("sample").is_in(list(trio))
        ).select("sample", "chrom", "start", "stop", "length", "Colony")

        rows.append(plot_trio_roh(trio_df))
    return alt.vconcat(*rows)

In [None]:
# Chromosome name / length lookup table for subsequent plots
chrom_df = pl.DataFrame([
    pl.Series("chrom", chromosomes),
    pl.Series("chrom_len", chromosome_lengths),
])





# # total_len_roh = roh_df.group_by("sample", "color_idx", "Cohort").agg(pl.sum("length")).sort("sample").with_columns(
# total_len_roh = roh_df.sort("Id").with_columns(
#     pl.col("length").truediv(genome_length).alias("froh")
# # ).with_columns(
# #     # Add seq info for splitting by seq
# #     seq = pl.col("sample").str.slice(0, 3)
# # )
# # .with_columns(
# #     pl.col("sample").str.split("_").list.get(0).str.slice(3).alias("animal")
# ).group_by("sample").agg(pl.last("froh"), pl.last("color_idx"), pl.last("Cohort"))#.with_columns(
#     # Pull out the WGS over WES and select first
#     #pl.col("sample").list.sort(descending=True).list.first(),
#     #pl.col("froh").list.sort(descending=True).list.first(),
# # ).with_columns(
# #     cohort = pl.concat_str([pl.col("Colony"), pl.col("Cohort")], separator="_")
#     #pl.col("sample").str.split("_").list.get(0).str.slice(3),
# # ).filter(
# #     ~pl.col("Cohort").is_null()
# # )

In [None]:
# Boxplot of fROH over Cohorts
# Cohort order and colors both come from the `colors` table. The order is passed to
# `sort` as a plain list, like `domain`/`range` below, rather than as a polars Series.
boxplot = alt.Chart(roh_df).mark_boxplot().encode(
    alt.X("Cohort:N", title="Cohort", #axis=alt.Axis(labelAngle=-45),
        sort=list(colors["Cohort"]),
    ),
    alt.Y("froh:Q", title="fROH"),
    #alt.Column("seq", title="Sequence Type"),
    color=alt.Color("Cohort:N", legend=None).scale(
        domain = list(colors["Cohort"]),
        range = list(colors["Color"])
    )
).properties(
    #width=100
    title="fROH in Cohorts"
)

boxplot#.save("/master/abagwell/figures/final_plots/U42_WES.common_between_founding_cohorts2.froh_boxplot.min1Mb.html")

#boxplot.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.boxplot.maf_0.05.gt{min_ROH_length / 1_000_000}Mb.html")

In [None]:
# Violin plot (without error bars or means). See next code block for it with extra details
# One density estimate of fROH is computed per (Cohort, color_idx) group and drawn as a
# centered horizontal area, with one facet column per cohort.

violin = alt.Chart(roh_df
#                    .with_columns(
#    pl.col("Cohort").str.replace("Offspring of merger", "Merged"))
).transform_density(
    'froh',
    as_=['froh', 'density'],
    #extent=[0, 0.05],
    #extent=[0, 0.1],
    groupby=['Cohort', 'color_idx']
).mark_area(orient='horizontal').encode(
    alt.X("density:Q").stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y("froh:Q", title="fROH"),
    alt.Column("Cohort:N", title="Cohort",
          # TODO: Generalize this
          # Facet order is passed as a plain list, like `domain`/`range` below,
          # rather than as a polars Series
        sort=list(colors["Cohort"]),
        ).spacing(0).header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    color=alt.Color("Cohort:N").scale(
        domain = list(colors["Cohort"]),
        range = list(colors["Color"])
    ),
).properties(
    width=65,
    title="fROH Across Cohorts"
)


violin

#violin.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.violinplot.maf_0.05.gt{min_ROH_length / 1_000_000}Mb.full_height.html")
#violin.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.violinplot.maf_0.05.gt{min_ROH_length}Mb.max_0.1.html")

In [None]:
# Violin plot with error bars and means
# A layered chart cannot be faceted through an encoding channel, so the three layers are
# built without data, layered, and then faceted as a whole.
# Variations on the graph can be made by adjusting the scale on the alt.Y of the violin plot

# Variables to adjust
error_unit = 'stderr'  # Error bar extent: `stdev`, `stderr`, or `ci`
min_y = 0.0
max_y = 0.6  # Other values used before: 0.055, 0.08, 0.09, and 0.14 (zoomed on Merger)

# Layer 1: centered density of fROH per cohort
density_x = (
    alt.X("density:Q")
    .stack('center')
    .impute(None)
    .title(None)
    .axis(labels=False, values=[0], grid=False, ticks=True)
    .scale(nice=False, zero=False)
)
cohort_color = alt.Color("Cohort:N", legend=None).scale(
    domain=list(colors["Cohort"]),
    range=list(colors["Color"]),
)
violin = (
    alt.Chart()
    .transform_density(
        'froh',
        as_=['froh', 'density'],
        extent=[min_y, max_y],
        groupby=['Cohort', 'color_idx'],
    )
    .mark_area(orient='horizontal')
    .encode(
        x=density_x,
        y=alt.Y("froh:Q", title='fROH'),  # add .scale(domain=[min_y, max_y]) to fix the range
        color=cohort_color,
    )
    .properties(width=25)
)

# Layer 2: error bar of fROH
error = alt.Chart().mark_errorbar(extent=error_unit).encode(
    y=alt.Y('froh:Q', title='fROH'),
)

# Layer 3: mean fROH as a black dot
mean = alt.Chart().mark_circle(color='black').encode(
    y=alt.Y('mean(froh):Q', title='fROH'),
)

# Facet header: cohort labels rotated and placed under each violin
cohort_header = alt.Header(
    labelOrient='bottom', labelPadding=0, labelAnchor='middle', labelAngle=-90, labelBaseline="middle", labelAlign="right",
    title='Cohort', titleAlign="center", titleOrient='bottom',
)

# To plot a subset, filter `roh_df` here first, e.g.
#     roh_df.filter(pl.col("Cohort").is_in(["2018-2020", "Offspring of merger", "NEPRC source"]))
layered = (
    alt.layer(violin, error, mean, data=roh_df)
    .facet(alt.Column('Cohort', header=cohort_header))
    .resolve_scale(x=alt.ResolveMode("independent"))
    .configure_facet(spacing=0)
    .configure_title(anchor='middle')
    .properties(
        title="fROH in Cohorts"  # alternatives: "fROH of Merger", ["fROH in", "Founding Populations"]
    )
)

layered
# Export examples:
#layered.save("/master/abagwell/figures/final_plots/narrow_violins/U42_WES.common_between_founding_cohorts2.violinplot_froh.min1Mb.merger_zoomed.html")
#layered.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.froh_violinplot.gt{min_ROH_length / 1_000_000}Mb.{error_unit}.full_height.html")
#layered.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.froh_violinplot.{error_unit}.html")
#layered.save(f"{config["results"]}figures/{config["dataset"]}.all.froh_violinplot.{error_unit}.cutoff.html")
#layered.save(f"{config["results"]}figures/{config["dataset"]}.all.froh_violinplot.{error_unit}.html")

In [None]:
from itertools import combinations

import pandas as pd
import scipy.stats  # import the submodule explicitly; `import scipy` alone need not load it

# Do a two-sample t-test on fROH for each pair of cohorts.
# `ttest_ind` is called with its defaults, and the p-values are not adjusted for the
# number of pairs tested.

# Cohorts in their order of appearance in `roh_df`, so the pair order (and therefore the
# row order of the result table) is the same on every run
cohorts = roh_df["Cohort"].unique(maintain_order=True).to_list()

# Pull out each cohort's fROH values once instead of re-filtering for every pair
froh_by_cohort = {
    cohort: roh_df.filter(pl.col('Cohort') == cohort)['froh']
    for cohort in cohorts
}

results = []
for cohort1, cohort2 in combinations(cohorts, 2):
    result = scipy.stats.ttest_ind(froh_by_cohort[cohort1], froh_by_cohort[cohort2])
    results.append((cohort1, cohort2, result.statistic, result.pvalue, result.df))

df = pd.DataFrame(results, columns=['Cohort1', 'Cohort2', 'Statistic', 'p-Value', 'DF'])

In [None]:
# Display the pairwise t-test table (one row per cohort pair)
df