In [1]:
# Merge ROH files and filter

import altair as alt
import polars as pl

# Colony metadata table (tab-separated).
# The pedigree identifier columns are read as strings regardless of what
# schema inference would pick; infer_schema_length=None lets polars scan the
# whole file when inferring the remaining column types.
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
    schema_overrides={column: pl.String for column in ("Dam", "Sire", "Id")},
)

# Chromosome names "1" through "20", kept as strings
chromosomes = [str(number) for number in range(1, 21)]

# Chromosome lengths, in the same order as `chromosomes`.  For Mmul_10
chromosome_lengths = [
    223_616_942,
    196_197_964,
    185_288_947,
    169_963_040,
    187_317_192,
    179_085_566,
    169_868_564,
    145_679_320,
    134_124_166,
    99_517_758,
    133_066_086,
    130_043_856,
    108_737_130,
    128_056_306,
    113_283_604,
    79_627_064,
    95_433_459,
    74_474_043,
    58_315_233,
    77_137_495,
]


In [None]:
## For when using `bcftools roh`
# Column names are chosen to match scikit-allel

roh_dfs = []
for chrom in chromosomes:
    roh_file = f"/master/abagwell/variant-analysis/results/rhesus/roh/bcftools/U42_WES.common_between_founding_cohorts.chr{chrom}.RG.roh"

    # Headerless, tab-separated table; lines starting with "#" are skipped
    chrom_roh = (
        pl.read_csv(
            roh_file,
            separator="\t",
            has_header=False,
            comment_prefix="#",
            new_columns=["RG", "sample", "chrom", "start", "stop", "length", "Number of markers", "Quality (average fwd-bwd phred score)"],
            schema_overrides=[pl.String, pl.String, pl.String, pl.Int32, pl.Int32, pl.Int64, pl.Int32, pl.Float32],
        )
        # Keep only large ROHs
        .filter(pl.col("length") > 1_000_000)
        # Attach colony metadata to every ROH of a sample
        .join(colonies, how="left", left_on="sample", right_on="Id")
        .drop("tag")
    )
    roh_dfs.append(chrom_roh)

# Single frame holding the ROHs of all chromosomes
roh_df = pl.concat(roh_dfs)


In [3]:
def plot_trio_roh(df):
    """Plot the ROHs across all chromosomes of one trio.

    Parameters
    ----------
    df : pl.DataFrame
        ROH records containing at least the columns "sample", "chrom",
        "start", "stop", "length" and "Colony".  "start"/"stop" must be
        Int32 and "length" Int64 so the frame can be concatenated with the
        placeholder records built below (presumably this matches the schema
        of the ROH loader cell -- confirm if the loader changes).

    Returns
    -------
    alt.HConcatChart
        One bar chart per entry of the notebook-level `chromosomes` list,
        concatenated horizontally.  Only the leftmost chart shows a y-axis.

    Notes
    -----
    Reads the notebook-level `chromosomes` and `chromosome_lengths` lists,
    which must be in the same order.
    """
    record_columns = ["sample", "chrom", "start", "stop", "length", "Colony"]

    # Create dummy records. That is, an ROH of length 0 for every combination
    # of sample and chromosome so that they all appear on the plot.
    # The chromosomes come from the canonical list rather than from the ROH
    # table, so a chromosome without any ROH still gets a row for every sample.
    all_chromosomes = pl.DataFrame({"chrom": chromosomes}, schema={"chrom": pl.String})
    sequenced_animals = df.select("sample", "Colony").unique()
    dummy_records = sequenced_animals.join(all_chromosomes, how="cross").with_columns(
        start=pl.lit(0).cast(pl.Int32),
        stop=pl.lit(0).cast(pl.Int32),
        length=pl.lit(0).cast(pl.Int64),
    ).select(record_columns)  # Reorder

    # Concatenate ROHs with dummy records
    # TODO: Improve sorting for when needs to be numeric or animal ID has "X" in it
    df = pl.concat([df.select(record_columns), dummy_records]).sort("sample")

    columns = []
    for idx, (chrom, chrom_length) in enumerate(zip(chromosomes, chromosome_lengths)):
        # Only keep y-axis labels on leftmost plot
        axis = alt.Axis() if idx == 0 else None

        chrom_df = df.filter(pl.col("chrom") == chrom)

        roh_plot = alt.Chart(chrom_df).mark_bar().encode(
            alt.X("mb_start_pos:Q", title=["Position", "(Mb)"]).scale(
                domainMin=0, domainMax=chrom_length / 1_000_000, clamp=True
            ),
            alt.X2("mb_stop_pos:Q"),
            # Sort on the plotted field itself; the data has no "animal" column
            alt.Y("sample:O", axis=axis, title="Sample", sort="ascending"),
            color=alt.Color("Colony:N"),
            tooltip=[
                alt.Tooltip("chrom", title="Chr"),
                alt.Tooltip("start", title="Start (bp)"),
                alt.Tooltip("stop", title="End (bp)"),
                alt.Tooltip("length", title="Length"),
            ],
        ).properties(
            # Panel width is proportional to the chromosome length
            width=chrom_length / 1_500_000,
            height=alt.Step(10),
        ).transform_calculate(
            # Positions are plotted in megabases
            mb_start_pos="datum.start / 1000000",
            mb_stop_pos="datum.stop / 1000000",
        )

        columns.append(roh_plot)

    return alt.hconcat(*columns)

In [4]:
def plot_all_trio_roh(df, trio_start=50, trio_stop=75):
    """Plot the ROHs of a window of fully sequenced trios, one trio per row.

    A trio is an entry of the notebook-level `colonies` table whose "Id",
    "Sire" and "Dam" all occur in the "sample" column of `df`.  Trios are
    ordered by "Id" and the half-open window [trio_start, trio_stop) of that
    ordering is plotted.

    Parameters
    ----------
    df : pl.DataFrame
        ROH records containing at least the columns "sample", "chrom",
        "start", "stop", "length" and "Colony".
    trio_start : int, default 50
        Index of the first trio to plot.
    trio_stop : int, default 75
        Index one past the last trio to plot.  A window reaching beyond the
        number of available trios is truncated instead of failing.

    Returns
    -------
    alt.VConcatChart
        The per-trio charts from `plot_trio_roh`, stacked vertically.

    Raises
    ------
    ValueError
        If the requested window contains no trio.
    """
    # Find sequenced trios
    sequenced_animals = df["sample"].unique().to_list()
    trios = colonies.filter(
        pl.col("Id").is_in(sequenced_animals)
        & pl.col("Sire").is_in(sequenced_animals)
        & pl.col("Dam").is_in(sequenced_animals)
    ).select("Id", "Sire", "Dam").sort("Id")

    window = trios.slice(trio_start, max(trio_stop - trio_start, 0))
    if window.height == 0:
        raise ValueError(
            f"No sequenced trios in range [{trio_start}, {trio_stop}); "
            f"{trios.height} trios are available"
        )

    # Plot each trio from the frame that was passed in, not from a global
    rows = []
    for trio in window.iter_rows():
        trio_df = df.filter(
            pl.col("sample").is_in(list(trio))
        ).select("sample", "chrom", "start", "stop", "length", "Colony")

        rows.append(plot_trio_roh(trio_df))
    return alt.vconcat(*rows)

In [None]:
# Create dataframe for subsequent plots

genome_length = sum(chromosome_lengths)
chrom_df = pl.DataFrame({"chrom": chromosomes, "chrom_len": chromosome_lengths})

total_len_roh = (
    roh_df
    # Total ROH length per sample / colony / interval combination
    .group_by("sample", "Colony", "Interval")
    .agg(pl.sum("length"))
    .sort("sample")
    # froh = summed ROH length divided by the summed chromosome lengths
    .with_columns(froh=pl.col("length") / genome_length)
    # Collapse to one row per sample, taking the last entry of each column
    .group_by("sample")
    .agg(pl.last("froh"), pl.last("Colony"), pl.last("Interval"))
    # Cohort label of the form "<Colony>_<Interval>"
    .with_columns(
        cohort=pl.concat_str([pl.col("Colony"), pl.col("Interval")], separator="_")
    )
    # Drop samples without an interval
    .filter(pl.col("Interval").is_not_null())
)

In [None]:
# Boxplot of fROH over intervals

# Display order of the cohorts on the x-axis
# TODO: Generalize this
boxplot_interval_order = [
    'Founders', 'pre-2003', '2003-2005', '2006-2008', '2009-2011',
    '2012-2014', '2015-2017', '2018-2020', 'Founders2', 'Merged',
]

boxplot = (
    alt.Chart(total_len_roh)
    .mark_boxplot()
    .encode(
        alt.X("Interval:N").title("Cohort").sort(boxplot_interval_order),
        alt.Y("froh:Q").title("fROH"),
        alt.Color("Colony:N"),
    )
    .properties(title="fROH Across Cohorts")
)

boxplot

In [None]:
# Violin plot (without error bars or means)

# Display order of the cohort columns
# TODO: Generalize this
violin_interval_order = [
    'Founders', 'pre-2003', '2003-2005', '2006-2008', '2009-2011',
    '2012-2014', '2015-2017', '2018-2020', 'Founders2', 'Merged',
]

# Density of froh, estimated separately per interval and colony over [0, 0.05]
froh_density = alt.Chart(total_len_roh).transform_density(
    'froh',
    as_=['froh', 'density'],
    extent=[0, 0.05],
    groupby=['Interval', 'Colony'],
)

violin = froh_density.mark_area(orient='horizontal').encode(
    # Centre-stacked density forms the symmetric violin outline
    alt.X(
        "density:Q",
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
    alt.Y("froh:Q").title("fROH"),
    alt.Column("Interval:N")
        .title("Cohort")
        .sort(violin_interval_order)
        .spacing(0)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    alt.Color("Colony:N"),
).properties(
    width=60,
    title="fROH Across Cohorts",
)

violin

In [None]:
# Violin plot but with error bars and means
# Has to be more complicated in order to construct a layered chart that also is faceted
# Variations on the graph can be made by adjusting the scale on the alt.Y of the violin plot

# Display order of the cohort facets; kept identical to the order used by the
# boxplot and the plain violin plot (including 'pre-2003')
# TODO: Generalize this
facet_interval_order = [
    'Founders', 'pre-2003', '2003-2005', '2006-2008', '2009-2011',
    '2012-2014', '2015-2017', '2018-2020', 'Founders2', 'Merged',
]

# The layers carry no data of their own; it is supplied once to alt.layer below
violin = alt.Chart().transform_density(
    'froh',
    as_=['froh', 'density'],
    groupby=['Interval', 'Colony']
).mark_area(orient='horizontal').encode(
    alt.X("density:Q").stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True)
        .scale(nice=False, zero=False),
    alt.Y("froh:Q", title='fROH'),
    color=alt.Color("Colony:N"),
).properties(
    width=60
)

# Error bar over froh within each facet (extent='ci')
error = alt.Chart().mark_errorbar(extent='ci').encode(
    alt.Y('froh:Q', title='fROH')
)

# Mean froh within each facet, drawn as a black dot
mean = alt.Chart().mark_circle(color='black').encode(
    alt.Y('mean(froh):Q', title='fROH')
)

alt.layer(violin, error, mean, data=total_len_roh).facet(
    alt.Column('Interval:N',
        sort=facet_interval_order,
        header=alt.Header(labelOrient='bottom', titleOrient='bottom', labelPadding=0, title='Cohort')
    )
).resolve_scale(
    # Each facet gets its own density scale
    x=alt.ResolveMode("independent")
).configure_facet(
    spacing=0,
).configure_title(anchor='middle').properties(title="fROH Across Cohorts")

In [8]:
# froh series for the t-tests

# Every sample whose colony is 'Merged'
merged = total_len_roh.filter(pl.col('Colony') == 'Merged').get_column('froh')

# Colony '1', restricted to the '2018-2020' interval
colony1 = total_len_roh.filter(
    pl.col('Colony') == '1',
    pl.col('Interval') == '2018-2020',
).get_column('froh')

# Colony '2', restricted to the 'Founders2' interval
colony2 = total_len_roh.filter(
    pl.col('Colony') == '2',
    pl.col('Interval') == 'Founders2',
).get_column('froh')

In [None]:
# T-test to compare groups
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on every SciPy release.
import scipy.stats

# Compare Colony1 to Merged (independent two-sample t-test with SciPy's defaults)
scipy.stats.ttest_ind(colony1, merged)


In [None]:
# Compare Colony2 to Merged
scipy.stats.ttest_ind(a=colony2, b=merged)