In [2]:
import altair as alt
import polars as pl
import yaml

# Load the workflow configuration (YAML)
configfile = "/master/abagwell/workspace/github_project/variant-analysis/config/rhesus_old.yaml"
with open(configfile, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

# Tab-separated color table; later cells read its "Cohort" and "Color" columns
colors = pl.read_csv(config["colors"], separator="\t")

# Autosome names, one per line of the list file
autosomes_list = config["resources"] + "ref_fna/autosomes.list"
with open(autosomes_list) as autosomes_handle:
    chromosomes = autosomes_handle.read().splitlines()

In [5]:
# Colony metadata and reference chromosome sizes used by the ROH cells below

# Read colony info (earlier input file kept for reference)
#colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/MML_groups_from_Martha.fixed7.tsv"

# Force the animal identifier columns to be read as text
id_columns_as_text = {
    "Dam": pl.String,
    "Sire": pl.String,
    "Id": pl.String,
}
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
    schema_overrides=id_columns_as_text,
)

# TODO: Generalize this (currently values for rhesus)
# Lengths in bp for Mmul_10; entry i belongs to chromosome i + 1
chromosome_lengths = [
    223_616_942,
    196_197_964,
    185_288_947,
    169_963_040,
    187_317_192,
    179_085_566,
    169_868_564,
    145_679_320,
    134_124_166,
    99_517_758,
    133_066_086,
    130_043_856,
    108_737_130,
    128_056_306,
    113_283_604,
    79_627_064,
    95_433_459,
    74_474_043,
    58_315_233,
    77_137_495,
]
genome_length = sum(chromosome_lengths)

In [6]:
## For when using `bcftools roh`
# Changes column names to match scikit-allel
# Reads one ROH file per chromosome, keeps only long ROHs, attaches the colony
# metadata of each sample, and stacks everything into a single `roh_df`.

# Variables to adjust
min_ROH_length = 1_000_000
#min_ROH_length = 0

# Column names and dtypes applied, in order, to the header-less ROH files
roh_columns = ["RG", "sample", "chrom", "start", "stop", "length", "Number of markers", "Quality (average fwd-bwd phred score)"]
roh_dtypes = [pl.String, pl.String, pl.String, pl.Int32, pl.Int32, pl.Int64, pl.Int32, pl.Float32]

# Looked up outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
results_dir = config["results"]

roh_dfs = []
for chrom in chromosomes:
    roh_file = f"{results_dir}roh/bcftools/U42_WES.common_between_founding_cohorts2.chr{chrom}.RG.roh"
    roh_dfs.append(
        pl.read_csv(
            roh_file,
            comment_prefix="#",
            has_header=False,
            separator="\t",
            schema_overrides=roh_dtypes,
            new_columns=roh_columns,
        ).filter(
            # Keep only large ROHs (strictly longer than the threshold)
            pl.col("length") > min_ROH_length
        ).join(
            # Left join keeps ROHs of samples that are missing from the colony table
            colonies, how="left", left_on="sample", right_on="Id"
        )
    )

roh_df = pl.concat(roh_dfs)


In [7]:
# Per-animal ROH burden (fROH and number of ROHs) for the three source cohorts

source_cohorts = ["Conventional source", "Brooks source", "NEPRC source"]

per_indiv_df = (
    roh_df
    .select("sample", "chrom", "start", "stop", "length", "Interval")
    .filter(pl.col("Interval").is_in(source_cohorts))
    .group_by("sample", "Interval")
    .agg(
        froh = pl.sum("length").truediv(genome_length),
        number_of_ROHs = pl.count("length"),
    )
    # Enum cast so the cohorts follow the order of the color table
    .with_columns(pl.col("Interval").cast(pl.Enum(colors["Cohort"])))
)

alt.Chart(per_indiv_df).mark_circle(opacity=0.9).encode(
    x=alt.X("number_of_ROHs", title="Number of ROHs"),
    y=alt.Y("froh", title="fROH"),
    color=alt.Color("Interval", title="Cohorts").scale(
        domain=list(colors["Cohort"]),
        range=list(colors["Color"]),
    ),
).properties(title="Individuals by ROH")#.save("/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.by_indiv.min1Mb.html")

In [8]:

# Give every cohort ("Interval") a `color_idx`: all year-range cohorts (names
# containing "-") share one index, every other cohort gets its own. Indices
# follow the cohort order of the color table.
roh_df = (
    roh_df
    # Discard an index left over from a previous run so the cell can be re-run
    .select(pl.exclude("color_idx"))
    .drop_nulls("Interval")
    # Collapse to one row per cohort; all other columns become lists
    .group_by("Interval").agg("*")
    # Enum cast so that sorting follows the order of the color table
    .with_columns(pl.col("Interval").cast(pl.Enum(colors["Cohort"])))
    .with_row_index("pop_idx", offset=1)
    .with_columns(
        # 1 for cohorts that are NOT year ranges, 0 for year ranges
        not_year_range = pl.col("Interval").cast(pl.String).str.contains("-").not_().cast(pl.Int8)
    )
    .with_columns(
        # Set the index of year ranges to 0. The flag is referenced with pl.col
        # because a bare string passed to an arithmetic method is a string
        # literal, not a column name.
        pl.col("pop_idx").mul(pl.col("not_year_range"))
    )
    .drop("not_year_range")
    .sort("Interval")
    # Create color index: year ranges (pop_idx 0) end up in a single group
    .group_by("pop_idx", maintain_order=True).agg("*")
    .with_row_index("color_idx")
    .drop("pop_idx")
    # Undo both levels of list nesting: back to one row per ROH
    .explode(pl.exclude("color_idx"))
    .explode(pl.exclude("Interval", "color_idx"))
)

In [9]:
# Mean ROH length per cohort, drawn as a bar chart

mean_roh_df = roh_df.group_by("Interval", "color_idx").agg(pl.mean("length"))

alt.Chart(mean_roh_df).mark_bar().encode(
    x=alt.X("Interval", title="Cohort"),
    y=alt.Y("length", title="Mean Length"),
    color=alt.Color("Interval:N", title=None, legend=None).scale(
        domain=list(colors["Cohort"]),
        range=list(colors["Color"]),
    ),
).properties(title="ROH Across Cohorts")#.save("/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.mean_roh.min1Mb.html")

In [10]:
# Inspect the mean ROH length per cohort
mean_roh_df

Interval,color_idx,length
enum,u32,f64
"""2012-2014""",2,2460500.0
"""NEPRC source""",3,1963800.0
"""Brooks source""",1,2129800.0
"""Conventional source""",0,6231200.0
"""2015-2017""",2,3107000.0
"""2003-2005""",2,2399300.0
"""Offspring of merger""",4,1817600.0
"""2006-2008""",2,2278800.0
"""2018-2020""",2,2523600.0
"""2009-2011""",2,2295500.0


In [11]:
# Inspect the mean ROH length per cohort (same table as the previous cell)
mean_roh_df

Interval,color_idx,length
enum,u32,f64
"""2012-2014""",2,2460500.0
"""NEPRC source""",3,1963800.0
"""Brooks source""",1,2129800.0
"""Conventional source""",0,6231200.0
"""2015-2017""",2,3107000.0
"""2003-2005""",2,2399300.0
"""Offspring of merger""",4,1817600.0
"""2006-2008""",2,2278800.0
"""2018-2020""",2,2523600.0
"""2009-2011""",2,2295500.0


In [12]:
# Inspect the ROH table, one row per ROH, with its color_idx and colony metadata
roh_df

color_idx,Interval,RG,sample,chrom,start,stop,length,Number of markers,Quality (average fwd-bwd phred score),Species code (3 char),WES,WGS,Colony,Gender,Status,Dam,Sire,Date of Birth,Date of Death
u32,enum,str,str,str,i32,i32,i64,i32,f32,str,str,str,str,str,str,str,str,str,str
0,"""Conventional source""","""RG""","""10235""","""1""",65574431,66923894,1349464,32,2.5,"""MML""","""Y""",,"""1""","""Female""","""Dead""","""6228""","""8X0036""","""03/22/1991""","""04/22/2002"""
0,"""Conventional source""","""RG""","""10235""","""1""",142431093,145655479,3224387,52,39.700001,"""MML""","""Y""",,"""1""","""Female""","""Dead""","""6228""","""8X0036""","""03/22/1991""","""04/22/2002"""
0,"""Conventional source""","""RG""","""10244""","""1""",169810317,201692994,31882678,5679,97.099998,"""MML""","""Y""",,"""1""","""Female""","""Dead""","""7099""","""8X0036""","""03/29/1991""","""01/26/2007"""
0,"""Conventional source""","""RG""","""10250""","""1""",1146146,16497705,15351560,1663,94.099998,"""MML""","""Y""",,"""1""","""Male""","""Dead""","""7069""","""8X0036""","""04/08/1991""","""06/24/2003"""
0,"""Conventional source""","""RG""","""10250""","""1""",126215202,141616034,15400833,876,91.5,"""MML""","""Y""",,"""1""","""Male""","""Dead""","""7069""","""8X0036""","""04/08/1991""","""06/24/2003"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
4,"""Offspring of merger""","""RG""","""38610""","""20""",29826915,32733910,2906996,89,11.7,"""MML""","""Y""",,"""Merged""","""Male""","""Alive""","""34275""","""28310""","""09/16/2019""",
4,"""Offspring of merger""","""RG""","""38995""","""20""",15757560,16882661,1125102,105,53.400002,"""MML""","""Y""",,"""Merged""","""Female""","""Alive""","""33987""","""31143""","""04/11/2020""",
4,"""Offspring of merger""","""RG""","""39290""","""20""",29821291,32733910,2912620,109,80.0,"""MML""","""Y""",,"""Merged""","""Female""","""Dead""","""34321""","""31204""","""06/11/2020""","""08/11/2023"""
4,"""Offspring of merger""","""RG""","""39442""","""20""",24474692,25633428,1158737,65,42.099998,"""MML""","""Y""",,"""Merged""","""Female""","""Alive""","""34312""","""31204""","""07/28/2020""",


In [13]:
def plot_trio_roh(df):
    """Plot the ROHs across all chromosomes of one trio.

    Parameters
    ----------
    df : pl.DataFrame
        ROH records with the columns "sample", "chrom", "start", "stop",
        "length" and "Colony", in that order (placeholder rows with the same
        layout are concatenated to it).

    Returns
    -------
    A horizontally concatenated Altair chart with one panel per chromosome
    found in the global `roh_df`; each sample of `df` is one row of bars.

    Notes
    -----
    Chromosome names must be integer-like strings because they index the
    global `chromosome_lengths` list (chromosome 1 is entry 0).
    """

    # Create dummy records. That is, an ROH of length 0 for every combination of sample and chromosome so that they all appear on the plot
    chromosomes = roh_df.select("chrom").unique()
    sequenced_animals = df.select("sample", "Colony").unique()
    dummy_records = sequenced_animals.join(chromosomes, how="cross").with_columns(
        start = pl.lit(0).cast(pl.Int32),
        stop = pl.lit(0).cast(pl.Int32),
        length = pl.lit(0).cast(pl.Int64),
    ).select(  # Reorder
        "sample", "chrom", "start", "stop", "length", "Colony"
    )

    # Concatenate ROHs with dummy records
    df = pl.concat([df, dummy_records]).sort("sample")  # TODO: Improve sorting for when needs to be numeric or animal ID has "X" in it

    # Iterating a polars DataFrame yields its columns (Series), not its values,
    # so pull the names out as plain strings. `unique()` gives no ordering
    # guarantee, hence the explicit numeric sort.
    chrom_names = sorted(chromosomes["chrom"].to_list(), key=int)

    columns = []
    for idx, chrom in enumerate(chrom_names):
        chrom_length = chromosome_lengths[int(chrom) - 1]

        # Only keep y-axis labels on leftmost plot
        axis = alt.Axis() if idx == 0 else None

        # Info from `bcftools roh`, restricted to this chromosome
        chrom_df = df.filter(pl.col("chrom") == chrom)

        roh_plot = alt.Chart(chrom_df).mark_bar().encode(
            alt.X("mb_start_pos:Q", title=["Position", "(Mb)"]).scale(
                domainMin=0, domainMax=chrom_length / 1_000_000, clamp=True
            ),
            alt.X2("mb_stop_pos:Q"),
            # Sort on the sample id itself; the previous sort field "animal"
            # is not a column of the plotted data.
            alt.Y("sample:O", axis=axis, title="Sample", sort="ascending"),
            color = alt.Color("Colony:N").scale(
                domain = list(colors["Cohort"]),
                range = list(colors["Color"])
            ),
            tooltip=[
                alt.Tooltip("chrom", title="Chr"),
                alt.Tooltip("start", title="Start (bp)"),
                alt.Tooltip("stop", title="End (bp)"),
                alt.Tooltip("length", title="Length"),
            ]
        ).properties(
            # Panel width proportional to chromosome length
            width=chrom_length / 1_500_000,
            height=alt.Step(10),
        ).transform_calculate(
            # Positions converted from bp to Mb for the x axis
            mb_start_pos = 'datum.start / 1000000',
            mb_stop_pos = 'datum.stop / 1000000',
        )

        columns.append(roh_plot)

    return alt.hconcat(*columns)

In [14]:
def plot_all_trio_roh(df, first_trio=50, last_trio=75):
    """Plot ROHs for a window of fully sequenced trios, one trio per row.

    A trio is an animal from the global `colonies` table whose own id, sire
    and dam all occur in `df["sample"]`. Trios are ordered by offspring id.

    Parameters
    ----------
    df : pl.DataFrame
        ROH table with at least the columns "sample", "chrom", "start",
        "stop", "length" and "Colony".
    first_trio, last_trio : int
        Half-open window [first_trio, last_trio) into the ordered trio list;
        both are expected to be non-negative. The defaults reproduce the
        previously hard-coded trios 50-74. The window is clamped to the
        number of trios that actually exist.

    Returns
    -------
    A vertically concatenated Altair chart of `plot_trio_roh` outputs.

    Raises
    ------
    ValueError
        If the window contains no trio.
    """
    # Find sequenced trios
    sequenced_animals = df["sample"].unique()
    trios = colonies.filter(
        pl.col("Id").is_in(sequenced_animals) & pl.col("Sire").is_in(sequenced_animals) & pl.col("Dam").is_in(sequenced_animals)
    ).select("Id", "Sire", "Dam").sort("Id")

    # `slice` stops at the end of the frame, so a short trio list is not an error
    window = trios.slice(first_trio, max(last_trio - first_trio, 0))
    if window.is_empty():
        raise ValueError(
            f"No sequenced trios in window [{first_trio}, {last_trio}); {trios.height} trios available"
        )

    rows = []
    for trio in window.iter_rows():
        # Use the ROHs that were passed in rather than the global table
        trio_df = df.filter(
            pl.col("sample").is_in(list(trio))
        ).select("sample", "chrom", "start", "stop", "length", "Colony")

        rows.append(plot_trio_roh(trio_df))
    return alt.vconcat(*rows)

In [15]:
# Create dataframe for subsequent plots

chrom_df = pl.DataFrame({"chrom": chromosomes, "chrom_len": chromosome_lengths})

# Summed ROH length per animal and cohort, also expressed as a fraction of the genome length
roh_length_per_cohort = (
    roh_df
    .group_by("sample", "color_idx", "Interval")
    .agg(pl.sum("length"))
    .sort("sample")
    .with_columns(pl.col("length").truediv(genome_length).alias("froh"))
)

# One row per animal: the last fROH / color_idx / Interval seen for that sample
total_len_roh = roh_length_per_cohort.group_by("sample").agg(
    pl.last("froh"),
    pl.last("color_idx"),
    pl.last("Interval"),
)

In [16]:
# Inspect per-animal fROH with its cohort and color index
total_len_roh

sample,froh,color_idx,Interval
str,f64,u32,enum
"""34265""",0.013964,3,"""NEPRC source"""
"""27441""",0.008664,2,"""2003-2005"""
"""34717""",0.015907,2,"""2015-2017"""
"""38254""",0.018261,2,"""2018-2020"""
"""17525""",0.003808,1,"""Brooks source"""
…,…,…,…
"""34235""",0.019812,3,"""NEPRC source"""
"""33937""",0.020773,3,"""NEPRC source"""
"""32604""",0.017489,2,"""2012-2014"""
"""18408""",0.011106,1,"""Brooks source"""


In [48]:
# Boxplot of fROH over intervals

# TODO: Generalize this
cohort_order = [
    'Conventional source',
    'Brooks source',
    'Pre-2003',
    '2003-2005',
    '2006-2008',
    '2009-2011',
    '2012-2014',
    '2015-2017',
    '2018-2020',
    'NEPRC source',
    'Offspring of merger',
]

boxplot = alt.Chart(total_len_roh).mark_boxplot().encode(
    x=alt.X("Interval:N", title="Cohort", sort=cohort_order),
    y=alt.Y("froh:Q", title="fROH"),
    color=alt.Color("Interval:N", legend=None).scale(
        domain=list(colors["Cohort"]),
        range=list(colors["Color"]),
    ),
).properties(title="fROH")

boxplot#.save("/master/abagwell/figures/final_plots/U42_WES.common_between_founding_cohorts2.froh_boxplot.min1Mb.html")

#boxplot.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.boxplot.maf_0.05.gt{min_ROH_length / 1_000_000}Mb.html")

In [18]:
# Violin plot (without error bars or means). See next code block for it with extra details
# One density estimate of fROH per cohort, mirrored around the centre of each
# facet column to give the violin shape.

violin = alt.Chart(total_len_roh).transform_density(
    'froh',
    as_=['froh', 'density'],
    #extent=[0, 0.05],
    #extent=[0, 0.1],
    groupby=['Interval', 'color_idx']
).mark_area(orient='horizontal').encode(
    alt.X("density:Q").stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y("froh:Q", title="fROH"),
    alt.Column("Interval:N", title="Cohort",
        # Column order follows the color table. Passed as a plain list, like
        # the color scale below, instead of a polars Series.
        sort=list(colors["Cohort"]),
    ).spacing(0).header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    color=alt.Color("Interval:N").scale(
        domain = list(colors["Cohort"]),
        range = list(colors["Color"])
    ),
).properties(
    width=65,
    title="fROH Across Cohorts"
)


violin

#violin.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.violinplot.maf_0.05.gt{min_ROH_length / 1_000_000}Mb.full_height.html")
#violin.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.violinplot.maf_0.05.gt{min_ROH_length}Mb.max_0.1.html")

In [19]:
# Inspect per-animal fROH again (input of the layered violin plot below)
total_len_roh

sample,froh,color_idx,Interval
str,f64,u32,enum
"""34265""",0.013964,3,"""NEPRC source"""
"""27441""",0.008664,2,"""2003-2005"""
"""34717""",0.015907,2,"""2015-2017"""
"""38254""",0.018261,2,"""2018-2020"""
"""17525""",0.003808,1,"""Brooks source"""
…,…,…,…
"""34235""",0.019812,3,"""NEPRC source"""
"""33937""",0.020773,3,"""NEPRC source"""
"""32604""",0.017489,2,"""2012-2014"""
"""18408""",0.011106,1,"""Brooks source"""


In [32]:
# Violin plot but with error bars and means
# Has to be more complicated in order to construct a layered chart that also is faceted:
# the three layers are built without data, and the data is attached when they are layered.
# Variations on the graph can be made by adjusting the scale on the alt.Y of the violin plot

# Variables to adjust
error_unit = 'stderr' # Can switch extent to `stdev`, `stderr`, or `ci`
# Upper limits used for other variants: 0.055, 0.08 (with min_y = -0.02), 0.09, 0.14 (zoomed on the merger)
max_y = 0.45
min_y = 0.0

# Cohorts shown in this figure; for the founding populations use
# ["Conventional source", "Brooks source", "NEPRC source"] instead
plotted_cohorts = ["2018-2020", "Offspring of merger", "NEPRC source"]

violin = (
    alt.Chart()
    .transform_density(
        'froh',
        as_=['froh', 'density'],
        extent=[min_y, max_y],
        groupby=['Interval', 'color_idx'],
    )
    .mark_area(orient='horizontal')
    .encode(
        alt.X("density:Q").stack('center')
            .impute(None)
            .title(None)
            .axis(labels=False, values=[0], grid=False, ticks=True)
            .scale(nice=False,zero=False),
        alt.Y("froh:Q", title='fROH').scale(domain=[min_y, max_y]),
        color=alt.Color("Interval:N", legend=None).scale(
            domain=list(colors["Cohort"]),
            range=list(colors["Color"]),
        ),
    )
    .properties(width=92)
)

# Error bar around the mean fROH of each facet
error = alt.Chart().mark_errorbar(extent=error_unit).encode(
    alt.Y('froh:Q', title='fROH')
)

# Black dot at the mean fROH of each facet
mean = alt.Chart().mark_circle(color='black').encode(
    alt.Y('mean(froh):Q', title='fROH')
)

layered = (
    alt.layer(
        violin, error, mean,
        data=total_len_roh.filter(pl.col("Interval").is_in(plotted_cohorts)),
    )
    .facet(
        alt.Column(
            'Interval',
            header=alt.Header(labelOrient='bottom', titleOrient='bottom', labelPadding=0, title='Cohort'),
        )
    )
    .resolve_scale(x=alt.ResolveMode("independent"))
    .configure_facet(spacing=0)
    .configure_title(anchor='middle')
    # Other titles used: "fROH in Cohorts", ["fROH in", "Founding Populations"]
    .properties(title="fROH of Merger")
)

#layered.save("/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.violinplot_froh.min1Mb.merger_zoomed.html")
layered#.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.froh_violinplot.gt{min_ROH_length / 1_000_000}Mb.{error_unit}.full_height.html")
#layered.save(f"/master/abagwell/figures/roh/U42_WES.common_between_founding_cohorts2.froh_violinplot.{error_unit}.html")

In [101]:
# Inspect per-animal fROH once more.
# NOTE(review): the row order here differs from the earlier displays of this
# table, presumably because the group_by that builds it does not maintain
# order — confirm before relying on row positions.
total_len_roh

sample,froh,color_idx,Interval
str,f64,u32,enum
"""17507""",0.023796,1,"""Brooks source"""
"""17660""",0.017183,1,"""Brooks source"""
"""30011""",0.012335,2,"""2009-2011"""
"""38945""",0.021436,2,"""2018-2020"""
"""33920""",0.019759,3,"""NEPRC source"""
…,…,…,…
"""34253""",0.019894,3,"""NEPRC source"""
"""31982""",0.029756,2,"""2012-2014"""
"""40201""",0.018264,4,"""Offspring of merger"""
"""31127""",0.009487,2,"""2009-2011"""


In [37]:
# Per-animal fROH values of the three source cohorts, used by the t-tests below
Conventional_source, Brooks_source, NEPRC_source = (
    total_len_roh.filter(pl.col('Interval') == cohort)['froh']
    for cohort in ('Conventional source', 'Brooks source', 'NEPRC source')
)

In [38]:
# T-test to compare groups
# Import the submodule explicitly so `scipy.stats` is guaranteed to be loaded
import scipy.stats

# Compare per-animal fROH of Conventional source to Brooks source
scipy.stats.ttest_ind(Conventional_source, Brooks_source)


TtestResult(statistic=np.float64(5.059169848151928), pvalue=np.float64(1.5333542772508571e-06), df=np.float64(120.0))

In [39]:
# Compare per-animal fROH of Conventional source to NEPRC source
scipy.stats.ttest_ind(Conventional_source, NEPRC_source)

TtestResult(statistic=np.float64(5.023968441163438), pvalue=np.float64(1.9940327398462695e-06), df=np.float64(109.0))

In [40]:
# Compare per-animal fROH of Brooks source to NEPRC source
scipy.stats.ttest_ind(Brooks_source, NEPRC_source)

TtestResult(statistic=np.float64(-0.6795470068454496), pvalue=np.float64(0.4976499814522465), df=np.float64(183.0))

In [41]:
# Per-animal fROH values of the cohorts involved in the merger comparison
y2018_2020, offspring_of_merger, NEPRC_source = (
    total_len_roh.filter(pl.col('Interval') == cohort)['froh']
    for cohort in ('2018-2020', 'Offspring of merger', 'NEPRC source')
)

In [42]:
# Compare per-animal fROH of the 2018-2020 cohort to Offspring of merger
scipy.stats.ttest_ind(y2018_2020, offspring_of_merger)

TtestResult(statistic=np.float64(2.076622817411684), pvalue=np.float64(0.04100652825539894), df=np.float64(81.0))

In [43]:
# Compare per-animal fROH of the 2018-2020 cohort to NEPRC source
scipy.stats.ttest_ind(y2018_2020, NEPRC_source)

TtestResult(statistic=np.float64(0.8416986633995069), pvalue=np.float64(0.40153878801706944), df=np.float64(127.0))

In [44]:
# Compare per-animal fROH of Offspring of merger to NEPRC source
scipy.stats.ttest_ind(offspring_of_merger, NEPRC_source)

TtestResult(statistic=np.float64(-2.1843471326414248), pvalue=np.float64(0.03078773987146404), df=np.float64(126.0))