In [None]:
import polars as pl
import yaml

# Parse the project's YAML configuration; the input/output locations used below are read from it
configfile = "/master/abagwell/workspace/github_project/variant-analysis/config/rhesus.yaml"
with open(configfile, mode="r") as config_stream:
    config = yaml.safe_load(config_stream)

# Palette table (tab-separated); later cells read its "Cohort" and "Color" columns
colors = pl.read_csv(config["colors"], separator="\t")

In [None]:
import altair as alt
import polars as pl


# Cohort membership table: keeps one row per animal ("Id") with its cohort label ("Interval").
# Earlier input, kept for reference: config["resources"] + "pop/MML_groups_from_Martha.fixed7.tsv"
colonies_file = config["cohorts"]
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
    # Pin the join key to String. The sample IDs it is joined against are read as String,
    # so leaving "Id" to type inference would break the join whenever every Id looks numeric.
    schema_overrides={"Id": pl.String},
).select("Id", "Interval")

# Inbreeding table (".ibc" file under inbreeding/GCTA), keyed by "IID".
# Earlier inputs, kept for reference:
#   config["results"] + "inbreeding/GCTA/pass/U42_WES.common_between_founding_cohorts2.ibc"
#   /master/abagwell/variant-analysis/results/rhesus/kinship/het/U42_WES.common_between_founding_cohorts.het
# NOTE: config["results"] is concatenated directly, so it is expected to end with a path separator.
ibc_file = config["results"] + f"inbreeding/GCTA/pass/{config['dataset']}.all.ibc"

# Left join: animals without a cohort entry are kept here, with a null "Interval"
df = pl.read_csv(
    ibc_file,
    separator="\t",
    schema_overrides={"IID": pl.String},
).join(colonies, left_on="IID", right_on="Id", how="left")

In [None]:
# Inspect the inbreeding table after the left join with the cohort labels
df

In [None]:
# Attach a "color_idx" to every row: each named cohort gets its own index, while all
# year-range cohorts (labels containing "-") share a single one.
df = (
    df
    # Discard animals that have no cohort label
    .drop_nulls("Interval")
    # Collapse to one row per cohort (every other column becomes a list)
    .group_by("Interval")
    .agg("*")
    # Order cohorts the way the palette table lists them
    .with_columns(pl.col("Interval").cast(pl.Enum(list(colors["Cohort"]))))
    # Provisional per-cohort index, starting at 1
    .with_row_index("pop_idx", offset=1)
    # Flag is 1 for labels WITHOUT "-" (named cohorts) and 0 for year ranges
    .with_columns(
        not_year_range = pl.col("Interval").cast(pl.String).str.contains("-").not_().cast(pl.Int8)
    )
    # Multiplying by the flag sends every year-range cohort to index 0
    .with_columns(pl.col("pop_idx").mul("not_year_range"))
    .drop("not_year_range")
    .sort("Interval")
    # Re-group on the provisional index (order preserved) and number the groups consecutively
    .group_by("pop_idx", maintain_order=True)
    .agg("*")
    .with_row_index("color_idx")
    .drop("pop_idx")
    # Undo both levels of list nesting: first back to one row per cohort, then one row per animal
    .explode(pl.exclude("color_idx"))
    .explode(pl.exclude("Interval", "color_idx"))
)

In [None]:
# Inspect the table after "color_idx" was added and rows without a cohort were dropped
df

In [None]:
# Box plot of one inbreeding estimate per cohort, written to an HTML file.
# The statistic is named once so that the plotted field, the axis title and the output
# file name cannot drift apart: the field used to read "Fhat3" while the title and the
# file name both said "Fhat1". Change this single value to plot another estimate.
boxplot_stat = "Fhat1"

alt.Chart(df).mark_boxplot().encode(
    alt.X("Interval:N", title="Cohort",
        # Plain list, the same form used for the color-scale domains in this notebook
        sort=list(colors["Cohort"])
    ),
    alt.Y(boxplot_stat, title=f"Inbreeding Coefficient, {boxplot_stat}"),
    alt.Color("color_idx:N", legend=None).scale(scheme="category10"),
).properties(
    title=["Inbreeding Coefficient", "Across Cohorts"]
# Inner quotes differ from the outer ones: reusing the enclosing quote inside an
# f-string replacement field is a SyntaxError before Python 3.12.
).save(f"{config['results']}inbreeding/inbreeding.{config['project']}.all.{boxplot_stat}.html")

In [None]:
# List the distinct cohort labels present in the table
df["Interval"].unique()

In [None]:
# As a violin plot
# Violin plot but with error bars and means
# Has to be more complicated in order to construct a layered chart that is also faceted
# Variations on the graph can be made by adjusting the scale on the alt.Y of the violin plot


# # Create theme
# #@alt.theme.register("black_marks", enable=True)
# def black_marks() -> alt.theme.ThemeConfig:
#     # return {
#     #     "config": {
#     #         "view": {"continuousWidth": 300, "continuousHeight": 300},
#     #         "mark": {"color": "black", "fill": "black"},
#     #     }
#     # }

#     # return alt.theme.ThemeConfig(
#     #     config = {
#     #         "bar": {"color": "red"}
#     #     }
#     # )


#     return {'spec': {'layer': [{'mark': {'type': 'area', 'orient': 'horizontal'},
#     'encoding': {'x': {'axis': {'labels': False,
#        'values': [0],
#        'grid': False,
#        'ticks': True},
#       'field': 'density',
#       'impute': None,
#       'scale': {'nice': False, 'zero': False},
#       'stack': 'center',
#       'title': None,
#       'type': 'quantitative'},
#      'y': {'field': 'Fhat2', 'title': 'Fhat2', 'type': 'quantitative'},},
#     'transform': [{'density': 'Fhat2',
#       'extent': [-0.3, 0.45],
#       'groupby': ['Interval', 'color_idx'],
#       'as': ['Fhat2', 'density']}]},
#    {'mark': {'type': 'errorbar', 'extent': 'stderr'},
#     'encoding': {'y': {'field': 'Fhat2',
#       'title': 'Fhat2',
#       'type': 'quantitative'}}},
#    {'mark': {'type': 'circle', 'color': 'black'},
#     'encoding': {'y': {'aggregate': 'mean',
#       'field': 'Fhat2',
#       'title': 'Fhat2',
#       'type': 'quantitative'}}}],
#   'width': 92},
# }


    # return {
    #     "encoding": {
    #         "color": {
    #             "scale": {
    #                 "domain": ["Conventional source"],
    #                 "range": ["#A1C40F"]
    #             }
    #         }
    #     }
    # }

    # return {
    #     "spec": {
    #         "layer": [
    #         {
    #             "encoding": {
    #                 "color": {
    #                     "field": "Interval",
    #                     "scale": {
    #                         "domain": ["Conventional source"],
    #                         "range": ["#A1C40F"]
    #                     },
    #                 }
    #             }
    #         }
    #         ]
    #     },
    # }


# Variables to adjust
error_unit = "stderr"  # error-bar extent; can be `stdev`, `stderr`, or `ci`

# TODO: Generalize min and max y values
# Range over which the density is evaluated
min_y = -0.3
max_y = 0.45

# Layer 1: horizontal, centre-stacked density area (the violin body)
violin = (
    alt.Chart()
    .transform_density(
        "Fhat2",
        as_=["Fhat2", "density"],
        extent=[min_y, max_y],
        groupby=["Interval", "color_idx"],
    )
    .mark_area(orient="horizontal")
    .encode(
        alt.X(
            "density:Q",
            stack="center",
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
            scale=alt.Scale(nice=False, zero=False),
        ),
        alt.Y("Fhat2:Q", title="Fhat2"),
        color=alt.Color(
            "Interval:N",
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
    )
    .properties(width=92)
)

# Layer 2: error bar around the mean
error = alt.Chart().mark_errorbar(extent=error_unit).encode(alt.Y("Fhat2:Q", title="Fhat2"))

# Layer 3: black dot at the mean
mean = alt.Chart().mark_circle(color="black").encode(alt.Y("mean(Fhat2):Q", title="Fhat2"))

# Cohorts shown in this figure
# Alternative subset: ["Conventional source", "Brooks source", "NEPRC source"]
merger_cohorts = ["2018-2020", "Offspring of merger", "NEPRC source"]

# Cohort names are drawn underneath each facet
merger_header = alt.Header(labelOrient="bottom", titleOrient="bottom", labelPadding=0, title="Cohort")

# The three layers share one data source and are then faceted into one panel per cohort
layered = (
    alt.layer(violin, error, mean, data=df.filter(pl.col("Interval").is_in(merger_cohorts)))
    .facet(alt.Column("Interval", header=merger_header))
    .resolve_scale(x=alt.ResolveMode("independent"))
    .configure_facet(spacing=0)
    .configure_title(anchor="middle")
    .properties(title=["Inbreeding of Merger"])
)

layered.save("/master/abagwell/figures/final_plots/U42_WES.common_between_founding_cohorts2.violinplot_Fhat2.merger.html")
#layered.save("/master/abagwell/figures/final_plots/full.html")

In [None]:
# Long format: the three Fhat columns are stacked into a name column ("statistic")
# and a value column ("F"), keeping the color index and cohort label on every row
unpivoted_df = df.unpivot(
    index=["color_idx", "Interval"],
    on=["Fhat1", "Fhat2", "Fhat3"],
    variable_name="statistic",
    value_name="F",
)

In [None]:
# Inspect the long-format table (one row per animal and statistic)
unpivoted_df

In [None]:
# Box plots of F per cohort, one column of panels per inbreeding statistic
(
    alt.Chart(unpivoted_df)
    .mark_boxplot()
    .encode(
        x=alt.X("Interval:N", title=None, sort=colors["Cohort"]),
        y=alt.Y("F:Q", title="Inbreeding Coefficient, F"),
        column=alt.Column("statistic:N", title="Inbreeding Statistic"),
        # Fixed cohort -> color mapping taken from the palette table
        color=alt.Color(
            "Interval:N",
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
    )
    # Optional chart title: ["Inbreeding Coefficient", "Across Cohorts"]
    .configure_title(anchor="middle")
)
#.save('/master/abagwell/figures/final_plots/inbreeding.U42_WES.common_between_founding_cohorts2.all_Fs.all.html')

In [None]:
# Box plot restricted to the Fhat2 rows of the long-format table
fhat2_only = unpivoted_df.filter(pl.col("statistic") == "Fhat2")

(
    alt.Chart(fhat2_only)
    .mark_boxplot()
    .encode(
        x=alt.X("Interval:N", title="Cohort", sort=colors["Cohort"]),
        y=alt.Y("F:Q", title="Inbreeding Coefficient, Fhat2"),
        # Fixed cohort -> color mapping taken from the palette table
        color=alt.Color(
            "Interval:N",
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
    )
    .properties(title=["Inbreeding in Cohorts"])
    .configure_title(anchor="middle")
)
#.save('/master/abagwell/figures/final_plots/inbreeding.U42_WES.common_between_founding_cohorts2.Fhat2.html')

In [None]:
# Fhat2 per cohort as narrow violins (error bar and mean overlaid), with rotated cohort labels

# Variables to adjust
error_unit = 'stderr' # Can switch extent to `stdev`, `stderr`, or `ci`

# TODO: Generalize min and max y values
# Range over which the density is evaluated
max_y = 0.1
min_y = -0.2

# Every layer below reads the long-format value column "F". That column exists only in
# `unpivoted_df`; the wide `df` that was passed as data before carries Fhat1/Fhat2/Fhat3
# and has no "F" field, so the layers had nothing to draw. Use the Fhat2 rows of the long
# table, which is what the axis titles announce.
fhat2_long = unpivoted_df.filter(pl.col("statistic") == "Fhat2")

# Layer 1: horizontal, centre-stacked density area (the violin body)
violin = alt.Chart().transform_density(
    'F',
    as_=['F', 'density'],
    extent=[min_y, max_y],
    groupby=['Interval']
).mark_area(orient='horizontal').encode(
    alt.X("density:Q").stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True)
        .scale(nice=False,zero=False),
    alt.Y("F:Q", title='Inbreeding, Fhat2'),
    # Fixed cohort -> color mapping taken from the palette table
    color=alt.Color("Interval:N", legend=None).scale(
        domain = list(colors["Cohort"]),
        range = list(colors["Color"])
    )
).properties(
    width=25
)

# Layer 2: error bar around the mean
error = alt.Chart().mark_errorbar(extent=error_unit).encode(
    alt.Y('F:Q', title='Inbreeding, Fhat2')
    )

# Layer 3: black dot at the mean
mean = alt.Chart().mark_circle(color='black').encode(
    alt.Y('mean(F):Q', title='Inbreeding, Fhat2')
    )

# Optional subsets (apply with .filter(pl.col("Interval").is_in([...])) on the data):
#   ["2018-2020", "Offspring of merger", "NEPRC source"]
#   ["Conventional source", "Brooks source", "NEPRC source"]
layered = alt.layer(violin, error, mean, data=fhat2_long
).facet(
    alt.Column('Interval',
        # Labels are rotated and drawn underneath so the 25px-wide facets stay readable
        header=alt.Header(
            labelOrient='bottom', labelPadding=0, labelAnchor='middle', labelAngle=-90, labelBaseline="middle", labelAlign="right",
            title='Cohort', titleAlign="center", titleOrient='bottom', ),
        sort=colors["Cohort"]
    )
).resolve_scale(x=alt.ResolveMode("independent")
).configure_facet(
    spacing=0,
).configure_title(anchor='middle').properties(
    title=["Inbreeding in Cohorts"]
)


layered#.save("/master/abagwell/figures/final_plots/narrow_violins/inbreeding.U42_WES.common_between_founding_cohorts2.Fhat2.html")
#layered.save("/master/abagwell/figures/final_plots/full.html")

In [None]:
# Independent T-tests
# Re-display the table that the per-cohort Series for the t-tests are filtered from
df

In [None]:
# Inputs for the t-tests: the Fhat2 column of each source cohort, as a Series
Conventional_source = df.filter(pl.col("Interval") == "Conventional source").get_column("Fhat2")

Brooks_source = df.filter(pl.col("Interval") == "Brooks source").get_column("Fhat2")

NEPRC_source = df.filter(pl.col("Interval") == "NEPRC source").get_column("Fhat2")

In [None]:
# T-test to compare groups
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is available as an attribute on every SciPy version.
import scipy.stats

# Independent two-sample t-test: Conventional source vs. Brooks source
scipy.stats.ttest_ind(Conventional_source, Brooks_source)


In [None]:
# Independent two-sample t-test: Conventional source vs. NEPRC source
scipy.stats.ttest_ind(Conventional_source, NEPRC_source)

In [None]:
# Independent two-sample t-test: Brooks source vs. NEPRC source
scipy.stats.ttest_ind(Brooks_source, NEPRC_source)

In [None]:
# Inputs for the merger t-tests: the Fhat2 column of each cohort, as a Series
y2018_2020 = df.filter(pl.col("Interval") == "2018-2020").get_column("Fhat2")

offspring_of_merger = df.filter(pl.col("Interval") == "Offspring of merger").get_column("Fhat2")

# Derived again here, with the same filter as in the source-cohort cell
NEPRC_source = df.filter(pl.col("Interval") == "NEPRC source").get_column("Fhat2")

In [None]:
# Independent two-sample t-test: 2018-2020 vs. Offspring of merger
scipy.stats.ttest_ind(y2018_2020, offspring_of_merger)

In [None]:
# Independent two-sample t-test: 2018-2020 vs. NEPRC source
scipy.stats.ttest_ind(y2018_2020, NEPRC_source)

In [None]:
# Independent two-sample t-test: Offspring of merger vs. NEPRC source
scipy.stats.ttest_ind(offspring_of_merger, NEPRC_source)

In [None]:
# HET attempt 1

import altair as alt
import polars as pl


# Per-sample heterozygous fraction (HET / TOTAL) joined with cohort labels, then given the
# same color index as the inbreeding table: one index per named cohort, one shared index
# for all year-range cohorts.
df = (
    pl.read_csv(
        config["results"] + 'heterozygosity/gvcf_counts.het',
        separator='\t',
        schema_overrides={'SAMPLE': pl.String},
    )
    .with_columns(
        # Keep the text before the first "_" and drop its first three characters
        pl.col('SAMPLE').str.split('_').list.get(0).str.slice(3),
        FRACTION = pl.col('HET').truediv('TOTAL'),
    )
    # Deduplicate: keep the first row of every sample
    .group_by('SAMPLE')
    .agg(pl.first('*'))
    # Join cohort info (default join type, so samples without a cohort entry are dropped)
    .join(colonies, left_on='SAMPLE', right_on='Id')
    # Order cohorts the way the palette table lists them
    .with_columns(pl.col('Interval').cast(pl.Enum(list(colors["Cohort"]))))
    # Collapse to one row per cohort and give each a provisional index starting at 1
    .group_by("Interval")
    .agg("FRACTION", 'SAMPLE')
    .with_row_index("pop_idx", offset=1)
    # Flag is 1 for labels WITHOUT "-" (named cohorts) and 0 for year ranges
    .with_columns(
        not_year_range = pl.col("Interval").cast(pl.String).str.contains("-").not_().cast(pl.Int8)
    )
    # Multiplying by the flag sends every year-range cohort to index 0
    .with_columns(pl.col("pop_idx").mul("not_year_range"))
    .drop("not_year_range")
    .sort("Interval")
    # Re-group on the provisional index (order preserved) and number the groups consecutively
    .group_by("pop_idx", maintain_order=True)
    .agg('*')
    .with_row_index("color_idx")
    .drop("pop_idx")
    # Undo both levels of list nesting: back to one row per cohort, then one row per sample
    .explode(pl.exclude("color_idx"))
    .explode(pl.exclude("Interval", "color_idx"))
)

In [None]:
# HET attempt 2
import altair as alt
import polars as pl

# Per-individual heterozygosity table; presumably `vcftools --het` output, whose columns
# are INDV, O(HOM), E(HOM), N_SITES and F (confirm against the file header).
file = "/master/abagwell/variant-analysis/results/rhesus_old/kinship/het/U42_WES.common_between_founding_cohorts.het"

df = pl.read_csv(file, separator='\t', schema_overrides={'INDV': pl.String}).with_columns(
    # Observed heterozygosity: sites that are not observed-homozygous, over all sites.
    # This used E(HOM) before, i.e. the homozygosity EXPECTED from allele frequencies,
    # which is not an individual's own heterozygosity and is nearly constant across
    # animals. The hand-tuned min_y/max_y of the violin plots built from this table may
    # need re-tuning for the observed values.
    FRACTION = (pl.col('N_SITES') - pl.col('O(HOM)')) / pl.col('N_SITES'),
    SAMPLE = pl.col("INDV"),
# Default join type, so individuals without a cohort entry are dropped
).join(colonies, left_on='INDV', right_on='Id').drop("INDV")

In [None]:
# Inspect the heterozygosity table (FRACTION and SAMPLE columns plus cohort label)
df

In [None]:
# Per-cohort means, if needed: df.group_by("Interval").agg(pl.mean("FRACTION"))

# Box plot of heterozygosity per cohort; hovering a mark shows the sample
(
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        x=alt.X('Interval', title='Cohort', sort=colors["Cohort"]),
        # The y axis is not forced to include zero.
        # Alternative fixed window: .scale(domainMin=0.3, domainMax=0.34)
        y=alt.Y('FRACTION', title='Heterozygosity', scale=alt.Scale(zero=False)),
        # Fixed cohort -> color mapping taken from the palette table
        color=alt.Color(
            'Interval:N',
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
        tooltip=[alt.Tooltip('SAMPLE')],
    )
    .properties(title="Heterozygosity")
)
#.save('/master/abagwell/figures/final_plots/U42_WES.heterozygosity.barplot.html')

In [None]:
# Heterozygosity per cohort as violins, with an error bar and the mean overlaid.
# The layers are built separately because a chart has to be layered before it is faceted.
# Variations on the graph can be made by adjusting the scale on the alt.Y of the violin layer.

# Variables to adjust
error_unit = "stderr"  # error-bar extent; can be `stdev`, `stderr`, or `ci`

# TODO: Generalize min and max y values
# Range over which the density is evaluated (earlier values: 0.0015 to 0.005)
min_y = 0.305
max_y = 0.34

# Layer 1: horizontal, centre-stacked density area (the violin body)
violin = (
    alt.Chart()
    .transform_density(
        "FRACTION",
        as_=["FRACTION", "density"],
        extent=[min_y, max_y],
        groupby=["Interval"],
    )
    .mark_area(orient="horizontal")
    .encode(
        alt.X(
            "density:Q",
            stack="center",
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
            scale=alt.Scale(nice=False, zero=False),
        ),
        alt.Y("FRACTION:Q", title="Heterozygosity"),
        color=alt.Color(
            "Interval:N",
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
    )
    .properties(width=92)
)

# Layer 2: error bar around the mean
error = alt.Chart().mark_errorbar(extent=error_unit).encode(alt.Y("FRACTION:Q", title="Heterozygosity"))

# Layer 3: black dot at the mean
mean = alt.Chart().mark_circle(color="black").encode(alt.Y("mean(FRACTION):Q", title="Heterozygosity"))

# Cohort names are drawn underneath each facet
wide_header = alt.Header(labelOrient="bottom", titleOrient="bottom", labelPadding=0, title="Cohort")

# Optional subsets (apply with .filter(pl.col("Interval").is_in([...])) on df):
#   ["2018-2020", "Offspring of merger", "NEPRC source"]
#   ["Conventional source", "Brooks source", "NEPRC source"]
layered = (
    alt.layer(violin, error, mean, data=df)
    .facet(alt.Column("Interval", header=wide_header, sort=colors["Cohort"]))
    .resolve_scale(x=alt.ResolveMode("independent"))
    .configure_facet(spacing=0)
    .configure_title(anchor="middle")
    # Alternative titles: ["Heterozygosity in", "Founding Populations"], ["Heterozygosity of Merger"]
    .properties(title=["Heterozygosity in Cohorts"])
)

layered
#.save("/master/abagwell/figures/final_plots/U42_WES.common_between_founding_cohorts2.violinplot_het.all.vcftools.html")
#layered.save("/master/abagwell/figures/final_plots/full.html")

In [None]:
# Heterozygosity per cohort as narrow violins (error bar and mean overlaid), with rotated cohort labels

# Variables to adjust
error_unit = "stderr"  # error-bar extent; can be `stdev`, `stderr`, or `ci`

# TODO: Generalize min and max y values
# Range over which the density is evaluated (earlier values: 0.0015 to 0.005)
min_y = 0.31
max_y = 0.33

# Layer 1: horizontal, centre-stacked density area (the violin body)
violin = (
    alt.Chart()
    .transform_density(
        "FRACTION",
        as_=["FRACTION", "density"],
        extent=[min_y, max_y],
        groupby=["Interval"],
    )
    .mark_area(orient="horizontal")
    .encode(
        alt.X(
            "density:Q",
            stack="center",
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
            scale=alt.Scale(nice=False, zero=False),
        ),
        alt.Y("FRACTION:Q", title="Heterozygosity"),
        color=alt.Color(
            "Interval:N",
            legend=None,
            scale=alt.Scale(domain=list(colors["Cohort"]), range=list(colors["Color"])),
        ),
    )
    .properties(width=25)
)

# Layer 2: error bar around the mean
error = alt.Chart().mark_errorbar(extent=error_unit).encode(alt.Y("FRACTION:Q", title="Heterozygosity"))

# Layer 3: black dot at the mean
mean = alt.Chart().mark_circle(color="black").encode(alt.Y("mean(FRACTION):Q", title="Heterozygosity"))

# Labels are rotated and drawn underneath so the 25px-wide facets stay readable
narrow_header = alt.Header(
    labelOrient="bottom",
    labelPadding=0,
    labelAnchor="middle",
    labelAngle=-90,
    labelBaseline="middle",
    labelAlign="right",
    title="Cohort",
    titleAlign="center",
    titleOrient="bottom",
)

# Optional subsets (apply with .filter(pl.col("Interval").is_in([...])) on df):
#   ["2018-2020", "Offspring of merger", "NEPRC source"]
#   ["Conventional source", "Brooks source", "NEPRC source"]
layered = (
    alt.layer(violin, error, mean, data=df)
    .facet(alt.Column("Interval", header=narrow_header, sort=colors["Cohort"]))
    .resolve_scale(x=alt.ResolveMode("independent"))
    .configure_facet(spacing=0)
    .configure_title(anchor="middle")
    # Alternative titles: ["Heterozygosity in", "Founding Populations"], ["Heterozygosity of Merger"]
    .properties(title=["Heterozygosity in Cohorts"])
)

layered
#.save("/master/abagwell/figures/final_plots/narrow_violins/U42_WES.common_between_founding_cohorts2.violinplot_het.all.vcftools.html")
#layered.save("/master/abagwell/figures/final_plots/full.html")

In [None]:
# Subset to living animals only
#living = pl.read_csv('/master/abagwell/workspace/living_rhesus_with_WES.tsv', separator='\t', schema_overrides={"Id": pl.String})
#df.join(living, how='inner', left_on='SAMPLE', right_on='Id').select(pl.mean('FRACTION'))

In [None]:
# Inputs for the t-tests: the FRACTION column of each source cohort, as a Series
Conventional_source = df.filter(pl.col("Interval") == "Conventional source").get_column("FRACTION")

Brooks_source = df.filter(pl.col("Interval") == "Brooks source").get_column("FRACTION")

NEPRC_source = df.filter(pl.col("Interval") == "NEPRC source").get_column("FRACTION")

In [None]:
# T-test to compare groups
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is available as an attribute on every SciPy version.
import scipy.stats

# Independent two-sample t-test: Conventional source vs. Brooks source
scipy.stats.ttest_ind(Conventional_source, Brooks_source)

In [None]:
# Independent two-sample t-test: Conventional source vs. NEPRC source
scipy.stats.ttest_ind(Conventional_source, NEPRC_source)

In [None]:
# Independent two-sample t-test: Brooks source vs. NEPRC source
scipy.stats.ttest_ind(Brooks_source, NEPRC_source)

In [None]:
# Inputs for the merger t-tests: the FRACTION column of each cohort, as a Series
y2018_2020 = df.filter(pl.col("Interval") == "2018-2020").get_column("FRACTION")

offspring_of_merger = df.filter(pl.col("Interval") == "Offspring of merger").get_column("FRACTION")

# Derived again here, with the same filter as in the source-cohort cell
NEPRC_source = df.filter(pl.col("Interval") == "NEPRC source").get_column("FRACTION")

In [None]:
# Independent two-sample t-test: 2018-2020 vs. Offspring of merger
scipy.stats.ttest_ind(y2018_2020, offspring_of_merger)

In [None]:
# Independent two-sample t-test: 2018-2020 vs. NEPRC source
scipy.stats.ttest_ind(y2018_2020, NEPRC_source)

In [None]:
# Independent two-sample t-test: Offspring of merger vs. NEPRC source
scipy.stats.ttest_ind(offspring_of_merger, NEPRC_source)