In [None]:
# This file can be run manually, but errors without explanation when run through Snakemake.

import pickle as pk

import altair as alt
import pandas as pd
import polars as pl

# Input/output paths.
# When this notebook is executed by Snakemake, a `snakemake` object is injected
# into the namespace and the paths declared by the rule are used. When it is
# run manually no such object exists, so the fixed paths below are used.
if "snakemake" in globals():
    html = snakemake.output.html
    pickle = snakemake.input.pickle
    # window_size = snakemake.wildcards.window_size  # previously read here; not used in the cells shown
else:
    html = "/master/abagwell/variant-analysis/results/rhesus/scikit-allel/expected_heterozygosity/WGS/SNPRC_WGS_WES.SNP.merged.html"
    pickle = "/master/abagwell/variant-analysis/results/rhesus/scikit-allel/expected_heterozygosity/WGS/SNPRC_WGS_WES.SNP.merged.pickle"

# Load data
# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# this workflow, never files from an untrusted source.
with open(pickle, "rb") as f:
    df = pk.load(f)

# Fail here with a clear message instead of a less obvious error later, when
# the object is handed to `pl.from_pandas(...).with_columns(...)`.
if not isinstance(df, pd.DataFrame):
    raise TypeError(
        f"Expected a pandas DataFrame in {pickle!r}, got {type(df).__name__}"
    )

In [None]:
# Build a "chr<chrom>:<position>" label column so each row carries a single
# locus identifier.
chr_pos_expr = pl.concat_str(
    [pl.lit("chr"), pl.col("chrom"), pl.lit(":"), pl.col("position")]
).alias("chr_pos")

# Convert the pandas frame to polars and append the label column.
pl_df = pl.from_pandas(df).with_columns(chr_pos_expr)
# Possible extra column, kept for reference:
#   (pl.col("dxy") / pl.col("counts")).alias("avg_dxy")
# In case the average is more important since dxy is correlated with number of variants

In [None]:
# Display the polars frame (including the added "chr_pos" column) for a visual check.
pl_df

In [None]:
# Mean expected heterozygosity for every (population, chromosome) pair.
# `group_by` replaces the `groupby` spelling, which was deprecated in polars
# 0.19 and removed in 1.0. Group-by output order is not guaranteed, so the
# result is sorted to make the displayed table reproducible across runs.
grouped = (
    pl_df
    .group_by("pop", "chrom")
    .agg(pl.mean("exp_heterozygosity"))
    .sort(["pop", "chrom"])
)
grouped

In [None]:
# Altair plot: one circle per (pop, chrom) mean, saved to the `html` path.
alt.data_transformers.disable_max_rows()  # lift Altair's default row limit

# Altair is given a pandas frame, so convert the polars result via Arrow.
plot_data = grouped.to_arrow().to_pandas()

# Explicit ordering applied to both the x axis and the colour legend.
sort_order = ['1', '2', '3', '4', '5']

x_axis = alt.X(
    "pop",
    title="Population",
    axis=alt.Axis(labels=False, tickSize=0),  # hide tick labels and tick marks
    sort=sort_order,
)
y_axis = alt.Y("exp_heterozygosity", title="Exp. Heterozygosity")
chrom_color = alt.Color("chrom", title="Chrom", sort=sort_order)
hover = [
    alt.Tooltip("chrom", title="Chrom"),
    alt.Tooltip("exp_heterozygosity", title="Exp. Heterozygosity"),
]

chart = (
    alt.Chart(plot_data)
    .mark_circle()
    .encode(x_axis, y_axis, color=chrom_color, tooltip=hover)
    .properties(
        title="Expected Heterozygosity between RPL and non-RPL Rhesus Macaques",
        width=300,
    )
)
chart.save(html)