In [None]:
# To replace above block

import allel
import altair as alt
from altair import datum
import numpy as np
import pandas as pd
import polars as pl

# Query VCF: site-level fields for every structural-variant record (one row per record).
vcf = "/master/abagwell/variant-analysis/results/rhesus/structural_variants/annotated/U42_WGS_WES.genotyped.pass.split_BNDs.vcf.gz"
variants = pl.from_pandas(
    pd.DataFrame(
        allel.read_vcf(vcf, ['variants/CHROM', 'variants/POS', 'variants/SVTYPE', 'variants/ID', 'variants/CHR2', 'variants/POS2', 'variants/CSQ'])
    )
)

# One row per RPL_status value, carrying the list of sample IDs in that group.
pops = pl.read_csv(
    #"/master/abagwell/variant-analysis/resources/rhesus/samples/pops/RPL.tsv",
    "/master/abagwell/variant-analysis/resources/rhesus/samples/pops/RPL_with_males.tsv",
    separator="\t"
).group_by("RPL_status"
).agg("sample")


# For each group, build a two-column frame named after the group id:
#   fraction<id> = allele-1 count / (allele-0 count + allele-1 count)
#   AN<id>       = allele-0 count + allele-1 count
# Rows stay in VCF record order, so the frames can be concatenated
# horizontally with `variants` afterwards.
groups = []
for pop_id, pop_members in pops.rows():
    callset = allel.read_vcf(vcf, ['calldata/GT'], samples=pop_members)
    genotypes = allel.GenotypeArray(callset['calldata/GT'])

    # Allele counts: columns "0" and "1" hold the per-site counts of allele 0 and allele 1.
    called = pl.col("0").add(pl.col("1"))

    # `select` keeps only the two derived columns. Any additional allele-count
    # column (e.g. "2") is therefore discarded instead of leaking into every
    # group's frame and colliding by name in the horizontal concat.
    ac = pl.from_pandas(pd.DataFrame(genotypes.count_alleles())).select(
        pl.col("1").truediv(called).alias("fraction" + str(pop_id)),
        called.alias("AN" + str(pop_id)),
    )

    groups.append(ac)

In [None]:
# Per-group frequency/AN columns side by side (row order matches the VCF records).
all_acs = pl.concat(groups, how="horizontal")

# The CSQ annotation is pipe-delimited; items 1-4 are extracted as separate columns below.
csq_parts = pl.col("variants/CSQ").str.split("|")

all = (
    pl.concat([variants, all_acs], how="horizontal")
    .with_columns(
        # Frequency in group 0 minus frequency in group 1.
        (pl.col("fraction0") - pl.col("fraction1")).alias("difference"),
        # "<CHROM>:<POS>" key used to de-duplicate and label variants.
        pl.concat_str(
            [pl.col("variants/CHROM"), pl.lit(":"), pl.col("variants/POS")]
        ).alias("locus"),
        csq_parts.list.get(1).alias("consequence"),
        csq_parts.list.get(2).alias("impact"),
        csq_parts.list.get(3).alias("symbol"),
        csq_parts.list.get(4).alias("gene"),
    )
    # Optional filters that can be inserted here:
    # .filter((pl.col("difference") > 0.28) | (pl.col("difference") < -0.28))
    # .filter(pl.col("variants/SVTYPE") == "BND")
    .drop("variants/CSQ")
    # Rows sharing a locus are collapsed, keeping only the first one's info.
    .group_by(pl.col("locus"))
    .first()
    .sort(pl.col("difference"), descending=True)
    # Adds the "row_nr" column: rank of each locus by descending difference.
    .with_row_count()
)

# Alternative (long format, single chromosome):
# all = pl.concat(groups, how="vertical").filter(
#     pl.col("variants/CHROM") == "10",
# )

#all.group_by(pl.col("impact")).count()
all
#all.write_csv("/master/abagwell/figures/SVs_biallelic.split_breakpoints.VEP.tsv", separator="\t")

In [None]:
# Examples of further filtering
# all = all.filter(
#     #pl.col("variants/CHROM") == "15",
#     (pl.col("difference") > 0.2) | (pl.col("difference") < -0.2)
# ).filter(
#     pl.col("variants/SVTYPE") == "BND"
# )

# all = all.filter(
#     pl.col("fraction0") == 0
# )

In [None]:
# Reorder rows by group-1 frequency, highest first. The existing "row_nr"
# column is not recomputed, so it still reflects the earlier ranking by difference.
all = all.sort(by="fraction1", descending=True)

#all.write_csv("/master/abagwell/figures/SVs_including_males_RPL_only.tsv", separator="\t")

In [None]:
# Scatterplot of allele frequencies for RPL and non-RPL


def _frequency_slider(field, param_name):
    """Return (slider, selection) for a 0-1 range slider bound to `field`.

    The selection is a point selection named `param_name`, projected on
    `field`, and initialised to 1.
    """
    slider = alt.binding_range(min=0, max=1, step=0.01, name=field + ' ')
    selection = alt.selection_point(
        name=param_name,
        fields=[field],
        bind=slider,
        value=[{field: 1}],
    )
    return slider, selection


slider_0, selector_0 = _frequency_slider('fraction0', "selector0")
slider_1, selector_1 = _frequency_slider('fraction1', "selector1")

alt.data_transformers.disable_max_rows()

# Colour by SV type with a fixed category order.
sv_type_color = alt.Color(
    "variants/SVTYPE",
    scale=alt.Scale(domain=['BND', 'DUP', 'DEL', 'INV', 'INS']),
).title("SV Type")

scatter_tooltips = [
    alt.Tooltip("symbol", title="Symbol"),
    alt.Tooltip("gene", title="Gene"),
    alt.Tooltip("consequence", title="Consequence"),
    alt.Tooltip("impact", title="Impact"),
    alt.Tooltip("variants/SVTYPE", title="SVTYPE"),
    alt.Tooltip("locus", title="Locus"),
    alt.Tooltip("fraction0:O", title="frequency0"),
    alt.Tooltip("fraction1:O", title="frequency1"),
    alt.Tooltip("difference:Q", title="Difference"),
]

# Keep only points whose frequencies are at or below both slider values.
within_sliders = (
    (datum.fraction0 <= selector_0.fraction0)
    & (datum.fraction1 <= selector_1.fraction1)
)

chart = (
    alt.Chart(all.to_arrow().to_pandas())
    .mark_circle()
    .encode(
        x=alt.X("fraction1").title("Allele Frequency in RPL"),
        y=alt.Y("fraction0").title("Allele Frequency in non-RPL"),
        color=sv_type_color,
        tooltip=scatter_tooltips,
    )
    .properties(title="Correlation of SVs to RPL vs non-RPL")
    .add_params(selector_0, selector_1)
    .transform_filter(within_sliders)
)

chart
#chart.save("/master/abagwell/figures/SV_allele_frequency.WGS_including_males_RPL_only.selection2.html")


In [None]:
# Bar chart of the per-locus frequency difference (group 0 minus group 1),
# with bars ordered by the "row_nr" rank assigned when `all` was built.
alt.data_transformers.disable_max_rows()

chart = alt.Chart(all.to_arrow().to_pandas()).mark_bar().encode(
    # EncodingSortField takes a plain field name; the "field:type" shorthand is
    # only parsed by encoding channels, so "row_nr:N" referred to a field that
    # does not exist and the requested ordering was never applied.
    alt.X("locus", sort=alt.EncodingSortField(field="row_nr", op="min", order='descending')).title("Locus"),
    alt.Y("difference:Q").title("non-RPL freq. - RPL freq."),
    color = alt.Color("variants/SVTYPE", scale=alt.Scale(domain=['BND', 'DUP', 'DEL', 'INV', 'INS'])).title("SV Type"),
    tooltip=[
        # The allele-number columns are created as "AN" + group id, i.e. AN0 / AN1.
        # Group 0 is plotted as non-RPL and group 1 as RPL elsewhere in this notebook.
        alt.Tooltip("AN0:O", title="non_AN"),
        alt.Tooltip("AN1:O", title="RPL_AN"),
        alt.Tooltip("variants/SVTYPE", title="SVTYPE"),
        alt.Tooltip("locus", title="Locus"),
        alt.Tooltip("difference:Q", title="Difference"),
    ]
).properties(
    title = "SVs Most Correlated to RPL or non-RPL",
    width = 600,
)

chart #.save("/master/abagwell/figures/SV4.html")

In [None]:
# Same difference bar chart, restricted to rows whose chromosome name does not
# contain "QNVO".
alt.data_transformers.disable_max_rows()

without_QNVO = all.filter(
    ~pl.col("variants/CHROM").str.contains("QNVO")
)

chart = alt.Chart(without_QNVO.to_arrow().to_pandas()).mark_bar().encode(
    # Sort by the actual "row_nr" column. The "row_nr:N" shorthand is not parsed
    # inside EncodingSortField, and op="count" is constant because each locus
    # appears once, so the previous sort specification had no effect.
    alt.X("locus", sort=alt.EncodingSortField(field="row_nr", op="min", order='descending')).title("Locus"),
    alt.Y("difference:Q").title("non-RPL freq. - RPL freq."),
    # Same SV-type domain as the other charts, so colours match across figures
    # and any INV record is still assigned a colour.
    color = alt.Color("variants/SVTYPE", scale=alt.Scale(domain=['BND', 'DUP', 'DEL', 'INV', 'INS'])).title("SV Type"),
    tooltip=[
        alt.Tooltip("variants/SVTYPE", title="SVTYPE"),
        alt.Tooltip("locus", title="Locus"),
        alt.Tooltip("difference:Q", title="Difference"),
    ]
).properties(
    title = "SVs Most Correlated to RPL or non-RPL",
    width = 500,
)

chart

In [None]:
# Alt-allele fraction of every SV by position, one point per (variant, group).
alt.data_transformers.disable_max_rows()

# `all` is wide: it has fraction0/fraction1 and AN0/AN1, but no "fraction",
# "group", "0", "1" or "AC" columns. Fold the two fraction columns into
# long form so that "group" and "fraction" exist for the encodings below.
# NOTE(review): X mixes positions from all chromosomes on one axis, as before;
# filter `all` by variants/CHROM first if a per-chromosome view is wanted.
chart = alt.Chart(all.to_arrow().to_pandas()).transform_fold(
    ["fraction0", "fraction1"],
    as_=["group", "fraction"],
).mark_circle().encode(
    alt.X("variants/POS"),
    alt.Y("fraction:Q").title("Fraction"),
    color = alt.Color("group:N"),
    tooltip=[
        # Per-allele counts are not kept in `all`; show the per-group allele numbers instead.
        alt.Tooltip("AN0:O", title="non-RPL AN"),
        alt.Tooltip("AN1:O", title="RPL AN"),
        alt.Tooltip("group:N", title="Group"),
        alt.Tooltip("variants/SVTYPE", title="SVTYPE"),
        alt.Tooltip("variants/POS", title="Position"),
        alt.Tooltip("fraction:Q", title="Fraction Alt"),
    ]
).properties(
    title = "SVs across Rhesus",
    width = 10000,
)

chart