In [None]:
import altair as alt
import pandas as pd
import polars as pl


#path = "/master/abagwell/variant-analysis/results/rhesus/admixture/test/WGS_chr1_admix_by_birth_year.tsv"
path = "/master/abagwell/variant-analysis/results/rhesus/admixture/test/chr1_admix_by_birth_year.tsv"

# Load the per-sample admixture table. The file has no header row, so the four
# columns are named here: sample id, Chinese fraction, Indian fraction, birth date.
data = pl.read_csv(
    path,
    separator="\t",
    new_columns=["id", "Chinese", "Indian", "birth_date"],
    has_header=False,
).filter(
    # Drop every sample whose id starts with "m". `~` is the negation operator
    # and behaves the same across polars versions.
    ~pl.col("id").str.starts_with("m")
).select(
    "id",
    "Chinese",
    "Indian",
    # Birth dates are written month-day-year; only the year is kept.
    pl.col("birth_date").str.to_date("%m-%d-%Y").dt.year().alias("birth_year"),
).sort(
    "birth_year",
)

In [None]:
## Overall admixture histogram, stacked above a bar chart of births per year.
## Dragging an interval over the births chart filters the histogram to those years.

# Interval brush along the x-axis of the births chart.
select_year = alt.selection_interval(encodings=['x'])
# Point selection keyed on sample id, attached to the histogram.
selector = alt.selection_point(fields=["id"])
base = alt.Chart(data.to_arrow().to_pandas())

# Histogram of the Indian fraction (at most 20 bins over [0, 1]).
# NOTE: the variable is still called `heatmap`, although the mark is a bar histogram.
heatmap = base.mark_bar().encode(
    alt.X("Indian", title="Fraction Indian", scale=alt.Scale(domain=[0, 1])).bin(maxbins=20, extent=[0, 1]),
    alt.Y("count(id)", title="Counts"),
).properties(
    title="Indian vs Chinese Admixture in Rhesus Macaques"
).transform_filter(
    select_year
).add_params(
    selector
)

births_per_year = base.mark_bar(width=7).encode(
    alt.X("birth_year:O", title='Birth Year'),
    alt.Y("count()", title='Count of Births'),
    color=alt.value("#66c2a5"),
    tooltip=[
        alt.Tooltip("count()"),
    ]
).properties(
    height = 70,
    width = 500,
    title = ""
).add_params(  # `add_selection` is deprecated in Altair 5; `add_params` matches the histogram above
    select_year
)

heatmap & births_per_year

In [None]:
## Ridgeline plot

# Stack five copies of the table with birth_year shifted by -2 .. +2, so each
# record is counted in its own year and in the two years on either side of it
# (a five-year overlapping window per facet row).
tables = [
    data.with_columns(pl.col("birth_year").add(year_shift))
    for year_shift in range(-2, 3)
]
data_extra_years = pl.concat(tables)

# Layout constants read by ridgeline(): the height of each facet row, and the
# factor by which the y range extends upward beyond its own row.
step = 50
overlap = 1.5

def ridgeline(start):
    """Build the ridgeline chart of Indian fraction, one facet row per birth year.

    Reads the module-level `data_extra_years`, `step` and `overlap`.

    Parameters
    ----------
    start
        Lower bound of the x-axis domain; the upper bound is fixed at 1 and
        values outside the domain are clamped.

    Returns
    -------
    The configured Altair chart (facet spacing 0, title anchored in the middle).
    """
    source = data_extra_years.to_arrow().to_pandas()

    x_indian = alt.X("Indian").bin(maxbins=100).scale(domain=[start, 1], clamp=True).title("Avg. Indian")
    # The y range runs from `step` up to `-step * overlap`, i.e. past the top of
    # the row, which is what lets neighbouring ridges overlap.
    y_count = alt.Y("count(id)").axis(None).scale(range=[step, -step * overlap]).title("Count")
    # Fill each ridge by the mean Indian fraction of its birth year.
    # Optional tuning: .scale(domain=[.35, .36], scheme='blueorange')
    fill_mean = alt.Fill("mean_Indian:Q").legend(None)
    row_year = alt.Row("birth_year:Q").title("Birth Year")
    hover = [alt.Tooltip("count(id)", title="Samples in Bin")]

    ridges = alt.Chart(source, height=step).transform_joinaggregate(
        mean_Indian='mean(Indian)', groupby=['birth_year']
    ).mark_area(
        interpolate='monotone',
        fillOpacity=0.8,
        stroke='lightgray',
        strokeWidth=0.5,
    ).encode(
        x_indian,
        y_count,
        fill_mean,
        row_year,
        tooltip=hover,
    )

    titled = ridges.properties(
        title="Indian Ancestry by Birth Year (overlapping windows of 5 years)",
        bounds='flush',
    )
    return titled.configure_facet(spacing=0).configure_title(anchor='middle')

ridgeline(0) #+ ridgeline(0.8)

In [None]:
## Locus-specific Admixture
import altair as alt
from altair.expr import datum
import polars as pl

#path = "/master/abagwell/variant-analysis/results/rhesus/admixture/RFMix.chr1.msp.tsv"
#path = "/master/abagwell/variant-analysis/results/rhesus/admixture/RFMix.some.msp.tsv"
path = "/master/abagwell/variant-analysis/results/rhesus/admixture/RFMix.all.msp.tsv"

# The first line of the file is skipped; the following line supplies the header.
# NOTE(review): `path` and `data` rebind the names used by the admixture cells
# at the top of the notebook, so those cells must be re-run before their plots.
data = pl.read_csv(path, separator="\t", skip_rows=1)

# Unpack each genotype and sample into its own record
columns = data.columns
# Everything after the six fixed interval columns is treated as a per-sample column.
sample_names = columns[6:]

unpacked = data.select(
    pl.col("#chm").alias("chm"),
    pl.col("spos"),
    pl.col("epos"),
    pl.col("sgpos"),
    pl.col("egpos"),
    pl.col("n snps"),
    # One list of calls per interval ...
    pl.concat_list(pl.exclude("#chm", "spos", "epos", "sgpos", "egpos", "n snps")).alias("genotype"),
    # ... and the matching list of column names, repeated for every interval,
    # so that both lists can be exploded side by side.
    pl.Series([sample_names] * len(data)).alias("sample"),
).explode("genotype", "sample").with_columns(
    (pl.col("epos") - pl.col("spos")).alias("length"),
    pl.lit(0).alias("0"),  # Column of zeroes: baseline of the y-axis in the bar figure
)#.sample(1000)

In [None]:
# Find intervals whose calls disagree between the WES and WGS runs of the same
# animal. These intervals are removed further down.
#
# The sample label is parsed as: first three characters -> seq_type, and the
# text between the last "S" and the first "." -> id.
discordant = unpacked.with_columns(
    pl.col("sample").str.slice(0, 3).alias("seq_type"),
    pl.col("sample").str.split("S").arr.last().str.split(".").arr.first().alias("id"),
).groupby(
    "id",
    "chm",
    "spos",
    "epos",
    #maintain_order=True
).agg(  # Collect every record of this id/interval into parallel lists
    "seq_type",
    "genotype",
    # Carry the original label through. Rebuilding it from seq_type + id +
    # genotype appended the call value rather than the label's own suffix.
    "sample",
).with_columns(  # Not required. Just to verify length of lists
    pl.col("seq_type").arr.lengths().alias("length")
).filter(  # Keeps only samples with both WES and WGS (4 records per interval)
    pl.col("seq_type").arr.lengths() == 4
).filter(  # Keep only discordant regions between WES and WGS genotypes
    # NOTE(review): this assumes list positions 0/1 and 2/3 belong to the two
    # sequencing types in matching order - confirm against the column order of
    # the input file. With `&`, an interval is kept only when BOTH position
    # pairs differ; confirm that `|` (either pair differs) is not the intent.
    (pl.col("genotype").arr.get(0) != pl.col("genotype").arr.get(2)) &
    (pl.col("genotype").arr.get(1) != pl.col("genotype").arr.get(3)),
).explode(  # Reverse the groupby
    "seq_type",
    "genotype",
    "sample",
).select(  # Same column order as before: keys, exploded values, list length, label
    "id",
    "chm",
    "spos",
    "epos",
    "seq_type",
    "genotype",
    "length",
    "sample",
)

# Drop every interval that appears in `discordant`.
# NOTE(review): the anti-join keys do not include the sample, so an interval that
# is discordant for one animal is removed for all samples - confirm this is intended.
masked = unpacked.join(discordant, on=["chm", "spos", "epos"], how="anti")

# Group bordering intervals that have the same genotype on the same chromosome for the same sample.
# The frame is sorted by sample, chromosome and start position so that bordering
# intervals are guaranteed to sit on consecutive rows; sorting by "sample" alone
# relied on the previous row order surviving the sort.
grouped = masked.sort(["sample", "chm", "spos"]).with_columns(
    (
        # True on a row whose successor is NOT a continuation of it ...
        (pl.col("epos") != pl.col("spos").shift(-1)) |
        (pl.struct("sample", "chm", "genotype") != pl.struct("sample", "chm", "genotype").shift(-1))
    # ... shifted down one row so the flag marks the first row of each new run;
    # the running sum then gives each run its own group number.
    ).shift_and_fill(periods=1, fill_value=False).cumsum().alias("group"),
).groupby("group").agg(
    pl.col("chm").first(),
    pl.col("spos").min().alias("spos"),
    pl.col("epos").max().alias("epos"),
    pl.col("genotype").first(),
    pl.col("sample").first(),
    # Length of the merged run. `first()` reported only the first interval's length.
    (pl.col("epos").max() - pl.col("spos").min()).alias("length"),
)#.sort("group")

discordant

In [None]:
# discordant.filter(  # Remove WES that also have WGS
#     pl.col("seq_type") == "WGS"
# )

In [None]:
## Long variable-width bar plot with dropdown for chromosome selection

alt.data_transformers.disable_max_rows()

# Right edge of the x-axis: the largest end position over all chromosomes.
xlim = unpacked.select("epos").max().item()

# For chromosome selection.
# `unique()` does not promise any ordering, so sort to get a stable dropdown.
chromosomes = unpacked.select("chm").unique().sort("chm").to_series().to_list()
input_dropdown = alt.binding_select(options=chromosomes, name='Chromosome')
# Start on chromosome 1 when present; otherwise on the first available one so
# the initial view is not empty.
default_chromosome = 1 if 1 in chromosomes else next(iter(chromosomes), 1)
chr_selection = alt.param(value=default_chromosome, bind=input_dropdown)

# One bar per interval: x spans spos..epos, height is the mean call over all samples.
alt.Chart(masked.to_arrow().to_pandas()).mark_bar(point=True).encode(
    alt.X("spos", title="Chromosomal Position (bp)").scale(domain=[0, xlim], clamp=True),
    alt.X2("epos"),
    alt.Y("0:Q"),
    alt.Y2("mean_genotype:Q", title="Indian Admixture"),
    alt.Fill("mean_genotype:Q").legend(None),
    alt.Row("chm", title="Chromosome"),
    tooltip=[
        alt.Tooltip("mean_genotype:Q", title="Mean"),
        alt.Tooltip("spos", title="Start"),
        alt.Tooltip("epos", title="End"),
    ]
).transform_aggregate(
    mean_genotype='mean(genotype)',
    groupby=["spos", "epos", "0", "chm"],
).properties(
    title="Indian to Chinese Admixture Across Chromosome",
    width=20000
).add_params(
    chr_selection,
).transform_filter(
    (datum.chm == chr_selection)
).save("admixture_variable-width_bar.html")

In [None]:
## For Manhattan plot
# Largest end position seen on each chromosome, ordered by chromosome.
chr_lengths = unpacked.select("chm", "epos").groupby("chm").max().sort("chm")

# Running sum of chromosome end positions: each chromosome's offset is the
# combined length of all chromosomes that come before it (the first gets 0).
offsets = {}
offset = 0
for chromosome, chromosome_end in chr_lengths.rows():
    offsets[chromosome] = offset
    offset += chromosome_end

# Offset to add to any position, looked up from the row's chromosome.
chm_offset = pl.col("chm").map_dict(offsets)

# Add offsets
concatenated = masked.with_columns(
    (pl.col("spos") + chm_offset).alias("concat_spos"),
)

grouped_concat = grouped.with_columns(
    (pl.col("spos") + chm_offset).alias("concat_spos"),
    (pl.col("epos") + chm_offset).alias("concat_epos"),
)

In [None]:
## Manhattan plot
alt.data_transformers.disable_max_rows()

# Hover details for each point.
manhattan_tooltips = [
    alt.Tooltip("mean_genotype:Q", title="Mean"),
    alt.Tooltip("chm", title="Chr"),
    alt.Tooltip("spos", title="Start"),
    alt.Tooltip("epos", title="End"),
]

# One point per interval, placed at its offset start position; the y value is
# the mean call over all samples, and colour distinguishes the chromosomes.
manhattan_points = alt.Chart(concatenated.to_arrow().to_pandas()).mark_point(filled=True).encode(
    alt.X("concat_spos").title("Chromosomal Position (bp)"),
    alt.Y("mean_genotype:Q").title("Indian Admixture"),
    color=alt.Color("chm:O").scale(scheme='dark2').legend(None),
    tooltip=manhattan_tooltips,
).transform_aggregate(
    mean_genotype='mean(genotype)',
    groupby=["spos", "epos", "concat_spos", "chm"],
)

manhattan_points.properties(
    title="Indian to Chinese Admixture Across Chromosome",
    width=1000,
) #.save("admixture_Manhattan.html")

In [None]:
## Horizontal bar graph. Each individual has two bars for each haplotype
import pandas as pd


alt.data_transformers.disable_max_rows()

# Hover details for each merged run.
run_tooltips = [
    alt.Tooltip("chm", title="Chr"),
    alt.Tooltip("spos", title="Start (bp)"),
    alt.Tooltip("epos", title="End (bp)"),
    alt.Tooltip("length", title="Length"),
]

# One horizontal bar per merged run, one row per sample label, coloured by call.
chart = alt.Chart(grouped_concat.to_arrow().to_pandas()).mark_bar().encode(
    alt.X("concat_spos", title="Start"),
    alt.X2("concat_epos", title="End"),
    alt.Y("sample", title="Sample"),
    color=alt.Color("genotype:N", scale=alt.Scale(range=['#1f77b4', 'orange'])),
    tooltip=run_tooltips,
).properties(
    width=3000,
)

# Red rules where each chromosome after the first begins (the first offset is skipped).
boundary_positions = pd.DataFrame({'x': list(offsets.values())[1:]})
chromosome_boundaries = alt.Chart(boundary_positions).mark_rule(color="red", strokeWidth=8).encode(x='x')

(chart + chromosome_boundaries) #.save("admixture_indiv_grouped.html")