In [102]:
import altair as alt
from altair import datum
import polars as pl

# Load per-individual global ancestry fractions into `admixture`
# (columns: Indiv, Chinese, Indian) from either RFMix or ADMIXTURE output.

# Specify which tool is being used
#tool = "RFMix"
tool = "ADMIXTURE"

# For when RFMix is used per chromosome
if tool == "RFMix":
    chromosomes = [str(number) for number in range(1, 21)]
    chromosome_lengths = [223_616_942, 196_197_964, 185_288_947, 169_963_040, 187_317_192, 179_085_566, 169_868_564, 145_679_320, 134_124_166, 99_517_758, 133_066_086, 130_043_856, 108_737_130, 128_056_306, 113_283_604, 79_627_064, 95_433_459, 74_474_043, 58_315_233, 77_137_495]  # For Mmul_10
    if len(chromosomes) != len(chromosome_lengths):
        raise ValueError("chromosomes and chromosome_lengths must have the same number of entries")
    genome_length = sum(chromosome_lengths)

    Qs = []
    for chrom, chrom_length in zip(chromosomes, chromosome_lengths):
        #Q_file = f"/master/abagwell/variant-analysis/results/rhesus/admixture/SHAPEIT5_merged/U42_WGS_WES.RFMix.chr{chrom}.rfmix.Q"
        Q_file = f"/master/abagwell/variant-analysis/results/rhesus/admixture/RFMix/U42_WES.RFMix.chr{chrom}.rfmix.Q"
        # Lines starting with "#" are skipped, so the column names (and their order) are assigned here
        # rather than read from the file -- verify the order against the file when inputs change
        Q = pl.read_csv(Q_file, separator="\t", comment_prefix="#", has_header=False, new_columns=["Indiv", "Chinese", "Indian"], schema_overrides=[pl.String, pl.Float32, pl.Float32]).with_columns(
            # Change fractions to lengths so each chromosome is weighted by its size when merged
            pl.col("Chinese").mul(chrom_length),
            pl.col("Indian").mul(chrom_length),
            chrom = pl.lit(chrom),
        )
        Qs.append(Q)
    admixture = pl.concat(Qs).group_by(
        # Merge chromosomes
        "Indiv"
    ).agg(pl.sum("Chinese"), pl.sum("Indian")).with_columns(
        # Change lengths back to fractions
        pl.col("Chinese").truediv(genome_length),
        pl.col("Indian").truediv(genome_length),
    ).sort("Indiv")
# For when ADMIXTURE is used from one file (not split by chromosome)
elif tool == "ADMIXTURE":
    Q_file = "/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.2.Q"
    #Q_file = "/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/U42_WES.all2.SNP.autosomal.3.Q"
    # Make sure to verify that the "new_columns" are in the correct order on subsequent runs
    Q = pl.read_csv(Q_file, separator=" ", has_header=False, new_columns=["Indian", "Chinese"])

    fam_file = "/master/abagwell/variant-analysis/results/rhesus/genotypes/pruned/plink/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.fam"
    #fam_file = "/master/abagwell/variant-analysis/results/rhesus/genotypes/pruned/plink/U42_WES.all2.SNP.autosomal.fam"
    fam = pl.read_csv(
        fam_file,
        has_header=False,
        separator=" ",
        schema_overrides={"Indiv": pl.String, "Sire": pl.String, "Dam": pl.String},
        new_columns=["Fam", "Indiv", "Sire", "Dam", "Sex", "Phenotype"],
    )

    # The .Q table is read without an ID column; its rows are matched to the .fam rows purely by position.
    # A horizontal concat can pad the shorter frame with nulls instead of failing, so check the row counts first.
    if fam.height != Q.height:
        raise ValueError(
            f"Row count mismatch: {fam.height} individuals in the .fam file but {Q.height} rows in the .Q file"
        )

    # Join tables
    admixture = pl.concat([fam.select("Indiv"), Q], how="horizontal")
else:
    # Fail early instead of leaving `admixture` undefined for the cells below
    raise ValueError(f"Unknown tool {tool!r}; expected 'RFMix' or 'ADMIXTURE'")

In [117]:
# Save the per-individual ancestry table as a tab-separated file.
# NOTE(review): the previous comment here said "Remove individuals that are descended from 7893",
# but no filtering happens in this cell -- confirm whether that step is still needed.
# The output path belongs to the supervised ADMIXTURE run, so only write when that tool
# produced `admixture`; otherwise RFMix results would overwrite the ADMIXTURE table.
if tool == "ADMIXTURE":
    admixture.write_csv("/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.2.Q.tsv", separator="\t")

In [104]:
# admixture = admixture.group_by(
#     # Average ploidy
#     "Indiv", maintain_order=True
# ).agg(pl.last("Chinese"), pl.last("Indian"))
# # .sort("indiv", "seq").group_by(
# #     # Keep one of each animal
# #     "indiv", "contig_id", "variant_position", maintain_order=True).agg(pl.last("seq"), pl.last("call_genotype")
# # )

In [105]:
# admixture = pl.concat(Qs
#     # Change to fraction to length
# ).group_by(
#     # Merge chromosomes
#     "sample"
# ).agg(pl.sum("Chinese"), pl.sum("Indian")).with_columns(
#     # Change lengths back to fractions
#     pl.col("Chinese").truediv(sum(chromosome_lengths)),
#     pl.col("Indian").truediv(sum(chromosome_lengths)),
# ).sort("sample").group_by(
#     # Average ploidy
#     "sample", maintain_order=True
# ).agg(pl.last("Chinese"), pl.last("Indian"))
# # .sort("indiv", "seq").group_by(
# #     # Keep one of each animal
# #     "indiv", "contig_id", "variant_position", maintain_order=True).agg(pl.last("seq"), pl.last("call_genotype")
# # )

In [106]:
# For when sample names have seq prefixed

# admixture = pl.concat(Qs
# ).filter(
#     pl.col("sample").str.starts_with("W")
# ).with_columns(
#     # Pull out animal id
#     seq = pl.col("sample").str.slice(0, 3),# .cast(pl.Enum(seq_order)).alias("seq"),
#     indiv = pl.col("sample").str.slice(3).str.split("_").list.get(0),
#     # Change to fraction to length
# ).group_by(
#     # Merge chromosomes
#     "sample", "seq", "indiv"
# ).agg(pl.sum("Chinese"), pl.sum("Indian")).with_columns(
#     # Change lengths back to fractions
#     pl.col("Chinese").truediv(sum(chromosome_lengths)),
#     pl.col("Indian").truediv(sum(chromosome_lengths)),
# ).sort("indiv", "seq").group_by(
#     # Average ploidy
#     "indiv", maintain_order=True
# ).agg(pl.last("Chinese"), pl.last("Indian"))
# # .sort("indiv", "seq").group_by(
# #     # Keep one of each animal
# #     "indiv", "contig_id", "variant_position", maintain_order=True).agg(pl.last("seq"), pl.last("call_genotype")
# # )

In [107]:
# Add birthdates
demographics_file = "/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-17_10-28-20.tsv"

demographics = pl.read_csv(
    demographics_file,
    separator="\t",
    infer_schema_length=None,
    columns=["Id", "Date of Birth"],
    # Pin the join key to String so it always matches the String "Indiv" column,
    # instead of depending on what dtype inference makes of the IDs
    schema_overrides={"Id": pl.String},
).with_columns(
    # Change str to date type, then round to the nearest year so animals can be grouped by birth year
    pl.col("Date of Birth").str.to_date("%m-%d-%Y").dt.round("1y"),
    #pl.col("Date of Death").str.to_date("%m-%d-%Y"),
)

# Default (inner) join: animals without a matching demographics row are dropped
dated_admixture = admixture.join(demographics, left_on="Indiv", right_on="Id")

In [None]:
# Show the animals whose ID contains an "X"
dated_admixture.filter(
    pl.col("Indiv").str.contains("X")
)

In [109]:
#dated_admixture.write_csv("/master/abagwell/figures/admixture/admixture.tsv", separator='\t')

In [None]:
# Display the ancestry table after the birth dates have been joined on
dated_admixture

In [None]:
# Dot plot: Chinese ancestry fraction of each animal against its birth year

# Note that the birth dates in `dated_admixture` were rounded to the nearest year when loaded
base = alt.Chart(dated_admixture)

plot = (
    base
    .mark_point(width=10, shape="circle", strokeWidth=2, size=75)  # mark_boxplot
    .encode(
        x=alt.X("Date of Birth", title="Birth Year"),  #scale=alt.Scale(domain=[1984, 2024])
        y=alt.Y("Chinese"),
        tooltip=[alt.Tooltip("Indiv")],
    )
    .properties(title="Chinese Admixture in Rhesus", width=500)
)

# Optional label layer: animal IDs next to the points with a Chinese fraction of at least 0.18
text = (
    plot
    .mark_text(align='left', baseline='middle', dx=7)
    .encode(text='Indiv')
    .transform_filter(datum.Chinese >= 0.18)
)

# The label layer is currently left off; use `plot + text` to show it
plot# + text
#plot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.dotplot.html")
#plot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.dotplot.svg")


In [None]:
# Box plot: distribution of the Chinese ancestry fraction for each (rounded) birth year

boxplot = (
    alt.Chart(dated_admixture)
    .mark_boxplot()
    .encode(
        x=alt.X("Date of Birth", title="Birth Year"),
        y=alt.Y("Chinese", title="Chinese Fraction"),
    )
    .properties(title=["Chinese Admixture in Rhesus"], width=600)
)

boxplot

#boxplot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.boxplot.html")
#boxplot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.boxplot.svg")

In [None]:
# Table with one row per animal: its 0.1-wide Chinese-ancestry bin and the number of
# animals that share its (rounded) birth year.

# Index (0-9) of the 0.1-wide bin each Chinese fraction falls in.
# Clipping keeps a fraction of exactly 1.0 in the top bin ("0.9 - 1.0").
tenth_bin = pl.col("Chinese").mul(10).floor().clip(0, 9)

dated_admixture.with_columns(
    # Both edges come from the same bin index, so a fraction sitting exactly on an edge
    # (e.g. 0.3) is labelled "0.3 - 0.4" rather than the zero-width "0.3 - 0.3"
    floor = tenth_bin.truediv(10).cast(pl.String).str.slice(0, 3),
    ceil = tenth_bin.add(1).truediv(10).cast(pl.String).str.slice(0, 3),
).with_columns(
    range = pl.concat_str([
        pl.col('floor'),
        pl.lit(' - '),
        pl.col('ceil'),
    ])
).group_by(
    'Date of Birth'
).agg(
    'Indiv',
    'range',
    # pl.len() replaces the deprecated pl.count(); the alias keeps the column name "count"
    pl.len().alias("count"),
).explode("Indiv", 'range')

In [None]:
# Plot stacked bar plot: number of animals per birth year, coloured by 0.1-wide Chinese-ancestry bin
# Toggle on the `alt.Y` lines to switch whether or not the bars are stacked by proportion

# Index (0-9) of the 0.1-wide bin each Chinese fraction falls in.
# Clipping keeps a fraction of exactly 1.0 in the top bin ("0.9 - 1.0").
tenth_bin = pl.col("Chinese").mul(10).floor().clip(0, 9)

stacked_barplot = alt.Chart(
    dated_admixture.with_columns(
        # Both edges come from the same bin index, so a fraction sitting exactly on an edge
        # (e.g. 0.3) is labelled "0.3 - 0.4" instead of creating a zero-width "0.3 - 0.3" category
        floor = tenth_bin.truediv(10).cast(pl.String).str.slice(0, 3),
        ceil = tenth_bin.add(1).truediv(10).cast(pl.String).str.slice(0, 3),
    ).with_columns(
        range = pl.concat_str([
            pl.col('floor'),
            pl.lit(' - '),
            pl.col('ceil'),
        ])
    )
).mark_bar(width=10).encode(
    alt.X("Date of Birth", title="Birth Year"),
    # By proportion:
    #alt.Y("count()", title="Percentage of Animals").stack("normalize"),
    # By count:
    alt.Y("count()", title="Number of Animals"),
    color=alt.Color("range:N", title="Chinese Ancestry").scale(scheme="lightmulti") #sort=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
).properties(
    title=["Chinese Ancestry in Rhesus"],
    width=600,
)

stacked_barplot#.save("/master/abagwell/figures/admixture/U42_WES/U42_WES.all2_Indian-Chinese_merged.stacked_barplot.html")

In [None]:
# Attach colony/cohort information and keep only the animals that have an "Interval" value
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/MML_groups_from_Martha.fixed4.tsv"
colonies = pl.read_csv(
    colonies_file,
    separator="\t",
    infer_schema_length=None,
)#.join(dates, on="Id", how="left")

joined = (
    dated_admixture
    .join(colonies, how="left", left_on="Indiv", right_on="Id")
    .filter(pl.col("Interval").is_not_null())
)#.drop("tag")

In [None]:
# Box plot of the Chinese ancestry fraction per "Interval" cohort, coloured by "Colony"
# NOTE(review): the colour legend for "Colony" is titled "Cohort", the same as the x axis -- confirm this is intended
grouped_boxplot = (
    alt.Chart(joined)
    .mark_boxplot()
    .encode(
        x=alt.X(
            "Interval:N",
            title="Cohort",
            # Explicit cohort order for the x axis
            sort=['Founders', 'pre-2003', '2003-2005', '2006-2008', '2009-2011', '2012-2014', '2015-2017', '2018-2020', 'Founders2', 'Merged'],
        ),
        #x=alt.X("Date of Birth"),
        y=alt.Y("Chinese", title="Chinese Admixture"),
        color=alt.Color("Colony:N", title="Cohort"),
    )
    .properties(
        title=["Chinese Admixture in Rhesus"],
        #width=500,
    )
)


grouped_boxplot#.save("/master/abagwell/figures/admixture/U42_WES/U42_WES.all2_Indian-Chinese_merged.grouped_barplot.html")

#grouped_boxplot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.grouped_boxplot.html")
#grouped_boxplot.save("/master/abagwell/figures/WES_ancestry/WES.admixutre.grouped_boxplot.svg")