In [144]:
import altair as alt
import polars as pl

# Number of ancestral populations (K) of the ADMIXTURE run to plot.
# Change the k value to run for different k-clusters; the matching `.{k}.Q` file is loaded below.
# Run first with the ideal value of k before others. That way, the sorting of the others will align with that of the ideal k
k = 2

# Ancestry fractions written by ADMIXTURE: headerless, space-separated.
# The file name carries the K value, so it is built from `k` rather than hard-coded.
Q = pl.read_csv(
    f"/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.{k}.Q",
    has_header=False, separator=" ")

# PLINK .fam file: headerless, space-separated; the six columns are named here.
# IDs are forced to strings so they are not parsed as numbers.
fam = pl.read_csv("/master/abagwell/variant-analysis/results/rhesus/genotypes/pruned/plink/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.fam",
    has_header=False, separator=" ", schema_overrides={"Indiv": pl.String, "Sire": pl.String, "Dam": pl.String}, new_columns=["Fam", "Indiv", "Sire", "Dam", "Sex", "Phenotype"])

# Demographics table (tab-separated, with header); only the animal ID and the
# date of birth (parsed from MM-DD-YYYY text) are kept.
demographics = pl.read_csv("/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-17_10-28-20.tsv",
    has_header=True, separator="\t", schema_overrides={"Id": pl.String, "Sire": pl.String, "Dam": pl.String},
).with_columns(
    pl.col("Date of Birth").str.to_date("%m-%d-%Y")
).select("Id", "Date of Birth")

# Disabled alternative that built `colonies` from the MML groups table; superseded by
# the founder_origins.tsv read below and kept for reference only.
# colonies = pl.read_csv("/master/abagwell/variant-analysis/resources/rhesus/pop/MML_groups_from_Martha.fixed4.with_Brooks_origin.tsv",
#     has_header=True, separator="\t", infer_schema_length=1_000
# ).filter(
#     pl.col("Interval").is_in(["Founders", "Founders2"])
# ).with_columns(
#     pl.concat_str([
#         pl.col("Origin"),
#         pl.lit(" Colony "),
#         pl.col("Colony"),
#     ])
# ).select("Id", "Origin")

# Founder origin per animal (tab-separated, with header). Later cells use its
# "Indiv" and "Origin" columns.
colonies = pl.read_csv("/master/abagwell/variant-analysis/resources/rhesus/pop/founder_origins.tsv",
    has_header=True, separator="\t", infer_schema_length=1_000
)

In [145]:
# Put each animal ID from the .fam file next to its row of ancestry fractions.
# NOTE(review): this relies on the .Q rows being in the same order as the .fam rows — confirm.
concat_data = pl.concat([fam.select("Indiv"), Q], how="horizontal")

# One label per ancestry column: pop1 ... pop{k}.
pop_labels = [f"pop{pop_number}" for pop_number in range(1, k+1)]

# Reshape to long format (one row per animal and population), then attach the
# date of birth and the founder origin of each animal.
data = (
    concat_data
    .with_columns(admixture=pl.concat_list(pl.exclude('Indiv')))
    .with_columns(pops=pop_labels)
    .explode("admixture", "pops")
    .join(demographics, left_on="Indiv", right_on="Id")
    .select("Indiv", "admixture", "pops", "Date of Birth")
    .join(colonies, left_on="Indiv", right_on="Indiv")
    .with_columns(
        # Strip the non-numeric "8X" text from animal names so the IDs parse as integers
        pl.col("Indiv").str.replace("8X", "").str.to_integer()
    )
)

In [146]:
# Disabled: earlier approach that derived the plotting order inside this notebook.
# # For this particular dataset, k=3 has the lowest CV error, so all plots will use this ordering.
# # Note that this requires running for k=3 first before other values of k
# if k == 3:
#     sample_order = concat_data.sort(["column_1", "column_3", "column_2"], descending=True).with_row_index().select("Indiv", "index").with_columns(
#         # Remove non-numeric parts of animal names
#         pl.col("Indiv").str.replace("8X", "").str.to_integer()
#     )

# # Join index to table
# data = data.join(sample_order, on="Indiv")

# The plotting order is read from a file under the unsupervised results directory.
# The file is a .tsv, so the separator must be the tab character "\t"; a bare "t"
# would split every line on the letter t instead.
# NOTE(review): the following cell needs the columns "Indiv" and "index" from this
# file. If the file was itself written with separator="t", regenerate it as a real TSV.
sample_order = pl.read_csv("/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/unsupervised/unsupervised_order.tsv", separator="\t")

In [149]:
# Attach each animal's plotting position ("index") and sort the rows by it.
data = (
    data
    .join(sample_order, on="Indiv")
    .sort("index")
)

In [150]:
# Rename the origins to the labels shown in the figure.
# (Probably the actual source file should be changed instead.)
# The order matters: "Brooks Colony 1" is a substring of "non-Brooks Colony 1",
# so the longer pattern has to be replaced first.
origin_renames = [
    ("non-Brooks Colony 1", "Early founders"),
    ("Brooks Colony 1", "Brooks source"),
    ("non-Brooks Colony 2", "NEPRC source"),
]

origin_expr = pl.col("Origin")
for old_label, new_label in origin_renames:
    origin_expr = origin_expr.str.replace(old_label, new_label)

data = data.with_columns(origin_expr)

In [151]:
# Split the table into one DataFrame per origin.
origin_groups = data.partition_by("Origin")
# Manual reordering (third, second, first group) to make the figure look like the
# unsupervised plot. This expects at least three origin groups.
partitions = [origin_groups[position] for position in (2, 1, 0)]

In [156]:
# Build one stacked bar chart per origin group; the panels are collected in
# `subplots` and concatenated side by side afterwards.
# The panel title is read from each partition directly, so the `colonies` table
# loaded in the first cell is not overwritten and the earlier cells stay re-runnable.
subplots = []

for idx, partition in enumerate(partitions):
    # All rows of a partition share one origin, so the first value names the panel.
    origin_title = partition["Origin"][0]

    # Only the first (left-most) panel keeps its y-axis labels, ticks and title.
    if idx == 0:
        axis = alt.Axis()
    else:
        axis = alt.Axis(labels=False, title=None, ticks=False)

    subplot = alt.Chart(partition).mark_bar(width=19).encode(
        # One bar per animal, ordered by the precomputed "index" column; the x-axis
        # decorations are hidden.
        alt.X("Indiv:N", title="Animals", axis=alt.Axis(labels=False, ticks=False, title=None),
              sort=alt.EncodingSortField(field='index', order='ascending')),
        # Ancestry fractions on a fixed 0-1 scale so all panels are comparable.
        alt.Y("admixture:Q", title=f"k={k}", axis=axis).scale(domain=[0,1]),
        color=alt.Color("pops:N", title="Population", legend=None),
        tooltip="Indiv",
    ).properties(
        title=origin_title
    )
    subplots.append(subplot)

In [None]:
# Place the per-origin panels side by side and colour the populations with the
# "category10" scheme.
combined = alt.hconcat(*subplots)
styled = combined.configure_range(category={'scheme': 'category10'})

# properties() is called without arguments; an overall title could be passed here, e.g.
#   title=f"Unsupervised Admixture of Rhesus Founders, k={k}"
styled = styled.properties()

# Centre the panel titles. The chart is the last expression so the notebook displays it.
# To export instead, append:
#   .save(f"/master/abagwell/figures/admixture/U42_WES/U42_WES.all2_Indian-Chinese_merged.admixture.barplot.{k}.html")
styled.configure_title(anchor="middle")