In [1]:
import altair as alt
import polars as pl

# --- Run configuration ---------------------------------------------------
# Change the k value to plot a different number of clusters (k is read from
# the width of the Q table later in the notebook).
# Run the ideal k first: the sample ordering of the other runs can then be
# aligned with the ordering obtained for the ideal k.

# Column used to split the plot into side-by-side panels.
partitioned_by = "Interval"
#partitioned_by = "Origin"

# Dataset prefix; in this cell it only appears in the commented-out path templates.
file_prefix = "U42_WES.all2.SNP.autosomal"
#file_prefix = "U42_WES.all2_Indian-Chinese_merged.SNP.autosomal"

# Inputs for the Indian vs. Chinese supervised admixture run.
Q_file = "/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/U42_WES.all2_Indian-Chinese_merged.SNP.autosomal.Q"
fam_file = "/master/abagwell/variant-analysis/results/rhesus/genotypes/pruned/plink/U42_WES.all2.SNP.autosomal.fam"

# Inputs for the projection run (swap with the pair above to use them).
#Q_file = "/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/projection/U42_WES.all2_founders2_pruned_subset.SNP.autosomal.3.Q"
#fam_file = "/master/abagwell/variant-analysis/results/rhesus/genotypes/subset/plink/U42_WES.all2_founders2_pruned_subset.SNP.autosomal.fam"

# --- Load the Q table and the matching .fam file -------------------------
# Both files are headerless and space-separated. Without a header, polars
# names the Q columns column_1, column_2, ...
# Path templates that could be used instead of Q_file:
#   f"/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/supervised/{file_prefix}.Q"
#   f"{file_prefix}.Q"
Q = pl.read_csv(Q_file, has_header=False, separator=" ")

# Path template that could be used instead of fam_file:
#   f"/master/abagwell/variant-analysis/results/rhesus/genotypes/pruned/plink/{file_prefix}.fam"
fam_columns = ["Fam", "Indiv", "Sire", "Dam", "Sex", "Phenotype"]
# Animal identifiers are read as text rather than inferred as numbers.
fam_id_dtypes = {id_column: pl.String for id_column in ("Indiv", "Sire", "Dam")}
fam = pl.read_csv(
    fam_file,
    has_header=False,
    separator=" ",
    schema_overrides=fam_id_dtypes,
    new_columns=fam_columns,
)

# Demographics table reduced to the animal ID and its date of birth.
# ID-like columns are read as text; "Date of Birth" is parsed from
# month-day-year strings into a polars Date.
demographics = pl.read_csv(
    "/master/abagwell/variant-analysis/resources/rhesus/pedigree/Demographics_2024-04-17_10-28-20.tsv",
    has_header=True,
    separator="\t",
    schema_overrides={"Id": pl.String, "Sire": pl.String, "Dam": pl.String},
).select(
    "Id",
    pl.col("Date of Birth").str.to_date("%m-%d-%Y"),
)

# Founder-origin table; it is later left-joined onto the admixture data by "Indiv".
colonies = pl.read_csv(
    "/master/abagwell/variant-analysis/resources/rhesus/pop/founder_origins.tsv",
    has_header=True,
    separator="\t",
    infer_schema_length=1_000,
    # Read animal IDs as text, like every other table in this notebook.
    # Leaving the dtype to inference can yield an integer column whenever the
    # sampled rows happen to be all-numeric, which would then not match the
    # string "Indiv" key used for the join.
    schema_overrides={"Indiv": pl.String},
)

In [None]:
# Number of populations = number of columns in the Q table.
k = Q.width

# Place each animal ID beside its row of Q values. The two tables are glued
# together by row position, so they must list the animals in the same order.
concat_data = pl.concat([fam.select("Indiv"), Q], how="horizontal")

# One label per Q column: pop1 ... pop<k>.
pop_labels = [f"pop{pop_number}" for pop_number in range(1, k + 1)]
# Alternative descriptive labels for a three-population run:
#pop_labels = ["Early founders", "Brooks source", "NEPRC source"]

# Reshape to long format (one row per animal and population), then attach the
# date of birth and the founder origin. Animals with no origin entry are
# labelled "Other".
data = (
    concat_data
    .with_columns(admixture=pl.concat_list(pl.exclude("Indiv")))
    .with_columns(pops=pop_labels)
    .explode("admixture", "pops")
    .join(demographics, left_on="Indiv", right_on="Id")
    .select("Indiv", "admixture", "pops", "Date of Birth")
    .join(colonies, left_on="Indiv", right_on="Indiv", how="left")
    .with_columns(pl.col("Origin").fill_null("Other"))
)

# Optional follow-up steps that are currently disabled:
#   - take out reference animals:
#       data = data.filter(pl.col("Origin") == "Other")
#   - remove non-numeric parts of animal names:
#       data = data.with_columns(pl.col("Indiv").str.replace("8X", "").str.to_integer())

In [None]:
# Show the long-format admixture table as this cell's output.
data

In [None]:
# Order animals by their Q values, comparing column_1 first, then column_2,
# and so on, and record each animal's rank in a "sample_order" column.
q_columns = [f"column_{pop_number}" for pop_number in range(1, k + 1)]
sample_order = concat_data.sort(q_columns).with_row_index("sample_order")

# Hard-coded equivalents for specific values of k:
#sample_order = concat_data.sort("column_1", "column_2", "column_3").with_row_index("sample_order")
#sample_order = concat_data.sort("column_1", "column_2").with_row_index("sample_order")

sample_order

In [5]:
# Load the predefined birth-year interval assigned to each animal.
# TODO: Generalize this by deriving the intervals from the dates of birth
#       instead of reading them from a file.
year_intervals_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/MML_groups_from_Martha.fixed6.tsv"
year_intervals = pl.read_csv(
    year_intervals_file,
    separator="\t",
    columns=["Id", "Interval"],
    # Scan the whole file when inferring dtypes.
    infer_schema_length=None,
    # Animal IDs stay text so they match the "Indiv" key of the other tables.
    schema_overrides={"Id": pl.String},
)

In [6]:
# Attach each animal's Interval. This is an inner join, so animals without a
# row in year_intervals are dropped. The cell reassigns `data`, so run it
# only once per kernel session.
data = data.join(year_intervals, left_on="Indiv", right_on="Id", how="inner")

In [None]:
# Show `data` again, now including the "Interval" column added by the join above.
data

In [8]:
# Export the pop1 fraction of every animal, sorted by animal ID, as a TSV.
# NOTE(review): the output file name assumes pop1 is the Indian ancestry
# component -- confirm against the population order of the ADMIXTURE run.
pop1_rows = data.filter(pl.col("pops") == "pop1")
pop1_table = pop1_rows.select("Indiv", "admixture", "Date of Birth", "Interval").sort("Indiv")
pop1_table.write_csv("/master/abagwell/figures/admixture/Indian_admixture.tsv", separator="\t")

In [None]:
# Attach a plotting order to `data` and split it into one frame per panel,
# depending on the `partitioned_by` setting. The cell reassigns `data`, so
# run it only once per kernel session.
if partitioned_by == "Origin":
    # Reuse the sample order saved from the unsupervised run so that this plot
    # lines up with the unsupervised one.
    sample_order = pl.read_csv(
        "/master/abagwell/variant-analysis/results/rhesus/admixture/ADMIXTURE/unsupervised/unsupervised_order.tsv",
        schema_overrides={"Indiv": pl.String},
        separator="\t",
    )

    # NOTE(review): this relies on the TSV providing an "index" column. The
    # plotting cell sorts bars by a "sample_order" field -- confirm the TSV
    # provides that column as well.
    data = data.join(sample_order, on="Indiv", how="left").sort("index")
    partitions = data.partition_by(partitioned_by)

    # Optional manual panel order, e.g. to get
    # ["Early founders", "Brooks source", "NEPRC source"], optionally with "Other":
    #partitions = [partitions[3], partitions[2], partitions[1], partitions[0]]
    #partitions = [partitions[3], partitions[2], partitions[1]]
elif partitioned_by == "Interval":
    # Disabled alternatives kept for reference:
    #   - order samples by the first population only:
    #       sample_order = data.filter(
    #           pl.col("pops") == "Early founders"
    #       ).sort("admixture").with_row_index().select("index", "Indiv")
    #   - remove reference samples:
    #       data.filter(~(pl.col("Interval") == "Early founders"))

    # Left-to-right order of the panels. Casting to an Enum makes the sort
    # follow this order instead of alphabetical order; the cast fails on any
    # Interval value that is not listed here.
    interval_order = [
        'Early founders',
        'Brooks source',
        'pre-2003',
        '2003-2005',
        '2006-2008',
        '2009-2011',
        '2012-2014',
        '2015-2017',
        '2018-2020',
        'NEPRC source',
        'Offspring of merger',
    ]

    # `sample_order` is the ranking computed in an earlier cell.
    data = data.join(sample_order, on="Indiv", how="left").with_columns(
        pl.col(partitioned_by).cast(pl.Enum(interval_order)),
    ).sort(partitioned_by)

    partitions = data.partition_by(partitioned_by, maintain_order=True)
else:
    # Fail loudly instead of leaving `partitions` undefined, or silently
    # reusing a stale value from an earlier run of this cell.
    raise ValueError(
        f"Unsupported partitioned_by value: {partitioned_by!r}; expected 'Origin' or 'Interval'"
    )

In [10]:
# Panel titles: the partition key of each frame, taken from its first row.
# NOTE(review): null keys are left out, so positions in this list only match
# positions in `partitions` when no partition has a null key.
partition_names = [
    partition_key
    for partition_key in (partition[partitioned_by][0] for partition in partitions)
    if partition_key is not None
]

In [None]:
# List the column names of `data` as this cell's output.
data.columns

In [12]:
# Build one bar-chart panel per partition: one bar per animal, coloured by
# population, with the y axis fixed to [0, 1].
subplots = []

for partition in partitions:
    # Take the title from the partition itself rather than from a separately
    # filtered list of names, so a title can never be paired with the wrong panel.
    panel_title = partition[partitioned_by][0]
    if panel_title is None:
        # Partitions without a key have no title to show; leave them out.
        continue

    # Only the first panel that is actually drawn carries the y-axis decorations.
    if not subplots:
        axis = alt.Axis()
    else:
        axis = alt.Axis(labels=False, title=None, ticks=False)

    subplot = alt.Chart(partition).mark_bar(width=19).encode(
        # Bars are ordered by the precomputed "sample_order" field; the x axis
        # itself is drawn without labels, ticks or title.
        alt.X("Indiv:N", title="Animals", axis=alt.Axis(labels=False, ticks=False, title=None),
              sort=alt.EncodingSortField(field='sample_order', order='ascending')),
        alt.Y("admixture:Q", title="Admixture", axis=axis).scale(domain=[0,1]), #f"k={k}"
        #alt.Column("Origin:N"),
        color=alt.Color("pops:N", title="Population"), #legend=None),
        tooltip="Indiv",
        # tooltip=[
        #     #alt.Tooltip("Indiv", "Indiv"),
        #     alt.Tooltip("pops", title="Pop")
        # ],
    ).properties(
        title=panel_title
    )
    subplots.append(subplot)

In [None]:
# Lay the panels out side by side, apply the category10 colour scheme and
# centre the panel titles. The chart is the last expression of the cell, so
# it is rendered as the cell output.
combined_panels = alt.hconcat(*subplots)
admixture_figure = (
    combined_panels
    .configure_range(category={'scheme': 'category10'})
    .properties(
        #title=f"Supervised Admixture"
    )
    .configure_title(anchor="middle")
)
# To write the figure to disk, append one of these to the chart:
#   .save(f"/master/abagwell/figures/admixture/U42_WES/U42_WES.all2_Indian-Chinese_merged2.admixture.barplot.{k}.html")
#   .save(f"/master/abagwell/figures/admixture/U42_WES/U42_WES.all2.admixture2.barplot.html")
admixture_figure