In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

# Chromosomes to include in the analysis (IDs are strings, matching the keys of contig_lengths below)
#chromosomes = ["1", "3", "5","18"]
chromosomes = [str(number) for number in range(1, 21)]

# Define path to files below
home = '/master/abagwell'
#samples_list = f"{home}/variant-analysis/resources/rhesus/samples/runs.WGS.U42.list"
samples_list = f"{home}/variant-analysis/resources/rhesus/samples/runs.WGS_WES.U42.list"
contig_length = f"{home}/variant-analysis/results/rhesus/relatedness/roh/contig_lengths.tsv"

# Create list of sample names: keep the first two "_"-separated fields of every entry
sample_names = []
with open(samples_list, "r") as f:
    for line in f:
        entry = line.strip()
        # A blank line would otherwise turn into an empty sample name
        if not entry:
            continue
        sample_names.append("_".join(entry.split("_")[:2]))

# Create contig lists
contigs = pd.read_table(contig_length, names=["chr", "length"], index_col="chr")
# pandas parses an all-numeric name column as integers; the lookups below use string IDs,
# so normalise the index to str to keep `contig_lengths["1"]` and the membership test working
contigs.index = contigs.index.astype(str)
contig_lengths = contigs["length"].to_dict()
chrom_length = contig_lengths["1"] #contigs_lengths.iloc[0]

# Find total length of the selected chromosomes
total_length = sum(
    length for contig, length in contig_lengths.items() if contig in chromosomes
)

In [None]:
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

# Labels for the kinship matrix; individuals must be listed in the same order as in the matrix
kinship_samples = []
kinship_csv = "/master/abagwell/variant-analysis/resources/rhesus/pedigree/kinship_matrix.csv"
kinship_matrix = np.genfromtxt(kinship_csv, delimiter=",")

# for sample in kinship_samples:
#     if f"WGS{sample}" in sample_names:
#         sample_labels.append(f"WGS{sample}")
#     elif f"WES{sample}" in sample_names:
#         sample_labels.append(f"WES{sample}")
#     else:
#         print("ID not in samples.")

# Every individual is identical to itself
np.fill_diagonal(kinship_matrix, 1)

# Pairs with no recorded kinship get a small positive value (smaller coefficient = less related)
# so that the logarithm below stays finite; 0.0001 is meant to be below any recorded value
kinship_matrix[kinship_matrix == 0] = 0.0001

# |log2(kinship)| turns the coefficients into a distance-like scale:
# 0 for identical pairs, growing as pairs become more distantly related
kinship_matrix = np.abs(np.log2(kinship_matrix))
print(kinship_matrix)

# Plot dendrogram (single linkage on euclidean distances between matrix rows)
sns.set_theme()
hierarchy.set_link_color_palette(['m', 'c', 'crimson', 'orange'])
row_distances = pdist(kinship_matrix, "euclidean")
Z = hierarchy.linkage(row_distances, 'single')
plt.figure(figsize=(8,15))
dn = hierarchy.dendrogram(
    Z,
    orientation='left',
    labels=kinship_samples,
    color_threshold=20,
    above_threshold_color='black',
)

# Leaf colours and leaf label order, reused when drawing the fROH bars
colors = dn["leaves_color_list"]
leaves = dn["ivl"]  #dn["leaves"]

In [None]:
# Only run this cell when samples are NOT grouped by kinship: it replaces `leaves` and `colors`
# with a plain sorted sample order and a single bar colour.
# It also rewrites `sample_names` in place, so it cannot be re-run: once the part before "/" has
# been removed, `split("/")[1]` raises IndexError.
trimmed_names = sorted({name[3:] for name in sample_names})
# Entries include a batch name separated by "/"; keep what follows it, then the first two "_" fields
leaves = ["_".join(entry.split("/")[1].split("_")[:2]) for entry in trimmed_names]
sample_names = [name.split("/")[1] for name in sample_names]
colors = ['teal'] * len(leaves)

In [None]:
# Attempting with polars instead of pandas
chromosomes = [str(number) for number in range(1, 21)]
# Chromosome lengths in bp for Mmul_10, in the same order as `chromosomes`
chromosome_lengths = [223_616_942, 196_197_964, 185_288_947, 169_963_040, 187_317_192, 179_085_566, 169_868_564, 145_679_320, 134_124_166, 99_517_758, 133_066_086, 130_043_856, 108_737_130, 128_056_306, 113_283_604, 79_627_064, 95_433_459, 74_474_043, 58_315_233, 77_137_495]  # For Mmul_10
chrom_df = pl.DataFrame({
    "chrom": chromosomes,
    "chrom_len": chromosome_lengths
})

# Load one per-chromosome fROH table per entry in `chromosomes`.
# CAUTION: unpickling executes code stored in the file, so only read pickles produced by
# this project's own pipeline.
froh_dfs = [
    pl.from_pandas(
        pd.read_pickle(
            f"{home}/variant-analysis/results/rhesus/relatedness/roh/merged/U42_WGS_WES.SNP.chr{chrom}.froh_poisson.pickle"
        )
    )
    for chrom in chromosomes
]

In [None]:
# Find genome-wide fROH per sample from the per-chromosome values
genome_length = sum(chromosome_lengths)

# Attach each chromosome's length to its per-chromosome fROH rows
per_chrom_froh = pl.concat(froh_dfs).join(chrom_df, on="chrom")

# Length in ROH on each chromosome = chromosome length x per-chromosome fROH
per_chrom_roh_len = per_chrom_froh.with_columns(
    (pl.col("chrom_len") * pl.col("froh")).alias("roh_len")
)

froh_df = (
    per_chrom_roh_len
    # Sum ROH lengths over chromosomes for every sample
    .group_by("sample")
    .agg(pl.col("roh_len").sum())
    # Fraction of the genome in ROH
    .with_columns((pl.col("roh_len") / pl.lit(genome_length)).alias("froh"))
)
froh_df

In [None]:
# Colony assignments
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(colonies_file, separator="\t", infer_schema_length=None)

# Animal id = first "_"-separated field of the sample name with its first three characters removed
sample_to_id = pl.col("sample").str.split("_").list.get(0).str.slice(3).alias("id")

froh_with_id = froh_df.with_columns(sample_to_id)
# Left join keeps every sample; drop_nulls then removes every row that has a null in any column
froh_df_by_colony = froh_with_id.join(
    colonies, how="left", left_on="id", right_on="Id"
).drop_nulls()
#froh_df_by_colony

# merged = colony_demographics.join(Q, how="left", left_on="Id", right_on="#sample"
# ).group_by("Year", "Colony").agg(pl.count("Id").alias("Count"), pl.mean("Indian"), pl.mean("Chinese")).drop_nulls()

In [None]:
# Plot Altair boxplot
import altair as alt

# One box per colony showing the distribution of genome-wide fROH, coloured by colony
froh_boxplot = (
    alt.Chart(froh_df_by_colony)
    .mark_boxplot()
    .encode(
        x=alt.X("Colony:N"),
        y=alt.Y("froh:Q", title="fROH"),
        color=alt.Color("Colony:N"),
    )
    .properties(
        #width=100
    )
)
froh_boxplot

In [None]:
## Seaborn boxplot to show the fraction of the genome in ROH, one box per colony

box_options = dict(
    data=froh_df_by_colony,
    x="Colony",
    y="froh",
    hue="Colony",
    kind="box",
    aspect=0.6,
)
axis_text = dict(
    ylabel="Fraction of Genome",
    title="Extent of ROHs in\nRhesus Macaque Populations",
)
g = sns.catplot(**box_options).set(**axis_text)

In [None]:
# Convert the polars frame to pandas; later cells filter froh_pandas with pandas boolean masks
froh_pandas = froh_df_by_colony.to_pandas()

In [None]:
# Display the converted frame for inspection
froh_pandas

In [None]:
# Spot-check the genome-wide fROH of one animal.
# The column is selected by name so the lookup does not depend on column order;
# raises IndexError if the id is not present.
float(froh_pandas.loc[froh_pandas["id"] == "30009", "froh"].iloc[0])

In [None]:
## Generate figure with dendrogram (left) and per-individual fROH bars (right)

fig, ax = plt.subplots(1, 2, figsize=(10, 30)) #height_ratios=[height, 2])
#fig.suptitle(f"FROH")
fig.tight_layout()
plt.subplots_adjust(hspace=0.1, wspace=0.27)

dnn = hierarchy.dendrogram(Z, ax=ax[0], orientation='left', labels=kinship_samples, no_labels=True, color_threshold=20, above_threshold_color='black')

# fROH per animal id, looked up by column name. If an id occurs more than once, the first row
# is used. `froh` is already a fraction of the genome, so it is plotted as-is and must not be
# divided by the genome length again.
froh_by_id = froh_pandas.drop_duplicates(subset="id").set_index("id")["froh"]

# Individuals without an fROH row still get a (zero-width) bar so that the bars stay aligned
# with the dendrogram leaves; report them instead of failing with an IndexError.
missing = [indiv for indiv in leaves if indiv not in froh_by_id.index]
if missing:
    print(f"No fROH value for {len(missing)} individual(s); drawing empty bars: {missing}")

for indiv, color in zip(leaves, colors):
    ax[1].barh(indiv, froh_by_id.get(indiv, 0.0), color=color)
ax[1].set_xlabel("fROH")
plt.margins(y=0)

In [None]:
# Collect individual ids to submit in TAC to obtain the kinship matrix.
# Id = text after the last "/", up to the first "_", with its first three characters removed.
tac_ids = {name.split("/")[-1].split("_")[0][3:] for name in sample_names}
for tac_id in sorted(tac_ids):
    print(tac_id, end=",")

In [None]:
# regions_file = "/master/abagwell/variant-analysis/results/rhesus/coverage/common_WES_0.5_loci.bed"
# pl.read_csv(regions_file, separator="\t", has_header=False, new_columns=["chrom", "start", "stop"], dtypes=[pl.String, pl.Int32, pl.Int32])

In [None]:
# Altair. Using output from `bcftools roh`
import altair as alt
import pandas as pd
import polars as pl

alt.data_transformers.disable_max_rows()

# Full chromosome set; swap these in for the two-chromosome subset below to plot everything
#chromosomes = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"]
#chromosome_lengths = [223_616_942, 196_197_964, 185_288_947, 169_963_040, 187_317_192, 179_085_566, 169_868_564, 145_679_320, 134_124_166, 99_517_758, 133_066_086, 130_043_856, 108_737_130, 128_056_306, 113_283_604, 79_627_064, 95_433_459, 74_474_043, 58_315_233, 77_137_495]  # For Mmul_10

# Parallel lists: chromosome_lengths[i] is the length (bp) of chromosomes[i]
chromosomes = ["1", "2"]
chromosome_lengths = [223_616_942, 196_197_964]  # For Mmul_10

home = '/master/abagwell'
samples_list = f"{home}/variant-analysis/resources/rhesus/samples/runs.WGS.U42.list"
contig_length = f"{home}/variant-analysis/results/rhesus/relatedness/roh/contig_lengths.tsv"

# Only ROHs strictly longer than this many bp are plotted
min_roh_length = 5_000_000
# Spacing (bp) of the positions at which ROHs are counted for the bar plots
position_step = 1_000


# Read colony info
colonies_file = "/master/abagwell/variant-analysis/resources/rhesus/pop/colonies.tsv"
colonies = pl.read_csv(colonies_file, separator="\t", infer_schema_length=None)

# Create list of sample names: drop the part before "/", keep the first two "_" fields
# NOTE(review): this is the WGS-only sample list while the ROH files below are named
# U42_WGS_WES — confirm the placeholder rows are meant to cover only these samples.
sample_names = []
with open(samples_list, "r") as f:
    for line in f:
        entry = line.strip()
        # Skip blank lines, which have no "/" to split on
        if not entry:
            continue
        sample_names.append("_".join(entry.split("/")[1].split("_")[0:2]))


plot_list = []
columns = []
for idx, (chrom, chrom_len) in enumerate(zip(chromosomes, chromosome_lengths)):
    # Use the length paired with this chromosome rather than indexing by chromosome number,
    # which only works when `chromosomes` is exactly 1..N in order
    chrom_len_mb = chrom_len / 1_000_000
    plot_width = chrom_len / 1500000

    column_plots = []
    # Only keep y-axis labels on leftmost plot
    roh_axis = alt.Axis(tickSize=0) if idx == 0 else None

    # Output file from `bcftools roh`
    rg_file = f"/master/abagwell/variant-analysis/results/rhesus/roh/SHAPEIT5_WGS/U42_WGS_WES.SNP.chr{chrom}.RG.roh"
    rg_df = pl.read_csv(rg_file, comment_prefix="#", has_header=False, separator="\t",
        new_columns=["tag", "sample", "chrom", "start", "end", "length", "num_of_markers", "quality"],
        dtypes=[pl.String, pl.String, pl.String, pl.Int32, pl.Int32, pl.Int32, pl.Int32, pl.Float64]
    ).filter(
        pl.col("length") > min_roh_length
    ).with_columns(
        # Animal id = first "_" field of the sample name without its first three characters
        pl.col("sample").str.split("_").list.get(0).str.slice(3).alias("animal")
    ).join(colonies, how="left", left_on="animal", right_on="Id")

    # Add a zero-length placeholder row for each sample to guarantee that it shows up on the
    # y-axis even if it has no ROH. Rows are built by column name and appended in one concat
    # instead of growing the frame one row at a time.
    # NOTE(review): the placeholder Colony value 1 is carried over unchanged — confirm it is
    # the intended colony for placeholder rows.
    rg_pandas = rg_df.to_pandas()
    dummy_rows = pd.DataFrame(
        [
            {
                "tag": "RG", "sample": sample, "chrom": chrom,
                "start": 0, "end": 0, "length": 0, "num_of_markers": 0, "quality": 0,
                "animal": "", "Colony": 1,
            }
            for sample in sample_names
        ],
        columns=rg_pandas.columns,
    )
    if not dummy_rows.empty:
        rg_pandas = pd.concat([rg_pandas, dummy_rows], ignore_index=True)
    rg_df_with_dummy = pl.from_pandas(rg_pandas)


    # One horizontal bar per ROH, one row per sample, x in Mb
    roh_plot = alt.Chart(rg_df_with_dummy).mark_bar().encode(
        alt.X("mb_start_pos:Q").title(["Position", "(Mb)"]).scale(domainMin=0, domainMax=chrom_len_mb, clamp=True),
        alt.X2("mb_stop_pos:Q"), #.title("Stop"),
        alt.Y("sample", axis=roh_axis).title("Sample"),
        color = alt.Color("Colony:N"),  # .scale(range=['#1f77b4', 'orange', '#9F2B68'])
        #color = "orange",
        tooltip=[
            alt.Tooltip("chrom", title="Chr"),
            alt.Tooltip("start", title="Start (bp)"),
            alt.Tooltip("end", title="End (bp)"),
            alt.Tooltip("length", title="Length"),
        ]
    ).properties(
        width=plot_width,
        height=alt.Step(10),
        #width=300, # For when only displaying one chromosome
        title=["RoH", f"chr{chrom}"]
    ).transform_calculate(
        mb_start_pos = 'datum.start / 1000000',
        mb_stop_pos = 'datum.end / 1000000',
    )
    plot_list.append(roh_plot)
    column_plots.append(roh_plot)

    # Create df for bar plot: number of ROHs covering each sampled position, per colony.
    # Each ROH's start is floored to a multiple of `position_step` so that all ROHs are sampled
    # on the same grid; otherwise overlapping ROHs with different starts never share a position
    # and are not counted together.
    bar_df = rg_df.with_columns(
        pl.int_ranges(
            (pl.col("start") // position_step) * position_step,
            pl.col("end"),
            position_step,
        ).alias("position")
        ).explode("position").group_by("chrom", "position", "Colony").agg(pl.len())

    # Create bar plot, one row of bars per colony
    bar_plot = alt.Chart(bar_df).mark_bar(size=1).encode(
        alt.X("mb_pos:Q").title(["Position", "(Mb)"]).scale(domainMin=0, domainMax=chrom_len_mb, clamp=True),
        alt.Y("len").title("Count").scale(domainMin=0, domainMax=bar_df["len"].max()),
        alt.Row("Colony:N", title=None),
        color=alt.Color("Colony:N"),
        ).properties(
            width=plot_width,
            height=70,
            #width=300, # For when only displaying one chromosome
            title=["RoH", f"chr{chrom}"]
        ).transform_calculate(
            mb_pos = 'datum.position / 1000000',
        )

    column_plots.append(bar_plot)
    # Stack this chromosome's ROH plot above its bar plots
    columns.append(alt.vconcat(*column_plots))



# (plot_list[0] | plot_list[1] | plot_list[2] | plot_list[16] | plot_list[19]).properties(
#     title="Runs of Homozygosity",
# )
# Lay the per-chromosome columns out side by side
alt.hconcat(*columns)


In [None]:
# Total ROH length per sample.
# NOTE(review): rg_df_with_dummy is whatever the ROH plotting cell left behind from its final
# loop iteration, so these totals appear to cover only the last chromosome plotted — confirm
# that this is intended.
total_len_roh = rg_df_with_dummy.group_by("sample").agg(pl.sum("length")).sort("sample")

total_roh = alt.Chart(total_len_roh).mark_bar().encode(
    alt.X("length:Q"),
    alt.Y("sample:N"),
)
# Display the chart; an assignment alone renders nothing
total_roh

In [None]:
# Display bar_df as left by the ROH plotting cell; it is rebuilt inside that cell's
# per-chromosome loop, so this shows the last chromosome processed
bar_df