# Explore alpha diversity in dataset


## Setup

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from src.viz_alpha_div import (
    assign_columns_for_plots,
    lineplot_all_div_metrics_over_time,
    read_and_prep_abx_exposure_data,
)

%load_ext autoreload
%autoreload 2
%matplotlib inline

plt.rcParams.update({"font.family": "DejaVu Sans"})
plt.style.use("tableau-colorblind10")

In [None]:
# ---- USER DEFINED variables -------------------------------------------------
# Directory holding the input tables.
path_to_data = "../data/final/"
# Version tag embedded in the input file names (`..._v{tag}...`).
tag = "20240806"
# Suffix of the feature table file name; also part of the output folder name.
tag_output = "entero_genus"
# ---- END USER DEFINED variables ---------------------------------------------

In [None]:
# Input files: the feature table is named after both the dataset version
# (`tag`) and the output tag (`tag_output`); the abx table only after `tag`.
path_to_ft = os.path.join(path_to_data, f"ft_vat19_anomaly_v{tag}_{tag_output}.tsv")
path_to_abx = os.path.join(path_to_data, f"ts_vat19_abx_v{tag}.tsv")

# Location to save all outputs of this notebook. `exist_ok=True` makes the
# creation idempotent without a separate (race-prone) existence check.
path_to_output = os.path.join("../results", f"desc_{tag}_{tag_output}")
os.makedirs(path_to_output, exist_ok=True)

## Read datasets

In [None]:
# Antibiotic exposure table, loaded and prepared by the project helper.
abx_df = read_and_prep_abx_exposure_data(path_to_abx)

# Tab-separated feature table, passed through the project helper that adds
# the columns used for plotting.
md_df = assign_columns_for_plots(pd.read_csv(path_to_ft, sep="\t"))

# Binary abx flag: "yes" where max_abx_w_microbiome > 0, otherwise "no"
# (missing values compare as False and therefore end up as "no").
md_df["abx"] = np.where(md_df["max_abx_w_microbiome"] > 0.0, "yes", "no")

In [None]:
# Split the samples by their max_abx_w_microbiome value: exactly 0 -> noabx,
# strictly positive -> abx.
is_noabx_sample = md_df["max_abx_w_microbiome"] == 0.0
is_abx_sample = md_df["max_abx_w_microbiome"] > 0.0

noabx = md_df[is_noabx_sample]
hosts_noabx = noabx["host_id"].unique()
print(f"Number of unique hosts w/o abx: {noabx.host_id.nunique()}")

abx = md_df[is_abx_sample]
hosts_abx = abx["host_id"].unique()
print(f"Number of unique hosts with abx: {abx.host_id.nunique()}")

# Keep only exposure records of hosts that appear in the abx cohort above.
# Per the original note, this removes hosts whose abx courses only happened
# after sample collection.
abx_df = abx_df.loc[abx_df["host_id"].isin(hosts_abx)].copy()
print(
    f"Number of unique hosts with abx and microbial samples after: {abx_df.host_id.nunique()}"
)

## Diversity in all noabx infants over time

In [None]:
# Boxplot of one alpha diversity metric per (integer) month of age, for the
# samples of the noabx cohort only. The figure is saved as a PDF into
# `path_to_output` and shown inline.
metric = "div_alpha_faith_pd"
x_axis = "age_months_rounded1"
title = "Alpha diversity over age in infants w/o abx exposure"
fig, axs = plt.subplots(1, 1, figsize=(12, 5), dpi=400)

# `noabx` is a subset selected from `md_df`; assigning a column on it in place
# triggers pandas' SettingWithCopyWarning. `assign` builds a new frame instead,
# with the age column cast to int so the category labels read "3", not "3.0".
noabx = noabx.assign(**{x_axis: noabx[x_axis].astype(int)})
sns.boxplot(x=x_axis, y=metric, data=noabx, ax=axs, color="lightblue")

axs.set_xticklabels(axs.get_xticklabels(), rotation=0, ha="center", fontsize=8)
# Short metric name without the common prefix (not used by this cell itself).
metric_name = metric.replace("div_alpha_", "")
axs.set_xlabel("Age [months]")
axs.set_ylabel("Faith PD")
axs.set_title(title)
plt.tight_layout()

filename = os.path.join(path_to_output, "alpha_noabx_boxplot_over_age.pdf")
print(filename)
fig.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")
plt.show()

## Diversity in all infants split by covariates

In [None]:
# Binary abx flag: "yes" where abx_max_count_ever > 0, otherwise "no".
# NOTE(review): this overwrites the "abx" column that was derived from
# "max_abx_w_microbiome" when the datasets were read - confirm that switching
# to "abx_max_count_ever" here is intended.
md_df["abx"] = np.where(md_df["abx_max_count_ever"] > 0.0, "yes", "no")

In [None]:
# Plot the chosen diversity metric over time once per covariate listed in
# `group_by_values`, using the project helper. `x_axis` is not set in this
# cell; it keeps the value assigned in the boxplot cell above
# ("age_months_rounded1"). `path_to_output` is handed to the helper as well.
metric = "div_alpha_faith_pd"
group_by_values = [
    "delivery_mode",
    "diet_milk",
    "diet_weaning",
    "abx",
    # "geo_location_name",
    # "sex",
]
lineplot_all_div_metrics_over_time(
    md_df, metric, x_axis, group_by_values, path_to_output
)

## Number of abx courses in cohort

In [None]:
# Bar chart of the number of abx courses starting at each month of age.
min_age = 0
abx_df["abx_start_age_months_rounded1"] = abx_df["abx_start_age_months"].round()

# The age axis runs up to the later of: oldest sample age, latest abx start.
oldest_sample_age = md_df["age_months_rounded1"].max()
latest_abx_start = abx_df["abx_start_age_months_rounded1"].max()
max_age = int(max(oldest_sample_age, latest_abx_start))

# Courses per start month; months without any course get an explicit 0 so
# every month in the range has a bar slot.
all_ages = range(min_age, max_age + 1)
df_grouped = (
    abx_df.groupby("abx_start_age_months_rounded1")
    .size()
    .reindex(all_ages, fill_value=0)
)

fig, ax = plt.subplots(1, 1, figsize=(5, 3), dpi=400)
df_grouped.plot.bar(color="darkred", ax=ax)

# Show only every 10th month on the x axis.
ax.set_xlim(min_age - 0.5, max_age + 0.5)
ax.set_xticks(range(min_age, max_age + 1, 10))
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha="center", fontsize=8)
ax.set_xlabel("Age [months]")
ax.set_ylabel("# abx courses")
fig.suptitle("Number of abx courses over time", fontsize=12, y=0.95)
plt.tight_layout()

filename = os.path.join(path_to_output, "nb_abx_courses_over_time.pdf")
print(filename)
plt.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")
plt.show()

## Number of abx courses per host

In [None]:
# Keep the first row of every distinct (host_id, max_abx_w_microbiome) pair.
# NOTE(review): the previous comment named "abx_max_count_ever", but the code
# deduplicates on "max_abx_w_microbiome" - confirm which column is intended.
md_df_unique = md_df.drop_duplicates(["host_id", "max_abx_w_microbiome"]).copy()

In [None]:
# Share of abx-exposed hosts (max_abx_w_microbiome > 0) whose value lies
# between 1 and 3 courses, both bounds included.
n_courses = md_df_unique["max_abx_w_microbiome"]

hosts_w_abx = md_df_unique.loc[n_courses > 0.0, "host_id"].nunique()
hosts_w_abx_le3 = md_df_unique.loc[n_courses.between(1.0, 3.0), "host_id"].nunique()
frac_le3 = 100 * (hosts_w_abx_le3 / hosts_w_abx)

print(f"Fraction of hosts with 1-3 abx courses: {frac_le3:.1f} %")

In [None]:
# Bar chart: number of rows of `md_df_unique` (one per distinct
# host_id / max_abx_w_microbiome pair) for each max_abx_w_microbiome value.
# The figure is saved as a PDF into `path_to_output` and shown inline.
fig, ax = plt.subplots(1, 1, figsize=(5, 3), dpi=400)
hosts_per_count = md_df_unique.groupby("max_abx_w_microbiome")["host_id"].count()
hosts_per_count.plot(kind="bar", ax=ax, color="darkred")

# Annotate each bar with its value:
for container in ax.containers:
    ax.bar_label(container, label_type="edge", fontsize=6)

# NOTE(review): fixed y range - bars taller than 150 would be clipped.
ax.set_ylim(0, 150)
# Label the bars with the actual course counts taken from the grouped index.
# The tick *positions* of a bar plot are just 0..N-1, so using them as labels
# is only right when the counts happen to be exactly 0..N-1. The "g" format
# prints whole-number floats without the trailing ".0" (1.0 -> "1").
ax.set_xticklabels(
    [f"{n_courses:g}" for n_courses in hosts_per_count.index],
    rotation=0,
    ha="center",
    fontsize=8,
)
ax.set_ylabel("# hosts")
ax.set_xlabel("Total number of abx courses")
ax.set_title("Number of abx courses per host")
plt.tight_layout()
filename = os.path.join(path_to_output, "nb_abx_courses_per_host.pdf")
print(filename)
plt.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")
plt.show()

## Number of abx courses per sample

In [None]:
# Keep only the samples whose abx_any_last_t_dmonths value is at most 1.0
# (presumably: taken within one month of the last abx course - TODO confirm),
# printing the table shape before and after the filter.
cols_to_keep = ["host_id", "abx_any_cumcount"]
is_recent = md_df["abx_any_last_t_dmonths"] <= 1.0

print(md_df.shape)
md_df_s = md_df.loc[is_recent].copy()
print(md_df_s.shape)

# Reduce to the two columns needed for the per-sample count plot.
md_df_s = md_df_s.loc[:, cols_to_keep].copy()
md_df_s.head()

In [None]:
# Bar chart: number of retained samples for each abx_any_cumcount value.
# After reset_index() the former row index is a regular column named "index",
# which is counted per group.
md_df_sp = md_df_s.reset_index()
samples_per_cumcount = md_df_sp.groupby("abx_any_cumcount")["index"].count()

fig, ax = plt.subplots(1, 1, figsize=(9, 3), dpi=400)
samples_per_cumcount.plot(kind="bar", ax=ax)

ax.set(
    ylabel="Number of samples",
    xlabel="Cum count of abx courses (with matching samples 1 month after)",
    title="Cum count of abx courses per 1-month sample",
)
plt.tight_layout()
# This figure is only displayed, not written to disk.
plt.show()