# Description of feature table

## Setup

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.utils import extract_color
from src.display_meta import display_diet_information_in_one, plot_box_violin
from src.viz_alpha_div import read_and_prep_abx_exposure_data

%load_ext autoreload
%autoreload 2
%matplotlib inline

# notebook-wide matplotlib defaults: font family first, then the colour style
plt.rcParams["font.family"] = "DejaVu Sans"
plt.style.use("tableau-colorblind10")

In [2]:
# USER DEFINED variables
# version tag of the input files; it is part of both input file names and of
# the output folder name built further down
tag = "20240806"
# suffix selecting the feature-table variant; it is part of the feature-table
# file name and of the output folder name
tag_output = "entero_genus"
# directory that contains the input .tsv files
path_to_data = "../data/final/"

# threshold for coloring microbiome samples after abx exposure
# (compared with `<=` against the column `abx_any_last_t_dmonths`, i.e. in months)
th_sample_after_abx_months = 3
# END USER DEFINED variables

In [3]:
# Build all input/output paths from the user-defined variables.

# processed feature table
path_to_ft = os.path.join(path_to_data, f"ft_vat19_anomaly_v{tag}_{tag_output}.tsv")
# time-series of exact abx exposure data
path_to_abx = os.path.join(path_to_data, f"ts_vat19_abx_v{tag}.tsv")

# location to save all outputs of this notebook
path_to_output = os.path.join("../results", f"desc_{tag}_{tag_output}")
# exist_ok=True creates the folder (including parents) only if it is missing,
# which keeps the cell re-runnable without a separate check-then-create step
os.makedirs(path_to_output, exist_ok=True)

## Read datasets


In [4]:
# read and prep abx exposure data
# Loading and preparation are delegated to the project helper from
# src.viz_alpha_div. Later cells merge the result on `host_id` and plot its
# `abx_start_age_months` column.
# NOTE(review): the exact preparation steps are not visible here — confirm in the helper.
abx_df = read_and_prep_abx_exposure_data(path_to_abx)

In [5]:
# Load the feature table (tab-separated, first column used as index), derive
# the helper columns needed by the plots below and order the rows -- all in
# one chain so that `md_df` is produced in a single step.
md_df = (
    pd.read_csv(path_to_ft, sep="\t", index_col=0)
    .assign(
        # flag samples whose `abx_any_last_t_dmonths` is within the threshold
        sample_lt_xm_after_abx=lambda ft: (
            ft["abx_any_last_t_dmonths"] <= th_sample_after_abx_months
        ),
        # per-host maximum of `abx_any_cumcount`, repeated on every sample row
        max_abx_w_microbiome=lambda ft: (
            ft.groupby("host_id")["abx_any_cumcount"].transform("max")
        ),
    )
    # ascending order on every key
    .sort_values(
        by=[
            "abx_max_count_ever",
            "max_abx_w_microbiome",
            "host_id",
            "age_months_rounded05",
        ],
        ascending=True,
    )
)

In [6]:
# Start from the unique hosts in the row order of md_df and left-merge the abx
# events onto them: this orders the events like md_df and discards events of
# hosts that have no row in md_df.
abx_events = pd.DataFrame({"host_id": md_df["host_id"].unique()}).merge(
    abx_df, on="host_id", how="left"
)
# the raw table is not needed any more (re-run the previous cell before
# re-running this one)
del abx_df
# both tables must list the hosts in exactly the same order
assert abx_events.host_id.unique().tolist() == md_df.host_id.unique().tolist()

In [7]:
def filter_both_dfs(md_df, abx_events, condition):
    """Keep only hosts that satisfy `condition`, in both tables.

    Parameters
    ----------
    md_df : pd.DataFrame
        Sample table with a `host_id` column.
    abx_events : pd.DataFrame
        Abx event table with a `host_id` column.
    condition : callable
        Receives the `md_df` rows of one host and returns True if that host
        should be kept (passed on to `groupby("host_id").filter`).

    Returns
    -------
    tuple of pd.DataFrame
        Copies of the rows of `md_df` and of `abx_events` that belong to the
        kept hosts.
    """
    kept_samples = md_df.groupby("host_id").filter(condition).copy()
    kept_hosts = kept_samples.host_id.unique()
    is_kept_event = abx_events["host_id"].isin(kept_hosts)
    kept_events = abx_events[is_kept_event].copy()
    return kept_samples, kept_events


# Hosts whose per-host maximum abx count is zero vs. larger than zero.
md_df_noabx = md_df.loc[md_df["max_abx_w_microbiome"] == 0].copy()
abx_events_noabx = abx_events.loc[
    abx_events["host_id"].isin(md_df_noabx.host_id.unique())
].copy()

md_df_abx = md_df.loc[md_df["max_abx_w_microbiome"] > 0].copy()
# every sample must land in exactly one of the two groups
assert len(md_df_noabx) + len(md_df_abx) == len(md_df)


# Split the abx hosts by the threshold flag: "invisible" hosts have no flagged
# sample at all, "visible" hosts have at least one.
def invisible_condition(x):
    """Return True if none of the host's samples is flagged."""
    return all(~x["sample_lt_xm_after_abx"])


def visible_condition(x):
    """Return True if at least one of the host's samples is flagged."""
    return any(x["sample_lt_xm_after_abx"])


md_df_abx_invisible, abx_events_abx_invisible = filter_both_dfs(
    md_df_abx, abx_events, invisible_condition
)
md_df_abx_visible, abx_events_abx_visible = filter_both_dfs(
    md_df_abx, abx_events, visible_condition
)

# the two sub-groups together must cover all samples of abx hosts
assert len(md_df_abx_invisible) + len(md_df_abx_visible) == len(md_df_abx)

## Visualize overview of samples and abx events

In [None]:
# Share of flagged ("visible" abx) samples among all microbiome samples.
all_samples = len(md_df)
print(f"Count of samples: {all_samples}")
n_flagged = md_df["sample_lt_xm_after_abx"].eq(True).sum()
frac_abx = 100 * n_flagged / all_samples
print(f"Fraction of potentially abx influenced samples: {frac_abx:.1f} %")

# Share of samples with `age_months_rounded1` of at most 24 months.
samples_le_24 = len(md_df.loc[md_df["age_months_rounded1"] <= 24.0])
frac_le_24 = 100 * samples_le_24 / all_samples
print(f"Fraction of samples within first two years of life: {frac_le_24:.1f} %")

In [None]:
hide_ylabel_thickmarks = True  # hiding thickmarks of y-axis for slides
# layout presets: font size, figure size and marker size depend on whether the
# y tick labels are hidden
if hide_ylabel_thickmarks:
    font_size, fig_size, markersize = 6.5, (8, 6), 8
else:
    font_size, fig_size, markersize = 6, (9, 10), 10
plt.rcParams.update({"font.size": font_size})
fig, axs = plt.subplots(1, 3, figsize=fig_size, sharex=True, dpi=400)

# panel title fragment -> [sample table, abx event table]
dic_to_plot = {
    "w/o": [md_df_noabx, abx_events_noabx],
    'with "invisible"': [md_df_abx_invisible, abx_events_abx_invisible],
    "with visible": [md_df_abx_visible, abx_events_abx_visible],
}

for i, (title, df) in enumerate(dic_to_plot.items()):
    samples, events = df
    ax = axs[i]

    # one dot per microbiome sample, coloured by the threshold flag
    sns.scatterplot(
        data=samples,
        x="age_months_rounded05",
        y="host_id",
        hue="sample_lt_xm_after_abx",
        palette={True: "sandybrown", False: "#004587"},
        s=markersize,
        ax=ax,
    )
    # one cross per abx event
    sns.scatterplot(
        data=events,
        x="abx_start_age_months",
        y="host_id",
        marker="x",
        color="darkred",
        s=markersize * 1.5,
        label="abx event",
        ax=ax,
    )

    ax.set_title(f"Infants {title} abx exposure ({samples.host_id.nunique()})")
    ax.set_xlabel("Age [months]")
    # only the left-most panel carries the y-axis label
    ax.set_ylabel("Host ID" if i == 0 else "")
    ax.margins(y=0.005)
    # only the right-most panel keeps its legend
    if i != 2:
        ax.get_legend().remove()
    if hide_ylabel_thickmarks:
        ax.set_yticklabels([])

# place the remaining legend to the right of the last panel
axs[2].legend(
    title=f" sample <={th_sample_after_abx_months}m after abx",
    loc="upper right",
    bbox_to_anchor=(1.60, 1),
)
plt.suptitle("Distribution of microbial samples over time", fontsize=10, y=1.0)
plt.tight_layout()

filename = os.path.join(
    path_to_output, f"overall_distribution_samples_t{hide_ylabel_thickmarks}.pdf"
)
plt.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")
plt.show()

Unique host counts for different thresholds (`"` = same value as in the row above):

| <= `x` m after abx | # no abx | # invisible abx | # visible abx | 
|--------------------|----------|-----------------|---------------|
| 1                  | 140      | 56              | 85            |
| 2                  | "        | 34              | 107           |
| 3                  | "        | 26              | 115           |
| 4                  | "        | 21              | 120           |
| 5                  | "        | 19              | 122           |
| 6                  | "        | 17              | 124           |
| 9                  | "        | 6               | 135           |
| 12                 | "        | 2               | 139           |

## Display diet, delivery mode, sex and geolocation covariates

In [None]:
# diet covariates shown together in one figure, plotted against rounded age
diet_columns = ["diet_weaning", "diet_milk"]
fig_diet, ax_diet = display_diet_information_in_one(
    md_df,
    diet_columns,
    "age_months_rounded1",
    "samples",
    "Distribution of diet covariates over time",
)

# store the figure next to the other outputs of this notebook
filename = os.path.join(path_to_output, "diet_covariates_over_time.pdf")
plt.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")

In [None]:
# number of distinct (host, delivery mode) pairs per delivery mode
(
    md_df.loc[:, ["host_id", "delivery_mode"]]
    .drop_duplicates()
    .groupby("delivery_mode")
    .count()
)

In [None]:
# number of distinct (host, location) pairs per geo location
(
    md_df.loc[:, ["host_id", "geo_location_name"]]
    .drop_duplicates()
    .groupby("geo_location_name")
    .count()
)

In [None]:
# number of distinct (host, sex) pairs per sex
(
    md_df.loc[:, ["host_id", "sex"]]
    .drop_duplicates()
    .groupby("sex")
    .count()
)

## Visualize distribution of samples available 

In [None]:
plt.rcParams.update({"font.family": "DejaVu Sans", "font.size": 8})

fig, axs = plt.subplots(1, 3, figsize=(7, 3), dpi=400)
title_fontsize = 12
ylabel_fontsize = 10

# per-host view on the sampling ages; all three statistics derive from it
age_by_host = md_df.groupby("host_id")["age_days"]
# 1) Number of samples per host
samples_per_host = age_by_host.count()
# 2) Duration between consecutive samples of the same host
days_between = age_by_host.apply(lambda x: x.sort_values().diff()).dropna()
# 3) Age at the first sample of each host
first_sample = age_by_host.min()

c1 = extract_color("tableau-colorblind10", 2)
c2 = "#8e4585"  # plum
c3 = extract_color("tableau-colorblind10", 4)

# one entry per panel: (printed header, values, colour, panel title, y label)
panels = [
    (
        "Number of samples per host:",
        samples_per_host,
        c1,
        "Sample count per host",
        "# samples",
    ),
    ("Duration between samples:", days_between, c2, "Duration between samples", "Days"),
    ("First sample distribution:", first_sample, c3, "First sample", "Age [days]"),
]
for ax, (header, values, color, panel_title, ylabel) in zip(axs, panels):
    # print the summary statistics, then draw the matching panel
    print(header)
    print(values.describe())
    plot_box_violin(values, color, ax)
    ax.set_title(panel_title, fontsize=title_fontsize, y=1.01)
    ax.set_ylabel(ylabel, fontsize=ylabel_fontsize)
    ax.set_ylim(bottom=0)

plt.tight_layout()
filename = os.path.join(path_to_output, "sample_characteristics.pdf")
plt.savefig(filename, dpi=400, bbox_inches="tight", format="pdf")
plt.show()

In [None]:
plt.rcParams.update({"font.family": "DejaVu Sans", "font.size": 8})
fig, axs = plt.subplots(1, 3, figsize=(6, 3), sharey=True, dpi=300)

# panel title -> sample table shown in that panel
dic_to_plot = {
    "w/o abx": md_df_noabx,
    "w abx": md_df_abx,
    "all": md_df,
    # 'with "invisible" abx': md_df_abx_invisible,
    # "with visible abx": md_df_abx_visible,
}
# all panels share the same colour
c = extract_color("tableau-colorblind10", 0)

for i, (title, df) in enumerate(dic_to_plot.items()):
    # number of samples of each host within this group
    samples_per_host = df.groupby("host_id")["age_days"].count()
    panel = axs[i]
    plot_box_violin(samples_per_host, c, panel)

    panel.set_title(title)
    panel.set_ylabel("")
    panel.set_ylim(bottom=0)

# y label on the left-most panel only
axs[0].set_ylabel("# samples", fontsize=10)
plt.suptitle("Number of samples per host", fontsize=12, y=1.0)
plt.tight_layout()
filename = os.path.join(path_to_output, "nb_samples_per_host.png")
plt.savefig(filename, dpi=400, bbox_inches="tight")
plt.show()

## Visualize 1st, 2nd and 3rd abx exposure

In [None]:
# For the 1st, 2nd and 3rd abx exposure: how many abx hosts have a sample that
# is flagged as taken within the threshold after that exposure, and how old
# were the hosts at that sample?
ls_cols = [
    "host_id",
    "age_months_rounded05",
    "abx_any_cumcount",
    "sample_lt_xm_after_abx",
    "abx_any_last_t_dmonths",
]
all_abx = md_df_abx.host_id.nunique()
# colour of the box/violin plots; defined here so that this cell does not
# depend on a loop variable left over from an earlier plotting cell
c = extract_color("tableau-colorblind10", 0)
# ordinal used in the figure titles
ordinal_by_count = {1: "1st", 2: "2nd", 3: "3rd"}

for i in [1.0, 2.0, 3.0]:
    # Samples with a cumulative abx count of `i` that are flagged as taken
    # within the threshold. Both conditions are evaluated on md_df_abx so the
    # mask has exactly the index of the frame it selects from (no implicit
    # alignment with the smaller md_df_abx_visible). The selected rows are the
    # same: hosts outside the visible group have no flagged sample.
    first_abx_exp_sample = (md_df_abx["abx_any_cumcount"] == i) & (
        md_df_abx["sample_lt_xm_after_abx"] == True
    )
    # number of hosts with at least one such sample
    num_hosts = md_df_abx.loc[
        first_abx_exp_sample,
        "host_id",
    ].nunique()

    print(
        f"Of the \033[1m{all_abx}\033[0m hosts with abx exposure,"
        f" \033[1m{round(100*num_hosts/all_abx,1)} % ({num_hosts}\033[0m)"
        f" have a sample <= {th_sample_after_abx_months} months after {i}-th abx exposure."
    )

    # What's the average age of hosts at nth abx exposure?
    fig, ax = plt.subplots(figsize=(6, 1), dpi=400)

    # first matching sample per host (rows keep the order of md_df_abx)
    first_abx = (
        md_df_abx.loc[first_abx_exp_sample, ["host_id", "age_months_rounded05"]]
        .groupby("host_id")
        .first()
    )
    print(f"Mean age: {first_abx['age_months_rounded05'].mean():.2f} months")
    plot_box_violin(first_abx["age_months_rounded05"], c, ax, horizontal=True)
    ax.set_xlabel("Age [months]", fontsize=6)
    ax.tick_params(axis="x", labelsize=6)
    # same x range in every figure so the three exposures can be compared
    ax.set_xlim(-0.5, 38.5)
    suff = ordinal_by_count[int(i)]
    ax.set_title(f"Age at {suff} abx exposure", fontsize=7)
    plt.show()