# Explore alpha diversity in dataset


## Setup

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FixedLocator
from src.viz_alpha_div import (
    assign_columns_for_plots,
    boxplot_all_div_metrics_over_time,
    lineplot_all_div_metrics_over_time,
    read_and_prep_abx_exposure_data,
)

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Global plot appearance: set the font family, then apply the tableau-colorblind10 style sheet.
plt.rcParams["font.family"] = "DejaVu Sans"
plt.style.use("tableau-colorblind10")

In [2]:
# USER DEFINED variables
# Version tag embedded in both input file names and in the output folder name.
tag = "20240806"
# Suffix of the feature-table file name; also part of the output folder name.
tag_output = "entero_genus"
# Folder that holds the input .tsv files (relative path).
path_to_data = "../data/final/"

# END USER DEFINED variables

In [3]:
# Input files: both names are built from the user-defined tag(s).
path_to_ft = os.path.join(path_to_data, f"ft_vat19_anomaly_v{tag}_{tag_output}.tsv")
path_to_abx = os.path.join(path_to_data, f"ts_vat19_abx_v{tag}.tsv")

# Location to save all outputs of this notebook.
path_to_output = os.path.join("../results", f"desc_{tag}_{tag_output}")
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(path_to_output, exist_ok=True)

## Read datasets

In [4]:
# Antibiotic exposure table, loaded through the project helper.
abx_df = read_and_prep_abx_exposure_data(path_to_abx)

# Feature table read from the tab-separated file, then passed through
# assign_columns_for_plots.
md_df = assign_columns_for_plots(pd.read_csv(path_to_ft, sep="\t"))

# Simple abx flag: "yes" where max_abx_w_microbiome is above zero, "no" otherwise.
md_df["abx"] = np.where(md_df["max_abx_w_microbiome"] > 0.0, "yes", "no")

# Alpha diversity columns used by the plots below.
div_metrics = [
    "div_alpha_faith_pd",
    "div_alpha_observed_features",
    "div_alpha_shannon",
]

In [None]:
# Rows whose max_abx_w_microbiome is exactly zero form the no-abx cohort.
is_noabx = md_df["max_abx_w_microbiome"].eq(0.0)
noabx = md_df[is_noabx]
hosts_noabx = noabx["host_id"].unique()
n_hosts_noabx = noabx["host_id"].nunique()
print(f"Number of unique hosts w/o abx: {n_hosts_noabx}")

# Rows whose max_abx_w_microbiome is above zero form the abx cohort.
is_abx = md_df["max_abx_w_microbiome"].gt(0.0)
abx = md_df[is_abx]
hosts_abx = abx["host_id"].unique()
n_hosts_abx = abx["host_id"].nunique()
print(f"Number of unique hosts with abx: {n_hosts_abx}")

# Keep only exposure records of hosts in the abx cohort; this drops hosts whose
# abx courses all happened after sample collection.
in_abx_cohort = abx_df["host_id"].isin(hosts_abx)
abx_df = abx_df[in_abx_cohort].copy()
n_hosts_abx_df = abx_df["host_id"].nunique()
print(
    f"Number of unique hosts with abx and microbial samples after: {n_hosts_abx_df}"
)

## Diversity in all noabx infants over time

In [None]:
# Column used as the time axis; later plotting cells reuse x_axis.
x_axis = "age_months_rounded1"
# Project helper from src.viz_alpha_div. It receives the no-abx rows, the list of
# diversity metrics, the time-axis column and the output folder.
# NOTE(review): presumably draws one boxplot per metric and saves it — confirm in the helper.
boxplot_all_div_metrics_over_time(noabx, div_metrics, x_axis, path_to_output)

## Diversity in all noabx infants split by covariates

In [None]:
# Single diversity metric plotted here; the abx/no-abx cell further down reuses it.
metric = "div_alpha_faith_pd"
# Columns handed to the helper as grouping variables (the covariates of this section).
group_by_values = [
    "delivery_mode",
    "diet_milk_s",
    "diet_weaning",
    "geo_location_name",
    "sex",
]
# Project helper from src.viz_alpha_div, called on the no-abx rows only.
# NOTE(review): presumably one line plot per grouping column — confirm in the helper.
lineplot_all_div_metrics_over_time(
    noabx, metric, x_axis, group_by_values, path_to_output
)

## Abx vs. no abx: effect on alpha diversity over time

In [None]:
# Rebuild the yes/no abx flag from abx_max_count_ever (overwrites md_df["abx"]).
# NOTE(review): when the data was read, this flag was derived from
# max_abx_w_microbiome — confirm that switching the source column here is intended.
md_df["abx"] = np.where(md_df["abx_max_count_ever"] > 0.0, "yes", "no")

# "abx" repeated four times: just for lazy scaling purposes.
group_by_values = ["abx"] * 4
lineplot_all_div_metrics_over_time(
    md_df, metric, x_axis, group_by_values, path_to_output
)

In [None]:
# Density of abx events over time: one bar per rounded month of age.
max_age = 39.0
month_ticks = np.arange(0, max_age, 1.0)
# Bin edges sit on the half-months (-0.5, 0.5, ..., 38.5) so every rounded month
# gets its own bar, centred on its tick. Integer edges (range(0, 39)) would draw
# each bar half a month to the right of its tick and merge months 37 and 38,
# because the last histogram bin is closed on both sides.
bin_edges = np.arange(-0.5, max_age, 1.0)

fig, ax = plt.subplots(1, 1, figsize=(7, 3), dpi=400)
palette = sns.color_palette("husl", 2)
abx_df["abx_start_age_months_rounded1"] = abx_df["abx_start_age_months"].round()
abx_df["abx_start_age_months_rounded1"].hist(bins=bin_edges, ax=ax, color=palette[1])
ax.xaxis.set_major_locator(FixedLocator(month_ticks))
ax.set_xticklabels(month_ticks, rotation=90)

ax.set_xlabel("age_months_rounded1")
ax.set_ylabel("Number of abx courses")
ax.set_title("Number of abx courses in cohort")
# Axis limits coincide with the outermost bin edges.
ax.set_xlim(-0.5, 38.5)
ax.grid(False)

plt.tight_layout()
plt.show()

## Number of abx courses per host

In [None]:
# Keep one row per distinct (host_id, max_abx_w_microbiome) combination.
md_df_unique = md_df.drop_duplicates(subset=["host_id", "max_abx_w_microbiome"]).copy()

In [None]:
# Share of abx-exposed hosts with 1-3 abx courses in the observed time period.
abx_counts = md_df_unique["max_abx_w_microbiome"]
hosts_w_abx = md_df_unique.loc[abx_counts > 0.0, "host_id"].nunique()
# between() is inclusive on both ends, i.e. 1.0 <= count <= 3.0.
hosts_w_abx_le3 = md_df_unique.loc[abx_counts.between(1.0, 3.0), "host_id"].nunique()
# Without any abx-exposed host the share is undefined: report NaN instead of
# aborting the run with a ZeroDivisionError.
frac_le3 = 100 * (hosts_w_abx_le3 / hosts_w_abx) if hosts_w_abx else float("nan")

print(f"Fraction of hosts with 1-3 abx courses: {frac_le3:.1f} %")

In [None]:
# Bar chart: number of hosts for each total number of abx courses.
hosts_per_abx_count = md_df_unique.groupby("max_abx_w_microbiome")["host_id"].count()
# Label the bars with the actual course counts. A bar plot places its ticks at
# positions 0..n-1, so labels taken from ax.get_xticks() are wrong as soon as one
# count value is absent (groups 0, 1, 2, 5 would be labelled 0, 1, 2, 3).
hosts_per_abx_count.index = hosts_per_abx_count.index.astype(int)

fig, ax = plt.subplots(1, 1, figsize=(9, 3), dpi=400)
# rot=0 keeps the tick labels horizontal.
hosts_per_abx_count.plot(kind="bar", ax=ax, rot=0)

ax.set_ylabel("Number of hosts")
ax.set_xlabel("Total number of abx courses (with matching samples)")
ax.set_title("Total number of abx courses per host")
plt.tight_layout()
filename = os.path.join(path_to_output, "nb_abx_courses_per_host.png")
plt.savefig(filename, dpi=400, bbox_inches="tight")
plt.show()

## Number of abx courses per sample

In [None]:
# Shape before filtering.
print(md_df.shape)
# Keep the rows whose abx_any_last_t_dmonths is at most 1.0.
within_1month = md_df["abx_any_last_t_dmonths"] <= 1.0
md_df_s = md_df.loc[within_1month, :].copy()
# Shape after filtering (all columns still present).
print(md_df_s.shape)

# Reduce to the two columns needed for the per-sample count.
cols_to_keep = ["host_id", "abx_any_cumcount"]
md_df_s = md_df_s.loc[:, cols_to_keep].copy()
md_df_s.head()

In [None]:
# Number of retained samples at each cumulative abx count; resetting the index
# supplies the "index" column that is counted within each group.
md_df_sp = md_df_s.reset_index()
samples_per_cumcount = md_df_sp.groupby("abx_any_cumcount")["index"].count()

fig, ax = plt.subplots(figsize=(9, 3), dpi=400)
samples_per_cumcount.plot(kind="bar", ax=ax)
ax.set(
    ylabel="Number of samples",
    xlabel="Cum count of abx courses (with matching samples 1 month after)",
    title="Cum count of abx courses per 1-month sample",
)
plt.tight_layout()
# The figure is only displayed; this cell does not write it to disk.
plt.show()