# Check inferred nucleotide sequences

**Goals:**
* Get the number of sequences per sample and the sequence length after each of the preceding sequence fetching and processing steps.

In [None]:
import os
import pandas as pd
import numpy as np
import qiime2 as q2
import matplotlib.pyplot as plt

from IPython.display import display

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
# Shared plot appearance: matplotlib style sheet, font sizes and transparency.
plt.style.use("tableau-colorblind10")
titlesize, labelsize, ticklabel = 14, 13, 12
alpha = 0.5
# make the label size the default font size for all figures
plt.rcParams["font.size"] = labelsize

In [None]:
# version tag of the processed metadata file
tag = "20240806"
path2md = f"../data/raw/metadata_proc_v{tag}.tsv"
# the data directory is the folder that contains the metadata file
path2data = os.path.dirname(path2md)

# load the metadata with every column read as string; first column is the index
df_md = pd.read_csv(path2md, sep="\t", index_col=0, dtype="str")
# distinct values of the "study_cohort_name" column
studies_ls = list(df_md["study_cohort_name"].unique())

### Check sequencing counts and lengths after each processing step


In [None]:
# Print each study name followed by its sequence statistics table.
for study_name in studies_ls:
    print(study_name)
    stats_csv_path = os.path.join(
        path2data, "check_seqs", f"stats_{study_name}.csv"
    )
    counts_df = pd.read_csv(stats_csv_path, index_col=0)
    display(counts_df)

### Check dada2 stats

In [None]:
# Mean of every "percentage..." column of the dada2 stats, one column per study.
df_stats = pd.DataFrame()

for study_name in studies_ls:
    # load the study's dada2 stats artifact and view it as a dataframe
    stats_qza_path = os.path.join(path2data, f"dada2stats_{study_name}.qza")
    stats = q2.Artifact.load(stats_qza_path).view(q2.Metadata).to_dataframe()

    # difference between the share of input that passed the filter and the
    # share that was merged
    pct_passed_filter = stats["percentage of input passed filter"]
    pct_merged = stats["percentage of input merged"]
    stats["percentage_input_lost_filter2merge"] = pct_passed_filter - pct_merged

    # average all percentage columns (including the derived one) over samples
    cols2print = [col for col in stats.columns if col.startswith("percentage")]
    df_stats[study_name] = stats[cols2print].mean()

df_stats

### Inspect feature table (OTUs) stats
Note: these statistics are not fully comparable, because a proper comparison requires rarefying (subsampling to an equal sampling depth). Rarefaction is performed in a bootstrapped manner (n=500) when calculating the alpha diversity metrics.

In [None]:
# Gather, per study, the last row of its statistics table (first column
# excluded) -- presumably the final processing step; verify against the CSVs.
df_final = pd.DataFrame()
for study_name in studies_ls:
    stats_csv_path = os.path.join(
        path2data, "check_seqs", f"stats_{study_name}.csv"
    )
    counts_df = pd.read_csv(stats_csv_path, index_col=0)
    last_row = counts_df.iloc[-1, 1:]
    df_final[study_name] = last_row

# flip so that each study becomes a row
df_final = df_final.transpose().copy()

In [None]:
def color_gradient(val, cmap, min_val, max_val, alpha=1.0):
    """Return a CSS ``background-color`` declaration for ``val`` on a colormap.

    ``val`` is rescaled linearly from ``[min_val, max_val]`` to ``[0, 1]`` and
    passed to ``cmap``. The first three components of the returned colour are
    multiplied by 255 and combined with ``alpha`` into an ``rgba(...)`` value.

    Args:
        val: Numeric value to colour.
        cmap: Callable taking the rescaled value and returning an indexable
            colour whose first three entries are the red, green and blue
            components (in this file: a matplotlib colormap).
        min_val: Value mapped to the lower end of the colormap.
        max_val: Value mapped to the upper end of the colormap.
        alpha: Opacity written as the fourth ``rgba`` component.

    Returns:
        A string of the form ``"background-color: rgba(r, g, b, alpha)"``.
    """
    span = max_val - min_val
    # A constant column has no range to scale over: dividing by zero would
    # raise for Python numbers or produce NaN for NumPy scalars, so such
    # values are placed at the midpoint of the colormap instead.
    norm_val = (val - min_val) / span if span != 0 else 0.5
    rgba_color = cmap(norm_val)
    color = (
        f"rgba({rgba_color[0]*255}, {rgba_color[1]*255}, {rgba_color[2]*255}, {alpha})"
    )
    return f"background-color: {color}"


def column_gradient(column, cmap, alpha=1.0):
    """Return one CSS background style per entry of ``column``.

    Each value is coloured with ``color_gradient``, scaled between the
    column's own minimum and maximum.

    Args:
        column: Iterable of values exposing ``min()`` and ``max()``
            (in this file: a dataframe column passed by the pandas Styler).
        cmap: Colormap callable forwarded to ``color_gradient``.
        alpha: Opacity forwarded to ``color_gradient``.

    Returns:
        A list of CSS declarations, in the same order as ``column``.
    """
    lowest, highest = column.min(), column.max()
    styles = []
    for cell in column:
        styles.append(color_gradient(cell, cmap, lowest, highest, alpha=alpha))
    return styles


# Colour every column of df_final with the "RdYlGn" colormap, scaled per column.
cmap = plt.get_cmap("RdYlGn")
opacity = 0.3  # background opacity, between 0 and 1


def _style_column(col):
    """Return the background styles for a single column of the table."""
    return column_gradient(col, cmap, alpha=opacity)


styled_df = df_final.style.apply(_style_column, axis=0)
styled_df