# Check inferred nucleotide sequences

**Goals:**
* Get the number of sequences per sample and the sequence length after each of the preceding sequence fetching and processing steps.

In [1]:
import os
import pandas as pd
import numpy as np
import qiime2 as q2
import matplotlib.pyplot as plt

from IPython.display import display

%load_ext autoreload
%autoreload 2

%matplotlib inline

  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,


In [2]:
# plotting settings: colour-blind-friendly style, shared font sizes and transparency
plt.style.use("tableau-colorblind10")
titlesize, labelsize, ticklabel = 14, 13, 12
alpha = 0.5
# default font size for all figure text
plt.rcParams["font.size"] = labelsize

In [3]:
# the processed metadata file is versioned by a date tag
tag = "20240725"
path2md = f"../data/raw/metadata_proc_v{tag}.tsv"
# all other inputs are read from the folder that holds the metadata file
path2data = os.path.dirname(path2md)

# read metadata with every column as string and list the distinct study cohorts
df_md = pd.read_csv(path2md, sep="\t", index_col=0, dtype="str")
studies_ls = list(df_md["study_cohort_name"].unique())

### Check sequencing counts and lengths after each processing step


In [4]:
# show the per-processing-step sequence statistics table of every study
for study_name in studies_ls:
    print(study_name)
    stats_csv = os.path.join(path2data, "check_seqs", f"stats_{study_name}.csv")
    counts_df = pd.read_csv(stats_csv, index_col=0)
    display(counts_df)

vatanen19_abx


Unnamed: 0,sample_count,total_nb_sequences,mean_nb_seq_p_sample,mean_nb_unique_seq_p_sample,median_len_sequences
raw,1098.0,,52235.20765,,175.0
trim,1098.0,,52235.20765,,175.0
denoise,1098.0,6304.0,36160.632969,68.724954,253.0
cluster,1098.0,2020.0,36112.623862,64.02459,253.0


vatanen19_t1d


Unnamed: 0,sample_count,total_nb_sequences,mean_nb_seq_p_sample,mean_nb_unique_seq_p_sample,median_len_sequences
raw,655.0,,62734.505344,,175.0
trim,655.0,,62729.215267,,175.0
denoise,655.0,3386.0,39773.603053,73.232061,253.0
cluster,655.0,1684.0,39664.870229,68.448855,253.0


vatanen19_karelia


Unnamed: 0,sample_count,total_nb_sequences,mean_nb_seq_p_sample,mean_nb_unique_seq_p_sample,median_len_sequences
raw,1451.0,,68414.84907,,175.0
trim,1451.0,,68414.826327,,175.0
denoise,1451.0,8045.0,34750.093728,62.620262,253.0
cluster,1451.0,2761.0,34708.864232,58.530669,253.0


### Check dada2 stats

In [5]:
# one column per study, one row per mean "percentage ..." statistic
df_stats = pd.DataFrame()

for study_name in studies_ls:
    # load the DADA2 denoising stats artifact of this study as a dataframe
    stats_qza = os.path.join(path2data, f"dada2stats_{study_name}.qza")
    stats = q2.Artifact.load(stats_qza).view(q2.Metadata).to_dataframe()

    # percentage points of the input that passed the filter but were not merged
    pct_passed_filter = stats["percentage of input passed filter"]
    pct_merged = stats["percentage of input merged"]
    stats["percentage_input_lost_filter2merge"] = pct_passed_filter - pct_merged

    # average every percentage column over the rows of the stats table
    cols2print = [col for col in stats.columns if col.startswith("percentage")]
    df_stats[study_name] = stats[cols2print].mean()

df_stats

Unnamed: 0,vatanen19_abx,vatanen19_t1d,vatanen19_karelia
percentage of input passed filter,84.469918,70.279969,68.018849
percentage of input merged,80.296803,68.230397,57.085803
percentage of input non-chimeric,71.00337,64.70542,52.651192
percentage_input_lost_filter2merge,4.173115,2.049573,10.933046


### Inspect feature table (OTUs) stats
Note: these statistics are not fully comparable across studies, because a proper comparison requires rarefying the feature tables (subsampling to an equal sampling depth). Rarefaction is performed in a bootstrapped manner (n=500) when calculating the alpha diversity metrics.

In [6]:
# collect, per study, the last row of its stats table (first data column dropped)
df_final = pd.DataFrame()
for study_name in studies_ls:
    stats_csv = os.path.join(path2data, "check_seqs", f"stats_{study_name}.csv")
    counts_df = pd.read_csv(stats_csv, index_col=0)
    df_final[study_name] = counts_df.iloc[-1, 1:]

# flip so that studies are rows and statistics are columns
df_final = df_final.transpose().copy()

In [7]:
def color_gradient(val, cmap, min_val, max_val, alpha=1.0):
    """Return a CSS ``background-color`` declaration for ``val``.

    ``val`` is scaled linearly so that ``min_val`` maps to 0 and ``max_val``
    maps to 1; ``cmap`` is called with the scaled value and the first three
    components of its result (multiplied by 255) become the RGB channels.

    Args:
        val: Scalar value to colour.
        cmap: Callable mapping a float to an RGBA sequence with components
            in [0, 1].
        min_val: Value mapped to the lower end of the colormap.
        max_val: Value mapped to the upper end of the colormap.
        alpha: Opacity written into the ``rgba(...)`` string.

    Returns:
        ``"background-color: rgba(r, g, b, alpha)"``, or an empty string
        (no styling) when ``val`` is missing.
    """
    # a missing cell has no position on the gradient; leave it unstyled
    if pd.isna(val):
        return ""

    value_range = max_val - min_val
    if pd.isna(value_range) or value_range == 0:
        # constant (or undefined) range: avoid 0/0 and use the neutral midpoint
        norm_val = 0.5
    else:
        norm_val = (val - min_val) / value_range

    rgba_color = cmap(norm_val)
    color = (
        f"rgba({rgba_color[0]*255}, {rgba_color[1]*255}, {rgba_color[2]*255}, {alpha})"
    )
    return f"background-color: {color}"


def column_gradient(column, cmap, alpha=1.0):
    """Build one background-color CSS string per entry of ``column``.

    Each entry is coloured relative to the column's own minimum and maximum.
    """
    lowest, highest = column.min(), column.max()
    styles = []
    for cell in column:
        styles.append(color_gradient(cell, cmap, lowest, highest, alpha=alpha))
    return styles


# colour every column of the summary table on its own red-yellow-green scale
cmap = plt.get_cmap("RdYlGn")
opacity = 0.3  # Adjust the opacity value between 0 and 1
# extra keyword arguments are forwarded by Styler.apply to column_gradient
styled_df = df_final.style.apply(column_gradient, axis=0, cmap=cmap, alpha=opacity)
styled_df

  norm_val = (val - min_val) / (max_val - min_val)


Unnamed: 0,total_nb_sequences,mean_nb_seq_p_sample,mean_nb_unique_seq_p_sample,median_len_sequences
vatanen19_abx,2020.0,36112.623862,64.02459,253.0
vatanen19_t1d,1684.0,39664.870229,68.448855,253.0
vatanen19_karelia,2761.0,34708.864232,58.530669,253.0
