# Compute Taxa Abundance in the *Candidatus* Electrothrix communis RB Sample

The purpose of this notebook is to compute the taxa abundance in the same way as for the Kalø Vig and Løgten samples (see notebook `compute_abundance.ipynb`) and to merge these data with contamination, completeness, and taxonomic data. The abbreviation "marine_gs" stands for "Marine Golden Standard", an internal name for the *Candidatus* Electrothrix communis RB species.

In [1]:
import os
import re
from pathlib import Path

import pandas as pd

In [2]:
# Load the per-contig depth table, keeping only the contig name and its
# total average depth.
marine_gs_illumina_contig_depth = pd.read_csv(
    "../../../mapping/results/2022-05-19/marine_gs_illumina_bins_contig_depths/marine_gs_illumina_bam_contig_depths.tsv",
    sep="\t",
    usecols=["contigName", "totalAvgDepth"],
)

In [3]:
# Strip the leading "S1C" tag so the names match the bin contig names.
# Only the literal prefix is removed (rather than blindly slicing off the
# first three characters), so names that no longer carry the prefix are left
# untouched and re-running this cell does not truncate them further.
marine_gs_illumina_contig_depth["contigName"] = marine_gs_illumina_contig_depth[
    "contigName"
].str.replace(r"^S1C", "", regex=True)

In [4]:
def compute_depth(contigs_path, depth_file):
    """
    Compute the mean contig depth (i.e., abundance) of every bin in a directory.

    Every file in ``contigs_path`` whose name contains ``.fa_contig_names`` is
    read as a headerless list of contig ids. The ids are matched against
    ``depth_file`` and the ``totalAvgDepth`` values of the matching contigs are
    averaged. The average is a plain (unweighted) mean over contigs; contig
    length is not taken into account.

    Parameters
    ----------
    contigs_path : str or os.PathLike
        Directory with one tsv file per bin, named ``<bin id>.fa_contig_names...``,
        containing the contig ids of that bin. A trailing path separator is
        optional. Entries that are not files or do not follow the naming
        scheme (e.g. ``.ipynb_checkpoints``) are skipped.
    depth_file : pd.DataFrame
        DataFrame with a ``contigName`` column and a ``totalAvgDepth`` column.

    Returns
    -------
    pd.DataFrame
        One row per bin with the columns ``Bin Id`` and ``AvgDepth``, ordered by
        file name. ``AvgDepth`` is NaN for a bin none of whose contigs occur
        in ``depth_file``.
    """
    # Bin id = everything in front of the literal ".fa_contig_names" suffix
    pattern = re.compile(r".+?(?=\.fa_contig_names)")

    records = []
    # Sort so that the row order does not depend on the filesystem
    for tsv in sorted(Path(contigs_path).iterdir()):
        match = pattern.match(tsv.name)
        if match is None or not tsv.is_file():
            # Not a bin contig list; ignore instead of crashing
            continue

        contig_names = pd.read_table(tsv, names=["contigName"])

        # Keep only the depth rows that belong to contigs of this bin
        merged = contig_names.merge(depth_file, on="contigName")

        records.append(
            {"Bin Id": match.group(0), "AvgDepth": merged["totalAvgDepth"].mean()}
        )

    # Explicit columns keep the frame well-formed even when no bin was found
    return pd.DataFrame(records, columns=["Bin Id", "AvgDepth"])

In [5]:
# Per-bin mean depth for the marine_gs Illumina bins
marine_gs_illumina_abund = compute_depth(
    contigs_path="marine_gs_illumina_contig_names/",
    depth_file=marine_gs_illumina_contig_depth,
)

Now merge the abundance table with the taxonomy and quality table.

In [6]:
# Load the table holding the taxonomy and quality data of the bins
marine_gs_illumina_taxa_quality = pd.read_csv(
    filepath_or_buffer="../../../taxonomy/results/2022-05-19/marine_gs_illumina_taxa_quality.csv"
)

# Attach the abundances. No key is given, so pandas joins on the columns the
# two tables have in common, and the default inner join keeps only rows
# present in both tables.
marine_gs_illumina_merged = pd.merge(
    marine_gs_illumina_taxa_quality,
    marine_gs_illumina_abund,
)

# Write the combined table next to the input table, without the row index
marine_gs_illumina_merged.to_csv(
    path_or_buf="../../../taxonomy/results/2022-05-19/marine_gs_illumina_taxa_quality_abund.csv",
    index=False,
)