# Usecase 2: Ocean temperature prediction data preparation

This notebook prepares the dataset for the ocean temperature usecase following the general data preparation approach outlined in [the original publication by Sunagawa et al. 2015](https://www.science.org/doi/10.1126/science.1261359).

This notebook can be run in the following conda environment (the last command must be launched from the root of this repository):
```shell
mamba env create -f environment_prep_data.yml
conda activate ritme_examples_prep_data
pip install -e .
qiime dev refresh-cache
```

## Setup

In [None]:
import os
import re
import shlex
import subprocess

import numpy as np
import pandas as pd
import qiime2 as q2
import skbio
from qiime2 import Artifact
from skbio import TreeNode

from src.meta_fetch import fetch_mitag_metadata
from src.seq_fetch_n_process import fetch_mitag_otus

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
# ---- user-configurable settings ----
# Folder in which all data of this usecase is saved.
destination_folder = "../../data/u2_tara_ocean"
# Location of the Excel workbook with the companion metadata tables.
url_metadata = "https://ocean-microbiome.embl.de/data/OM.CompanionTables.xlsx"
# Location of the gzipped TSV with the miTAG OTU counts.
url_otu_counts = "https://ocean-microbiome.embl.de/data/miTAG.taxonomic.profiles.release.tsv.gz"
# ---- end of user-configurable settings ----

## Fetch and process metadata

In [None]:
# Fetch the companion metadata referenced by `url_metadata`. Later cells read
# OM.CompanionTables.xlsx from the data folder, so this helper presumably stores
# the workbook in `destination_folder` (implementation in src.meta_fetch — verify).
fetch_mitag_metadata(destination_folder, url_metadata)

In [None]:
# Table W1 of OM.CompanionTables: sea basins for stratification plus potential
# covariates to also consider for modelling.
# The workbook is looked up in `destination_folder`, so changing that user
# input also changes where the metadata is read from.
md_w1 = pd.read_excel(
    os.path.join(destination_folder, "OM.CompanionTables.xlsx"),
    sheet_name="Table W1",
)

# Columns are located by the start of their header text and mapped to short names.
w1_prefix_to_name = {
    "Sample label": "sample_id",
    "Ocean and sea regions": "ocean_basin",
    "Sampling depth": "sampling_depth_m",
    "Environmental Feature": "env_feature",
    "Latitude": "latitude",
    "Longitude": "longitude",
}

# The PANGAEA identifier is renamed by exact header so that it matches the
# "PANGAEA Sample ID" key that the merge with Table W8 uses.
w1_rename_map = {"PANGAEA sample identifier": "PANGAEA Sample ID"}
for prefix, new_name in w1_prefix_to_name.items():
    matching_cols = [col for col in md_w1.columns if col.startswith(prefix)]
    if not matching_cols:
        # fail with the missing header named instead of a bare IndexError
        raise KeyError(f"No column starting with '{prefix}' found in Table W1")
    # if several headers share the prefix, the first one is used
    w1_rename_map[matching_cols[0]] = new_name

# one non-inplace rename keeps the cell idempotent and easy to follow
md_w1 = md_w1.rename(columns=w1_rename_map)

md_selected = md_w1[
    [
        "sample_id",
        "PANGAEA Sample ID",
        "ocean_basin",
        "sampling_depth_m",
        "env_feature",
        "latitude",
        "longitude",
    ]
].copy()
md_selected.head()

In [None]:
# Table W8 of OM.CompanionTables: source of the temperature column.
# The workbook is looked up in `destination_folder`, so changing that user
# input also changes where the metadata is read from.
md_w8 = pd.read_excel(
    os.path.join(destination_folder, "OM.CompanionTables.xlsx"),
    sheet_name="Table W8",
)

# Locate the temperature column by a case-insensitive substring match on the header.
temp_col = [col for col in md_w8.columns if "temperature" in col.lower()]
if not temp_col:
    # fail with a clear message instead of a bare IndexError
    raise KeyError("No column containing 'temperature' found in Table W8")

# if several headers match, the first one is used
md_w8 = md_w8.rename(columns={temp_col[0]: "temperature_mean_degc"})
md_w8_selected = md_w8[["PANGAEA Sample ID", "temperature_mean_degc"]].copy()

In [None]:
# Inner-join the W1 selection with the W8 temperature column on the PANGAEA
# identifier, index the result by sample label and discard the join key.
md_merged = (
    md_selected.merge(md_w8_selected, on="PANGAEA Sample ID", how="inner")
    .set_index("sample_id")
    .drop(columns="PANGAEA Sample ID")
)
md_merged.head()

## Fetch and process sequences & taxonomy

OTU counts derived from miTAG sequences

In [None]:
# fetch raw count data
fetch_mitag_otus(destination_folder, url_otu_counts)

Raw feature table to be used by *ritme*: `otu_table_tara_ocean.tsv`

In [None]:
# Load the tab-separated miTAG profile table and name the OTU identifier
# column "Feature ID", the name used by the rest of the notebook.
mitag_df = pd.read_csv(
    os.path.join(destination_folder, "miTAG.taxonomic.profiles.release.tsv"),
    sep="\t",
).rename(columns={"OTU.rep": "Feature ID"})

In [None]:
# Raw feature table: the "Feature ID" column plus every column whose name
# starts with "TARA".
cols_to_extract = [
    name for name in mitag_df.columns if name.startswith(("Feature ID", "TARA"))
]

# Index by feature, then transpose so that the "TARA..." columns become rows.
ft_df = mitag_df[cols_to_extract].set_index("Feature ID").T
# After transposing, the columns axis carries the name "Feature ID"; clear it.
ft_df.columns.name = None

# Persist the raw table as a tab-separated file.
ft_df.to_csv(os.path.join(destination_folder, "otu_table_tara_ocean.tsv"), sep="\t")

ft_df.head()

Feature table for publication-like modelling: `otu_table_tara_ocean_proc.tsv`, created by applying these additional steps from the original publication:
* "We applied an additional low-abundance filter, which removed features whose relative abundance did not exceed 0.0001 in any sample." (1)
* "we applied a logarithmic transformation to relative abundances using the function log10(x + x0), where x is the original relative abundance and x0 is a small constant, and x0 < min(x)." (2)
* "Compositional data (see above) were normalized to ranks across samples and then used to learn a regression model to predict environmental measures." (3)

In [None]:
# (1) low abundance filtering of features
# A feature is kept only if its relative abundance reaches this value in at
# least one sample (a maximum exactly equal to the threshold is kept).
MIN_MAX_REL_ABUNDANCE = 0.0001

# Relative abundances: every row is divided by its own sum. Vectorised
# equivalent of a row-wise apply, without the per-row Python overhead.
ft_df_rel = ft_df.div(ft_df.sum(axis=1), axis=0)

# Per-feature maximum over all rows, compared against the threshold.
keep_feature = ft_df_rel.max(axis=0) >= MIN_MAX_REL_ABUNDANCE
ft_df_rel_pub = ft_df_rel.loc[:, keep_feature.values]

# shape before filtering (printed) and after filtering (cell output)
print(ft_df_rel.shape)
ft_df_rel_pub.shape

In [None]:
# (2) log transformation of relative abundances: log10(x + x0)
# x0 of the quoted method; it is meant to be smaller than the smallest relative
# abundance (not checked here — NOTE(review): confirm for this dataset).
PSEUDOCOUNT = 0.000001
# Vectorised over the whole frame. Base 10 follows the log10(x + x0) formula
# quoted in the notebook text; the rank normalisation applied afterwards is
# invariant to the choice of base because every logarithm is monotonic.
ft_df_rel_pub_log = np.log10(ft_df_rel_pub + PSEUDOCOUNT)

In [None]:
# (3) rank normalisation: within every row the values are replaced by their
# rank, the largest value receiving rank 1. The log transform of step (2) is
# monotonic, so the ranks are the same with or without it.
# NOTE(review): this ranks along the columns, i.e. the features inside each
# row; confirm this is the intended reading of "ranks across samples".
ft_df_rel_pub_log_ranked = ft_df_rel_pub_log.rank(axis="columns", ascending=False)

In [None]:
# Save the filtered, log-transformed and rank-normalised feature table as a
# tab-separated file named otu_table_tara_ocean_proc.tsv in `destination_folder`.
ft_df_rel_pub_log_ranked.to_csv(
    os.path.join(destination_folder, "otu_table_tara_ocean_proc.tsv"),
    sep="\t",
)

Extract taxonomy

In [None]:
# extract taxonomy table & save
# Every column that is not a "TARA..." column belongs to the taxonomy part of
# the table (this includes "Feature ID").
cols_for_tax = [col for col in mitag_df.columns if not col.startswith("TARA")]

# Work on an explicit copy so the column assignments below modify an
# independent frame rather than a slice of mitag_df (avoids pandas'
# SettingWithCopyWarning).
tax_df = mitag_df[cols_for_tax].copy().set_index("Feature ID")

# replace empty space with "_"
tax_cols = ["Domain", "Phylum", "Class", "Order", "Family", "Genus"]
tax_df[tax_cols] = tax_df[tax_cols].apply(lambda col: col.str.replace(" ", "_"))

# compress taxonomy info into "Taxon" column with prefixes; one prefix per
# entry of tax_cols, in the same order
prefixes = ["k__", "p__", "c__", "o__", "f__", "g__"]

# Levels with a missing value are left out of the joined string.
tax_df["Taxon"] = tax_df.apply(
    lambda row: "; ".join(
        f"{pre}{row[col]}"
        for pre, col in zip(prefixes, tax_cols)
        if pd.notna(row[col])
    ),
    axis=1,
)

# save to file: only the "Taxon" column (indexed by "Feature ID") is imported
# as a QIIME 2 FeatureData[Taxonomy] artifact
tax_df_to_save = tax_df[["Taxon"]].copy()
tax_art = q2.Artifact.import_data("FeatureData[Taxonomy]", tax_df_to_save)
tax_art.save(os.path.join(destination_folder, "taxonomy_tara_ocean.qza"))

tax_df_to_save.head()

## Build phylogenetic tree

In [None]:
# Build the phylogeny with the project shell script. It receives the data folder
# and the literal "3" (meaning defined by create_phylogeny_u2.sh — presumably a
# thread/CPU count, verify).
# The folder is shell-quoted so that a path containing spaces or shell
# metacharacters is still passed as a single argument.
command = f"../../src/create_phylogeny_u2.sh {shlex.quote(destination_folder)} 3"
# check=True raises CalledProcessError when the script exits with a non-zero
# status, instead of letting later cells fail on a missing tree file.
subprocess.run(command, shell=True, check=True)

In [None]:
# Remove taxonomic information from leaves and add unclassified node

tree_file = os.path.join(destination_folder, "fasttree_tree_rooted_suna15.qza")
# The output name is built explicitly instead of via str.replace on the full
# path, which would also rewrite a "rooted_" occurring inside `destination_folder`.
file_out = os.path.join(destination_folder, "fasttree_tree_rooted_proc_suna15.qza")

if os.path.exists(file_out):
    # the processed artifact is reused, which keeps re-runs of the notebook cheap
    print(f"Processed tree file {file_out} already exists. Skipping tree processing.")
else:
    phylogeny = q2.Artifact.load(tree_file)
    tree = phylogeny.view(skbio.TreeNode)

    # rename - removing tax info from leaves: a tip name consisting of a leading
    # run of letters, digits, "_" or "." followed by whitespace and further text
    # is cut down to that leading run; names that do not match stay unchanged
    leaf_id_pattern = re.compile(r"([A-Za-z0-9_\.]+)\s+.*")
    for node in tree.tips():
        match = leaf_id_pattern.match(node.name)
        if match:
            node.name = match.group(1)

    # add unclassified node: appended as a child of `tree` with branch length 1.0
    node_unclassified = skbio.TreeNode(name="unclassified", length=1.0)
    tree.extend([node_unclassified])

    phylogeny_renamed = q2.Artifact.import_data("Phylogeny[Rooted]", tree)
    phylogeny_renamed.save(file_out)

## Subset metadata according to feature table

In [None]:
# Restrict the metadata to the rows of the feature table, in feature-table order.
print(md_merged.shape)

# Fail with an explicit message if the feature table contains samples without
# metadata, independent of how the installed pandas treats missing labels in .loc.
samples_without_md = ft_df.index.difference(md_merged.index)
if len(samples_without_md) > 0:
    raise KeyError(
        f"{len(samples_without_md)} feature table sample(s) missing from metadata, "
        f"e.g. {samples_without_md[:5].tolist()}"
    )
md_subset = md_merged.loc[ft_df.index]

# save the subset as a tab-separated file next to the feature tables
path_to_md = os.path.join(destination_folder, "md_tara_ocean.tsv")
md_subset.to_csv(path_to_md, sep="\t")
md_subset.shape