# TARA Oceans data analysis
This notebook contains the analysis of the samples fetched for the BioProject PRJEB1787. This BioProject comprises approx. 240 samples corresponding to the bacterial fraction of the collected sea water. All the samples were subjected to taxonomic classification using Kraken 2 through the `classify-kraken2` action from the q2-moshpit QIIME 2 plugin.

In [None]:
import numpy as np
import qiime2 as q2
import pandas as pd
import plotly.io as pio
import seaborn as sns
import matplotlib as mp
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

import os
import plotly.graph_objects as go

from matplotlib.colors import LinearSegmentedColormap
from qiime2.plugins import taxa, emperor
from skbio import OrdinationResults
from typing import Dict, Tuple

%matplotlib inline

In [None]:
def plot_abundances(
    df: pd.DataFrame,
    ax,
    labels: Dict,
    coord: Tuple,
    color_map: Dict
):
    """
    Draw a horizontal stacked bar chart of per-sample feature abundances on `ax`.

    Columns are reordered by descending value in the first row of `df`, each column is
    drawn in the color registered for it in `color_map`, the y tick labels are replaced
    through the `labels` lookup, the legend is hidden and the y-axis is inverted so that
    the first row of `df` ends up at the top of the panel.

    Parameters:
        - df (pd.DataFrame): One row per sample, one column per feature. The first row
                             determines the stacking order of the features.
        - ax (matplotlib.axes.Axes): The axes to draw on; it is modified in place.
        - labels (dict): Maps the text of every y tick label produced by the plot
                         (the row labels of `df`) to the label that should be shown.
        - coord (tuple): Location of the samples as (lat, lon); only used in the title.
        - color_map (dict): Maps every column name of `df` to a color. A missing
                            feature raises a KeyError.

    Returns:
        None.
    """

    # Reorder the columns so the most abundant feature of the first sample comes first
    first_row = df.iloc[0]
    column_order = first_row.sort_values(ascending=False).index
    df = df[column_order]

    # Look up the color of every feature by name, in plotting order
    bar_colors = []
    for column in df.columns:
        bar_colors.append(color_map[column])

    df.plot.barh(stacked=True, ax=ax, color=bar_colors)

    # Swap the tick labels generated by pandas for the ones provided by the caller
    tick_texts = (tick.get_text() for tick in ax.get_yticklabels())
    ax.set_yticklabels([labels[text] for text in tick_texts])
    ax.set_ylabel("Depth")
    ax.set_title(f"Location: lat={coord[0]}, lon={coord[1]}")

    # The legend is created (so its handles exist) but kept hidden on the panel
    panel_legend = ax.legend()
    panel_legend.set_visible(False)

    ax.invert_yaxis()

In [None]:
# Folder holding the metadata table and all QIIME 2 artifacts read by this notebook
data_dir = "./data"

# Resolve the input paths first, then load in the same order
metadata_fp = os.path.join(data_dir, "metadata.tsv")
bracken_shannon_fp = os.path.join(data_dir, "shannon_vector.qza")
pcoa_fp = os.path.join(data_dir, "bray_curtis_pcoa_results.qza")

# Tab-separated sample metadata; the first column becomes the index
metadata = pd.read_csv(metadata_fp, sep="\t", index_col=0)

# Shannon vector viewed as a pandas Series (turned into a DataFrame further below)
shannon_artifact = q2.Artifact.load(bracken_shannon_fp)
bracken_df = shannon_artifact.view(pd.Series)

# Kept as artifacts; they are viewed/converted where they are used
pcoa = q2.Artifact.load(pcoa_fp)
ft = q2.Artifact.load(os.path.join(data_dir, "bracken_ft_rarefied.qza"))
taxonomy = q2.Artifact.load(os.path.join(data_dir, "bracken_taxonomy.qza"))

## Data preparation
In this section we will clean up the metadata and merge it with the Shannon diversity results.

Remove the "5-160" combined depths:

In [None]:
# Drop the samples whose depth is the combined "5-160" range. An explicit copy is taken
# because columns are assigned to this frame afterwards; assigning into the result of a
# boolean filter would otherwise trigger pandas' SettingWithCopyWarning.
metadata = metadata[metadata["Depth"] != '5-160'].copy()

Convert depth to numeric:

In [None]:
# Parse the depth values as numbers; anything unparsable raises (the default behaviour)
metadata["Depth"] = pd.to_numeric(metadata["Depth"], errors="raise")

Add new columns for the geocoordinates. For easier plotting, we will round the coordinates to the second decimal place and assume that all samples taken within that range originate from the same spot on Earth. Since we have both a "start" and an "end" coordinate for latitude as well as for longitude, we will take the average of the two in each case.

In [None]:
# Replace lat/lon values that are out of range with NaN. Each axis is checked against
# its own valid range: latitude lies in [-90, 90], longitude in [-180, 180].
coord_limits = {
    "Latitude_Start": 90,
    "Latitude_End": 90,
    "Longitude_Start": 180,
    "Longitude_End": 180,
}
for col, limit in coord_limits.items():
    metadata.loc[metadata[col].abs() > limit, col] = np.nan

# Remove coordinates which do not match between _start and _end (one is negative, the
# other positive): averaging such a pair would give a meaningless position
for (col_start, col_end) in [("Latitude_Start", "Latitude_End"), ("Longitude_Start", "Longitude_End")]:
    negative_to_positive = (metadata[col_start] < 0) & (metadata[col_end] > 0)
    positive_to_negative = (metadata[col_start] > 0) & (metadata[col_end] < 0)
    selection = negative_to_positive | positive_to_negative
    metadata.loc[selection, [col_start, col_end]] = np.nan

# Add lat/lon columns by averaging the values from *_Start and *_End columns, rounded to
# two decimals; the result is NaN whenever either end is missing
metadata["Lat"] = ((metadata["Latitude_Start"] + metadata["Latitude_End"]) / 2).round(2)
metadata["Lon"] = ((metadata["Longitude_Start"] + metadata["Longitude_End"]) / 2).round(2)

Add additional columns for the rounded coordinates to allow grouping samples that are located nearby:

In [None]:
# Coarser (one decimal) versions of the averaged coordinates, used to group nearby samples
for fine_col, coarse_col in (("Lat", "Lat1"), ("Lon", "Lon1")):
    metadata[coarse_col] = metadata[fine_col].round(1)

Merge the diversity metrics with the metadata:

In [None]:
# Attach the metadata to the Shannon values; both sides are joined on their index and
# only samples present in both are kept (inner join, the pandas default)
bracken_df = pd.merge(
    bracken_df.to_frame(), metadata,
    how="inner", left_index=True, right_index=True
)

# Same join for the PCoA sample coordinates, so every ordination point carries the
# diversity value and the metadata
pcoa_df = pd.merge(
    pcoa.view(OrdinationResults).samples, bracken_df,
    how="inner", left_index=True, right_index=True
)

Find "duplicated" samples which we will treat as replicates. We assume that whatever was collected at the same geo-location and depth is a replicate.

In [None]:
# Boolean mask flagging every row that repeats the start position and depth of an
# earlier row (the first row of each group is not flagged)
duplicated = bracken_df.duplicated(
    subset=["Longitude_Start", "Latitude_Start", "Depth"],
    keep="first"
)

In [None]:
# Show the flagged rows, restricted to the columns of interest and ordered by
# position and depth
(
    bracken_df
    .loc[
        duplicated,
        ["Longitude_Start", "Longitude_End", "Latitude_Start", "Latitude_End", "Depth", "Bytes", "shannon_entropy", "temperature"]
    ]
    .sort_values(["Longitude_Start", "Longitude_End", "Latitude_Start", "Latitude_End", "Depth"])
)

De-duplicate the table by calculating the average diversity from all replicates:

In [None]:
# Mean Shannon entropy per (Lat, Lon, Depth) group; rows with a missing key are
# dropped by groupby's default behaviour
replicate_groups = bracken_df.groupby(by=["Lat", "Lon", "Depth"])
bracken_df_dedupl = replicate_groups["shannon_entropy"].mean()

In [None]:
# Move the index levels back into regular columns
bracken_df_dedupl = bracken_df_dedupl.reset_index(drop=False)

Categorize depths based on quantiles for easier plotting:

In [None]:
# Depth values at the selected quantiles; they become the bin edges of the depth categories
depth_quantiles = bracken_df_dedupl["Depth"].quantile(
    q=[0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9]
)

In [None]:
# Rounded quantiles are the lower bin edges; every bin ends where the next one starts
# and the last bin is open-ended
lower_edges = [round(depth_quantiles.iloc[k], 0) for k in range(len(depth_quantiles))]
upper_edges = lower_edges[1:] + [float('inf')]

slices = []
for start, end in zip(lower_edges, upper_edges):
    depth = f'{start}-{end}'

    # Label the half-open interval [start, end) in both tables
    for frame in (bracken_df_dedupl, bracken_df):
        in_bin = (frame['Depth'] >= start) & (frame['Depth'] < end)
        frame.loc[in_bin, 'Depth_category'] = depth

    slices.append((start, end))

# Number of de-duplicated entries per depth category
bracken_df_dedupl.groupby("Depth_category").count()

## Bray-Curtis PCoA
We will plot the first two dimensions of the Bray-Curtis PCoA analysis to see how the samples cluster depending on the sampling depth.

In [None]:
# Scatter plot of the first two PCoA axes, with points colored by the `category`
# metadata column and a color bar in place of a legend.
category = "Depth"
fig1, ax1 = plt.subplots(1, 1, figsize=(6, 6))

# One color per distinct value of `category`, taken from a white -> husl gradient
color_count = pcoa_df[category].nunique()
base_color = sns.husl_palette(6, h=0.5)[1]
colormap = LinearSegmentedColormap.from_list("custom_cmap", ["white", base_color])

# Calculate the actual min and max of the depth values
min_depth = pcoa_df[category].min()
max_depth = pcoa_df[category].max()

# Calculate the color range for the colormap
# (the lowest part of the gradient is skipped so no point is drawn in pure white)
lower_threshold = 0.2
upper_threshold = 1.0

# Evenly spaced positions on the gradient, one per distinct depth
colors = [
    colormap(i) for i in 
    np.linspace(lower_threshold, upper_threshold, color_count)
]

# Distinct depths in ascending order are paired with the colors, i.e. the color depends
# on the rank of a depth among the distinct depths, not on its numeric value
colors_map = {
    depth: color for depth, color in
    zip(pcoa_df[category].value_counts().sort_index().index, colors)
}

pcoa_df["Colors"] = pcoa_df[category].map(colors_map)
# Sorts pcoa_df in place, largest depth first
# (presumably so that shallower samples are drawn last, i.e. on top - TODO confirm)
pcoa_df.sort_values(category, inplace=True, ascending=False)

# Create scatter plot
# (x=0 and y=1 are the column labels of the first two ordination axes)
sns.scatterplot(
    data=pcoa_df, x=0, y=1, color=pcoa_df["Colors"], s=50, alpha=0.5, ax=ax1
)

# Create a color bar instead of a legend
# NOTE(review): the bar maps depth linearly from min_depth to max_depth onto the full
# gradient, whereas the points use rank-based positions restricted to
# [lower_threshold, upper_threshold] - the bar therefore does not reproduce the point
# colors exactly; confirm this is intended.
norm = plt.Normalize(min_depth, max_depth)
sm = plt.cm.ScalarMappable(cmap=colormap, norm=norm)
sm.set_array([])

# Create a color bar
cbar = fig1.colorbar(
    sm, ax=ax1, orientation='vertical', fraction=0.045, pad=0.01
)
# Flip the bar's axis so that depth increases downwards
cbar.ax.invert_yaxis()
cbar.set_label(category)

# Define ticks as per actual depth values
# (multiples of tick_interval, from the one at or below min_depth up to the one at or
# below max_depth)
tick_interval = 200
ticks = np.arange(
    int(min_depth // tick_interval) * tick_interval,
    int(max_depth // tick_interval + 1) * tick_interval,
    tick_interval
)

cbar.set_ticks(ticks)
cbar.set_ticklabels(ticks)

# Set axes labels and title
ax1.set_xlabel("PCoA 1")
ax1.set_ylabel("PCoA 2")
ax1.set_title("Bray-Curtis")

# Set equal aspect ratio for square plot
ax1.set_aspect('equal', adjustable='box')

plt.tight_layout()
plt.show()

In [None]:
# Write the PCoA figure as SVG (the format is also implied by the file extension)
fig1.savefig(os.path.join(data_dir, "figure1.svg"), format="svg", dpi=300)

## Global diversity overview
Here we will look at how the Shannon diversity correlates with the sample location and depth. Moreover, we will look at how the abundances of the taxa at four different locations change with the sampling depth.

Let's use the `Lat1`/`Lon1` rounded coordinates to count how many samples per location there are. We will then select 4 locations with the highest counts of samples to analyse their per-depth abundances.

In [None]:
# Number of samples (non-missing depth values) per rounded location; the 15 locations
# with the most samples are shown, in ascending order
(
    pcoa_df
    .groupby(["Lat1", "Lon1"])["Depth"]
    .count()
    .sort_values()
    .tail(15)
)

We need samples with at least 3 different depths - let's visually check that some of the coordinates above can provide us with those:

In [None]:
# Locations analysed in detail, as (Lat1, Lon1) pairs, i.e. coordinates rounded to one decimal
coordinates = [
    (-35.2, 26.3),
    (-9.0, -139.2),
    (-31.0, 4.7),
    (-30.2, -43.3),
]

In [None]:
# List the sampled depths found at each of the selected locations
for (lat, lon) in coordinates:
    at_location = (pcoa_df["Lat1"] == lat) & (pcoa_df["Lon1"] == lon)
    depths_here = pcoa_df.loc[at_location, "Depth"]
    print(f"Lat: {lat}, Lon: {lon}")
    print(depths_here)

### Overview of all samples

In [None]:
# One shade of blue per depth slice: slice i is placed at fraction i / len(slices) of
# the interval [lower_threshold, upper_threshold) of the colormap
colormap = mp.colormaps['Blues']
lower_threshold = 0.2
upper_threshold = 1.0

colors = []
for i in range(len(slices)):
    cmap_position = lower_threshold + (upper_threshold - lower_threshold) * (i / len(slices))
    colors.append(mcolors.rgb2hex(colormap(cmap_position)))

# Divisor applied to the squared Shannon entropy to obtain the marker size on the map
scale = 0.15
fig2 = go.Figure()

In [None]:
# Text attached to every point on the map
entropy_as_text = bracken_df_dedupl['shannon_entropy'].astype(str)
bracken_df_dedupl['text'] = "Shannon diversity: " + entropy_as_text

Plot all the points:

In [None]:
# One trace per depth slice, so every slice gets its own color and legend entry
for slice_idx, (depth_from, depth_to) in enumerate(slices):
    # Entries whose depth falls into the half-open interval [depth_from, depth_to)
    in_slice = (depth_from <= bracken_df_dedupl["Depth"]) & (bracken_df_dedupl["Depth"] < depth_to)
    slice_points = bracken_df_dedupl[in_slice]

    # Marker area grows with the squared Shannon entropy
    slice_marker = dict(
        size=slice_points['shannon_entropy'] ** 2 / scale,
        color=colors[slice_idx],
        sizemode='area'
    )
    slice_trace = go.Scattergeo(
        locationmode='ISO-3',
        lon=slice_points['Lon'],
        lat=slice_points['Lat'],
        text=slice_points['text'],
        marker=slice_marker,
        name='{0} - {1}'.format(depth_from, depth_to)
    )
    fig2.add_trace(slice_trace)

Add new points indicating the four locations which we will be analysing in more detail:

In [None]:
# One red marker per selected location; the same caption is used for hover text and legend
for lat, lon in coordinates:
    caption = f"Location: lat={lat}, lon={lon}"
    location_trace = go.Scattergeo(
        locationmode='ISO-3',
        lon=[lon],
        lat=[lat],
        text=[caption],
        marker=dict(size=25, color="red", sizemode='area'),
        name=caption
    )
    fig2.add_trace(location_trace)

In [None]:
# Figure size, world map styling and legend placement (anchored by its bottom-left corner)
fig2.update_layout(
    width=1200,
    height=650,
    showlegend=True,
    geo={"scope": 'world', "landcolor": 'rgb(217, 217, 217)'},
    legend={"yanchor": "bottom", "y": 0.075, "xanchor": "left", "x": 0.75},
    margin={"t": 0}
)

fig2.show()

In [None]:
# Export the map as a static SVG image
pio.write_image(fig=fig2, file=os.path.join(data_dir, 'figure2.svg'))

### Abundance analysis per location

We collapse the table to the `order` level:

In [None]:
# Taxonomic level the feature table is collapsed to
level = 4

# `collapse` returns a results object with a single output, unpacked here
(ft_collapsed,) = taxa.methods.collapse(table=ft, taxonomy=taxonomy, level=level)

Next, we convert the artifacts to Pandas' objects and normalize the feature table:

In [None]:
taxonomy_ser = taxonomy.view(pd.Series)
ft_df = ft_collapsed.view(pd.DataFrame)

# Divide every row by its own total, turning each row into fractions of that total
ft_df = ft_df.div(ft_df.sum(axis="columns"), axis="index")

We will collect all the _unique_ features across all the plots and find the top N features with the highest abundances:

In [None]:
# Number of leading features per location that count as "top" features
num_top_features = 5
top_features = []
other_features = []

# The depth-sorted metadata does not depend on the location, so it is built once
metadata_sorted = metadata.sort_values(["Depth", "Lat1", "Lon1"], ascending=True)

for (lat, lon) in coordinates:
    # First sample ID per depth at this location, ordered by increasing depth
    # (relies on the metadata index being named "ID")
    at_location = (metadata_sorted["Lat1"] == lat) & (metadata_sorted["Lon1"] == lon)
    samples = (
        metadata_sorted.loc[at_location, ["Lat1", "Lon1", "Depth"]]
        .reset_index()
        .groupby("Depth")
        .first()["ID"]
        .tolist()
    )

    # Keep the features present in at least one of these samples and order them by
    # their abundance in the first (shallowest) sample
    df = ft_df.loc[samples, :]
    df = df.loc[:, (df != 0).any(axis=0)]
    df = df.sort_values(by=df.index[0], axis=1, ascending=False)

    for i, feature in enumerate(df.columns):
        if i < num_top_features:
            if feature not in top_features:
                top_features.append(feature)
        else:
            other_features.append(feature)

# A feature can rank below the top N at one location and within it at another, and it
# can show up at several locations: keep each remaining feature once (first-seen order)
# and never list a top feature among the others
top_feature_set = set(top_features)
other_features = [
    feature for feature in dict.fromkeys(other_features)
    if feature not in top_feature_set
]
all_features = set(top_features + other_features)

Create two color maps: one for the top N features and the other one for all the remaining ones: 

In [None]:
# Distinct hues for the top features
top_color_list = sns.husl_palette(len(top_features), h=.5)

# Shades of grey for all remaining features. A colormap is only defined on [0, 1]:
# positions above 1 all map to the same end color, so the sampled range must stop
# at 1.0 for the shades to stay distinguishable.
other_color_list = mp.colormaps['Greys'](np.linspace(0.2, 1.0, len(all_features) - len(top_features)))

color_map = {}

# Top features get their hue in order of first appearance
for position, feature in enumerate(dict.fromkeys(top_features)):
    color_map[feature] = top_color_list[position]

# Every other feature gets the next grey, also in order of first appearance;
# features that already have a color (top features, repeats) are skipped
remaining_features = [
    feature for feature in dict.fromkeys(other_features)
    if feature not in color_map
]
for position, feature in enumerate(remaining_features):
    color_map[feature] = other_color_list[position % len(other_color_list)]

In [None]:
# One panel per selected location, stacked vertically. The number of rows follows
# `coordinates`, so no location is silently dropped by the zip() below and no panel is
# left empty; squeeze=False + ravel() always yields a 1-D array of axes.
fig3, axes3 = plt.subplots(len(coordinates), 1, figsize=(5, 8), squeeze=False)
axes3 = axes3.ravel()
top_handles = []
top_labels = []

# Plot each subplot with the unified color map
metadata_sorted = metadata.sort_values(["Depth", "Lat1", "Lon1"], ascending=True, inplace=False)
for (lat, lon), ax in zip(coordinates, axes3):
    # Find sample IDs: the first sample per depth at this location, by increasing depth
    # (relies on the metadata index being named "ID")
    samples = metadata_sorted.loc[
        (metadata_sorted["Lat1"] == lat) & (metadata_sorted["Lon1"] == lon), ["Lat1", "Lon1", "Depth"]
    ].reset_index().groupby("Depth").first()["ID"].tolist()

    # Sample ID -> depth, used to relabel the y-axis ticks
    labels = dict(metadata_sorted.loc[samples, "Depth"])

    # Slice feature table
    df = ft_df.loc[samples, :]

    # Remove columns (features) with zeros across all samples
    df = df.loc[:, (df != 0).any(axis=0)]

    plot_abundances(df, ax, labels, (lat, lon), color_map)

    # Collect handles and labels for the top features: the first entries of the panel's
    # legend, since the columns are plotted in order of decreasing abundance
    handles_legend, labels_legend = ax.get_legend_handles_labels()
    for handle, label in zip(handles_legend[:num_top_features], labels_legend[:num_top_features]):
        # Keep the last ';'-separated segment without its first three characters
        # (presumably the rank prefix, e.g. "o__" - TODO confirm)
        label = label.split(";")[-1][3:]
        if label not in top_labels:
            top_handles.append(handle)
            top_labels.append(label)

# Add the combined legend for the top features from each subplot
fig3.legend(top_handles, top_labels, title='Top Features', loc='center right', bbox_to_anchor=(1.75, 0.5))

plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for the legend
plt.show()

In [None]:
# The combined legend is anchored outside the figure canvas (x=1.75 in figure
# coordinates), so the saved area has to be extended to the tight bounding box of all
# artists; otherwise the legend is cut off in the written file.
fig3.savefig(os.path.join(data_dir, "figure3.svg"), dpi=300, bbox_inches="tight")