# Plot Sample KO counts from metaT

Given a sample × KO table populated with normalized counts, make exploratory plots (count distributions and UMAP embeddings).

## Setup

In [53]:
import os 
import re
import glob
import umap
import numpy as np
import pandas as pd
from time import time
from scipy import stats
from collections import * 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [3]:
os.getcwd()

In [4]:
workdir = '/scratch/bgrodner/iron_ko_contigs'
os.chdir(workdir)


In [5]:
os.getcwd()

In [6]:
os.listdir()

## Load table and inspect

In [7]:
dir_combined = 'metat_search_results/dicts_iron_KO_contig/dicts_contig_count/tables_norm_count/combined'
fn_table_sample_ko_norm_counts = f'{dir_combined}/iron_KOs.txt-table_samples_KOs_norm_count.csv'

table = pd.read_csv(fn_table_sample_ko_norm_counts)
table.shape

In [8]:
table.columns

In [9]:
table[:3]

In [10]:
table.fn_KO.unique()

In [11]:
asss = table.assembly.unique()
asss

In [12]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'sample'].unique())

In [13]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'ammendment'].unique())

In [14]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'timepoint'].unique())

In [15]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'depth'].unique())

In [16]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'size'].unique())

In [17]:
for ass in asss:
    print(ass)
    print(table.loc[table.assembly == ass, 'rep'].unique())

## What is the distribution of the counts?

In [18]:
bins = 100
size = (10, 2)
# KO count columns are taken to start at position 9 (the columns before that
# are printed as sample information further down); only the first ten KO
# columns are plotted here.
for i in range(9, 19):
    vals = table.iloc[:, i].values

    fig, ax = plt.subplots(figsize=size)
    _ = ax.hist(vals, bins=bins)
    ax.set_yscale('log')
    # Label every histogram so the column it shows can be identified
    ax.set_title(table.columns[i])
    ax.set_xlabel('Normalized count')
    ax.set_ylabel('Number of rows')
    plt.show()
    # Close this specific figure so figures do not accumulate in memory
    plt.close(fig)

Is the outlier the same sample each time?

In [19]:
for i in range(9,19):
    row = table.iloc[:,i].argmax()
    print(table.iloc[row, :9])

What is the norm factor?

In [20]:
dir_norm_factors = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/dicts_contig_count/norm_factors/G2NS_kofam2021_ALL.csv-iron_KOs.txt-norm_factors'
fn_norm_factor = f'{dir_norm_factors}/G2NS_kofam2021_ALL.csv-iron_KOs.txt-G2NS.S06C1.15m.0_2um.B.tsv-norm_factor.txt'
with open(fn_norm_factor, 'r') as f:
    print(f.read())

Other norm factors in the directory

In [21]:
fns_norm_factor = glob.glob(dir_norm_factors + '/*')
for fn in fns_norm_factor:
    with open(fn, 'r') as f:
        print(f.read())

### Test for lognormality

(based on https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-03892-w)

Shapiro–Wilk test for normality

Shapiro–Wilk test after log transform





In [22]:
# p-values of the Shapiro-Wilk test on the raw and on the log-transformed
# counts, one entry per count column (columns from position 9 onwards).
nrm = []
lognrm = []
for i in range(9, table.shape[1]):
    vals = table.iloc[:, i].values
    p_nrm = stats.shapiro(vals).pvalue
    # log(0) is -inf, which makes the test on the log-transformed values
    # meaningless, so only strictly positive counts are log-transformed.
    vals_pos = vals[vals > 0]
    # Shapiro-Wilk needs at least three observations
    if vals_pos.size >= 3:
        p_lognrm = stats.shapiro(np.log(vals_pos)).pvalue
    else:
        p_lognrm = np.nan
    nrm.append(p_nrm)
    lognrm.append(p_lognrm)
    print(p_nrm, p_lognrm)


^ Basically not normal, sometimes maybe lognormal

Are there any KOs with no reads?

In [23]:
for i in range(9,table.shape[1]):
    if not table.iloc[:,i].sum():
        print(table.columns[i])

## Dimensional reduction

Functions

In [255]:
def general_plot(
    xlabel="", ylabel="", ft=12, dims=(5, 3), col="k", lw=1, pad=0, tr_spines=True
):
    """Create a single-axes figure with uniform spine, tick and label styling.

    Parameters
    ----------
    xlabel, ylabel : str
        Axis labels.
    ft : number
        Font size used for tick labels and axis labels.
    dims : sequence
        Figure size; ``dims[0]`` is the width and ``dims[1]`` the height.
    col : color
        Color applied to spines, ticks, tick labels and axis labels.
    lw : number
        Line width of all four spines.
    pad : number
        Padding passed to the figure's tight layout.
    tr_spines : bool
        If True the top and right spines are kept and colored with ``col``;
        if False they are hidden.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = plt.subplots(figsize=(dims[0], dims[1]), tight_layout={"pad": pad})

    for spine in ax.spines.values():
        spine.set_linewidth(lw)

    # Top/right spines are either recolored or removed entirely
    for side in ("top", "right"):
        if tr_spines:
            ax.spines[side].set_color(col)
        else:
            ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_color(col)

    ax.tick_params(direction="in", labelsize=ft, color=col, labelcolor=col)
    ax.set_xlabel(xlabel, fontsize=ft, color=col)
    ax.set_ylabel(ylabel, fontsize=ft, color=col)
    # Fully transparent axes background
    ax.patch.set_alpha(0)
    return (fig, ax)


def plot_umap(
    embedding,
    figsize=(10, 10),
    markersize=10,
    alpha=0.5,
    colors="k",
    xticks=(),
    yticks=(),
    markerstyle='o'
):
    """Scatter-plot the first two columns of an embedding.

    Parameters
    ----------
    embedding : 2-D array
        Indexed as ``embedding[:, 0]`` (x) and ``embedding[:, 1]`` (y).
    figsize : sequence
        Figure size forwarded to ``general_plot`` as ``dims``.
    markersize, alpha
        Forwarded to ``Axes.scatter`` as ``s`` and ``alpha``.
    colors : color or sequence of colors
        Either one color for every point or one color per point.
    xticks, yticks : sequence
        Tick positions; an empty sequence leaves the default ticks.
    markerstyle : str or sequence of markers
        A single marker for all points, or one marker per point.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = general_plot(dims=figsize)
    xs = embedding[:, 0]
    ys = embedding[:, 1]
    if isinstance(markerstyle, str):
        ax.scatter(
            xs,
            ys,
            s=markersize,
            alpha=alpha,
            c=colors,
            edgecolors="none",
            marker=markerstyle
        )
    else:
        # A single color string (e.g. the default "k") must be repeated for
        # every point; zipping over the string itself would iterate its
        # characters and silently drop all but the first few points.
        if isinstance(colors, str):
            colors = [colors] * len(xs)
        # scatter() takes one marker per call, so points are drawn one by one
        for e0, e1, c, m in zip(xs, ys, colors, markerstyle):
            ax.scatter(
                e0,
                e1,
                s=markersize,
                alpha=alpha,
                # Wrapped in a list so an RGB(A) tuple is read as one color
                # for this one point rather than as values to be color-mapped
                c=[c],
                edgecolors="none",
                marker=m
            )
    ax.set_aspect("equal")
    if len(xticks) > 0:
        ax.set_xticks(xticks)
    if len(yticks) > 0:
        ax.set_yticks(yticks)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    return fig, ax


def get_colors_from_stringlist(stringlist, order="", cmap_name="viridis"):
    """Map every label in ``stringlist`` to a color sampled from a colormap.

    Parameters
    ----------
    stringlist : iterable of hashable
        Labels to color (one entry per point).
    order : sequence, optional
        Labels in the order in which colors are assigned. When empty (the
        default) or None, the distinct labels of ``stringlist`` are used in
        order of first appearance, so the assignment is the same on every run.
    cmap_name : str
        Name passed to ``plt.get_cmap``.

    Returns
    -------
    tuple
        ``(colors, dict_str_col)``: the color of each entry of ``stringlist``
        and the label -> color dictionary.

    Raises
    ------
    KeyError
        If ``stringlist`` contains a label that is missing from ``order``.
    """
    # len() instead of truthiness so that numpy arrays are accepted as order
    if order is None or len(order) == 0:
        # dict.fromkeys de-duplicates while keeping first-appearance order;
        # a plain set() would give a different order from run to run
        order = list(dict.fromkeys(stringlist))
    n_strings = len(order)
    cmap = plt.get_cmap(cmap_name)
    # The i-th label gets the colormap evaluated at i / n_strings
    colors = [cmap(i / n_strings) for i in range(n_strings)]
    dict_str_col = dict(zip(order, colors))
    return [dict_str_col[s] for s in stringlist], dict_str_col


def plot_legend(color_dict, figsize=(2, 4)):
    """Draw a legend with one colored patch per entry of ``color_dict``.

    Parameters
    ----------
    color_dict : dict
        Maps a label to the color of its legend patch.
    figsize : sequence
        Figure size forwarded to ``general_plot`` as ``dims``.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = general_plot(dims=figsize)
    # One manually built handle per label, in dictionary order
    handles = [
        mpatches.Patch(color=patch_color, label=label)
        for label, patch_color in color_dict.items()
    ]
    ax.legend(handles=handles)
    return fig, ax


def taxon_legend(
    taxon_names,
    taxon_colors,
    label_color="k",
    taxon_counts=[],
    text_shift_vh=(6, 0.15),
    ft=20,
    dims=(6, 10),
    lw=2,
    ylabel="Genus",
):
    """Draw a legend made of equal-length colored bars, one per name.

    Parameters
    ----------
    taxon_names : sequence
        Labels shown as y tick labels, first entry at the top.
    taxon_colors : sequence of colors
        Bar colors, passed to ``Axes.barh``.
    label_color : color
        Color of the axis label and ticks (forwarded to ``general_plot``).
    taxon_counts : sequence, optional
        If non-empty, each value is written as text on its bar.
    text_shift_vh : sequence
        ``text_shift_vh[0]`` is subtracted from the bar length to get the x
        position of the count text; ``text_shift_vh[1]`` is added to the row.
    ft, dims, lw, ylabel
        Forwarded to ``general_plot``; ``ft`` is also the count font size.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = general_plot(ylabel=ylabel, dims=dims, col=label_color, lw=lw, ft=ft)

    n_taxa = len(taxon_names)
    rows = np.arange(n_taxa)
    # Every bar has the same fixed length of 10
    bar_lengths = np.full(n_taxa, 10)
    ax.barh(rows, bar_lengths, align="center", color=taxon_colors)

    if len(taxon_counts) > 0:
        # Write the count of each entry on top of its bar
        for row, (length, count) in enumerate(zip(bar_lengths, taxon_counts)):
            ax.text(
                length - text_shift_vh[0],
                row + text_shift_vh[1],
                str(count),
                color="k",
                fontsize=ft,
            )

    # Hide the axes frame
    for side in ax.spines:
        ax.spines[side].set_visible(False)

    # Names go on the y axis; the x axis carries no information
    ax.set_yticks(rows)
    ax.set_xticks([])
    ax.set_yticklabels(taxon_names)
    # Flip the axis so the first name ends up at the top
    ax.invert_yaxis()
    return (fig, ax)


def marker_legend(
    names,
    markers,
    label_color="k",
    text_shift_hv=(0.5, 0),
    markersize=50,
    ft=20,
    dims=(3, 3),
    lw=2,
    ylabel="Genus",
):
    """Draw a legend that pairs marker shapes with names.

    One marker is drawn per (name, marker) pair at x = 0 on consecutive
    integer rows, and the names are used as the y tick labels.

    Parameters
    ----------
    names : sequence
        Labels, one per marker.
    markers : sequence
        Marker styles accepted by ``Axes.scatter``.
    label_color : color
        Color of the markers, axis label and ticks.
    text_shift_hv : sequence
        Not used by this function; kept so existing calls that pass it
        continue to work.
    markersize : number
        Marker area passed to ``Axes.scatter`` as ``s``.
    ft, dims, lw, ylabel
        Forwarded to ``general_plot``.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = general_plot(ylabel=ylabel, dims=dims, col=label_color, lw=lw, ft=ft)
    # zip() stops at the shorter input, so only complete pairs are drawn and
    # labelled; an empty input simply gives an empty legend.
    pairs = list(zip(names, markers))
    n_rows = len(pairs)
    for row, (_, marker) in enumerate(pairs):
        ax.scatter(0, row, marker=marker, c=label_color, s=markersize)
    # Remove the boundaries
    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, n_rows])

    # Add the names to the y axis, one tick per drawn marker
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticks([])
    ax.set_yticklabels([name for name, _ in pairs])
    # The y axis is not inverted, so row 0 (the first name) is at the bottom
    return fig, ax

### Compare assemblies

In [448]:
# UMAP is stochastic: fix the seed so the embedding (and every plot derived
# from it) is the same on each run. This reducer object is reused by the
# following per-assembly cells, which refit it on their own subset.
reducer = umap.UMAP(n_neighbors=11, metric='cosine', random_state=42)

# Count columns start at position 9
table_vals = table.iloc[:, 9:].values

embedding = reducer.fit_transform(table_vals)
embedding.shape

In [449]:
plot_umap(embedding=embedding)

In [450]:



stringlist = table["assembly"].values
ass_order = [
    "D1PA",
    "G1PA",
    "G2PA",
    "G3PA",
    "G3.UW.PA",
    "G3PA.diel",
    "G1NS",
    "G2NS",
    "G3.UW.NS",
    "G5",
]
c, dict_ass_col = get_colors_from_stringlist(
    stringlist=stringlist,
    order=ass_order,
    cmap_name='tab10'
)

plot_umap(
    embedding=embedding, 
    colors=c
)
plt.show()
plt.close()

colors = [dict_ass_col[n] for n in ass_order]
taxon_legend(ass_order, colors, dims=(1,2), ylabel='Assembly', ft=8)

### Compare timepoints in D1 Diel

In [451]:
table_vals = table.iloc[:, 9:][table.assembly == 'D1PA'].values

embedding = reducer.fit_transform(table_vals)
embedding.shape

In [452]:

plot_umap(embedding=embedding)

In [453]:
stringlist = table.loc[table.assembly == 'D1PA', "timepoint"].values
order = [
    '200',
    '600',
    '1000',
    '1400',
    '1800',
    '2200'
]
c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist,
    order=order,
    cmap_name='tab10'
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=20
)
plt.show()
plt.close()

colors = [dict_time_col[t] for t in order]
taxon_legend(order, colors, dims=(0.75, 1), ylabel="Time", ft=8)

### Compare timepoints in G3 Diel

In [454]:
bool_ass = table.assembly == "G3PA.diel"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get timepoints

In [455]:
samples = table.loc[bool_ass, 'sample'].values
samples[:10]

Load metadata table

In [456]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_diel_pa_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'
G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample timepoint dict

In [457]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    sample = row.Alias1.split(' ')[0]
    dict_tp_sample[row.Datetime] = sample


Sort datetime

In [458]:
# One-column frame holding the Datetime strings of the metadata table
df_dt = G3PA_diel_metadata.Datetime.to_frame()
# "day": the text before the first space of each Datetime string
df_dt["day"] = [
    dt.split(' ')[0] for dt in df_dt.Datetime.values
]
# "hour": the first run of digits that follows whitespace and precedes a colon
df_dt["hour"] = [
    int(re.search(r"(?<=\s)\d+(?=\:)", dt)[0]) for dt in df_dt.Datetime.values
]
# NOTE(review): "day" is sorted as a string, which is only chronological if
# the date text sorts lexicographically (e.g. zero-padded ISO dates) - confirm
# against the Datetime format in sample_metadata.csv.
df_dt = df_dt.sort_values(by=['day','hour'], ascending=True)
# unique() keeps the sorted order of first appearance
datetime_sort = df_dt.Datetime.unique()
datetime_sort

Color umap by timepoint

In [459]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="tab20"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = datetime_sort
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 5), ylabel="Time", ft=8)

### Compare Latitudes in G1PA

In [460]:
bool_ass = table.assembly == "G1PA"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [461]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [462]:
dir_diel = '../repo-armbrust-metat/gradients1/g1_station_pa_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [463]:
# Peek at the alias column of the metadata table loaded above, instead of
# reading a loop variable left over from an earlier cell
G3PA_diel_metadata.Alias2.head()

In [464]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        sample = alias.split('_')[0]
        dict_tp_sample[row.Latitude] = sample
dict_tp_sample

Sort latitudes

In [465]:
datetime_sort = sorted(list(dict_tp_sample.keys()))
datetime_sort

Color umap by latitude

In [466]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [467]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [468]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [469]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Compare Latitudes in G1NS

In [470]:
bool_ass = table.assembly == "G1NS"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [471]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [472]:
dir_diel = '../repo-armbrust-metat/gradients1/g1_station_ns_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [473]:
# Peek at the alias column of the metadata table loaded above, instead of
# reading a loop variable left over from an earlier cell
G3PA_diel_metadata.Alias2.head()

In [474]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        sample = alias.split('_')[0]
        dict_tp_sample[row.Latitude] = sample
dict_tp_sample

Sort latitudes

In [475]:
datetime_sort = sorted(list(dict_tp_sample.keys()))
datetime_sort

Color umap by latitude

In [476]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [477]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [478]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [479]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Compare Latitudes in G2PA

In [480]:
bool_ass = table.assembly == "G2PA"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [481]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [482]:
dir_diel = '../repo-armbrust-metat/gradients2/g2_station_pa_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [483]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        if not np.isnan(row.Latitude):
            sample = alias.split('.')[1]
            dict_tp_sample[row.Latitude] = sample
dict_tp_sample

Sort latitudes

In [484]:
datetime_sort = sorted(list(dict_tp_sample.keys()))
datetime_sort

Color umap by latitude

In [485]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [486]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [487]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [488]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Compare Latitudes in G2NS

In [489]:
bool_ass = table.assembly == "G2NS"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [490]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [491]:
dir_diel = '../repo-armbrust-metat/gradients2/g2_station_ns_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [492]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        if not np.isnan(row.Latitude):
            sample = alias.split('.')[0]
            dict_tp_sample[row.Latitude] = sample
dict_tp_sample

Sort latitudes

In [493]:
datetime_sort = sorted(list(dict_tp_sample.keys()))
datetime_sort

Color umap by latitude

In [494]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [495]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [496]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [497]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Compare Latitudes in G3PA

In [498]:
bool_ass = table.assembly == "G3PA"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [499]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [500]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_pa_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [501]:
dict_tp_sample = defaultdict(list)
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        sample = alias
        dict_tp_sample[row.Latitude].append(sample)
dict_tp_sample

In [502]:
dict_sample_lat = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias2
    if isinstance(alias, str):
        if not np.isnan(row.Latitude):
            sample = alias
            dict_sample_lat[sample] = row.Latitude
dict_sample_lat

Sort latitudes

In [503]:
datetime_sort = sorted(list(set(dict_sample_lat.values())))
datetime_sort

Color umap by latitude

In [504]:
stringlist = table.loc[bool_ass, "sample"].values

# dict_tp_sample maps a latitude to a list of samples, so flatten those lists
# into one list of samples ordered by increasing latitude.
samples_sort = [s for tp in datetime_sort for s in dict_tp_sample[tp]]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

# Colors are assigned per sample, so the legend needs one row per sample too,
# labelled with that sample's latitude. Labelling by unique latitude would
# give fewer labels than colors and misalign them.
names = [round(dict_sample_lat[s], 1) for s in samples_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [505]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [506]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [507]:
stringlist = table.loc[bool_ass, "sample"].values

# dict_tp_sample maps a latitude to a *list* of samples here; the lists must
# be flattened, otherwise the lists themselves would be used as (unhashable)
# dictionary keys when the colors are built.
samples_sort = [s for tp in datetime_sort for s in dict_tp_sample[tp]]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

# One legend row per sample (the unit that is colored), labelled with that
# sample's latitude, so labels and colors have the same length.
names = [round(dict_sample_lat[s], 1) for s in samples_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Compare Latitudes in G3NS

In [508]:
bool_ass = table.assembly == "G3.UW.NS"

table_vals_sub = table.iloc[:, 9:][bool_ass].values

embedding = reducer.fit_transform(table_vals_sub)
print(embedding.shape)
plot_umap(embedding=embedding, alpha=1, markersize=20)

Get latitudes

In [509]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [510]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_ns_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [511]:
dict_tp_sample = {}
for i, row in G3PA_diel_metadata.iterrows():
    alias = row.Alias1
    if isinstance(alias, str):
        if not np.isnan(row.Latitude):
            sample = alias.split(' ')[0]
            dict_tp_sample[row.Latitude] = sample
dict_tp_sample

Sort latitudes

In [512]:
datetime_sort = sorted(list(dict_tp_sample.keys()))
datetime_sort

Color umap by latitude

In [513]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

#### Separate by size fraction

In [514]:
sizes = table.loc[bool_ass, 'size'].unique()
sizes

In [515]:
markers = ['o','x']
dict_size_marker = dict(zip(sizes, markers))

sizes_all = table.loc[bool_ass, 'size'].values

markerstyle = [dict_size_marker[s] for s in sizes_all]

In [516]:
stringlist = table.loc[bool_ass, "sample"].values

samples_sort = [dict_tp_sample[tp] for tp in datetime_sort]

c, dict_time_col = get_colors_from_stringlist(
    stringlist=stringlist, order=samples_sort, cmap_name="plasma_r"
)

plot_umap(
    embedding=embedding, 
    colors=c,
    alpha=1,
    markersize=40,
    markerstyle=markerstyle
)
plt.show()
plt.close()

names = [round(l, 1) for l in datetime_sort]
colors = [dict_time_col[s] for s in samples_sort]
taxon_legend(names, colors, dims=(1.5, 3), ylabel="Latitude", ft=8)

marker_legend(
    sizes,
    markers,
    ylabel="Size Fraction",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

## Compare Latitudes in G3PA AM vs PM

### 3um

Get G3PA

In [517]:
bool_ass = (table.assembly == "G3PA")

table_sub = table[bool_ass]
samples = table_sub['sample'].values
np.unique(samples).shape, np.unique(samples)

Load metadata table

In [518]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_pa_metat'
fn_metadata = f'{dir_diel}/sample_metadata.csv'

metadata = pd.read_csv(fn_metadata)
metadata.shape, metadata[:3]

Map alias to size and latitude

In [519]:
# Alias2 -> latitude and Alias2 -> filter size, one entry per metadata row
dict_sample_lat = {}
dict_sample_size = {}
for i, row in metadata.iterrows():
    sample = row.Alias2
    # An empty CSV field is read as NaN, and NaN is truthy, so missing
    # aliases have to be excluded explicitly before the emptiness check.
    if pd.notna(sample) and sample:
        dict_sample_lat[sample] = row.Latitude
        dict_sample_size[sample] = row.Filter

dict_sample_lat, dict_sample_size

Get size 3um

In [520]:
bool_size = [True if dict_sample_size[s] == 3 else False for s in samples]

table_sub_sub1 = table_sub[bool_size]
samples_sub_sub1 = table_sub_sub1['sample'].values

table_sub_sub1.shape, samples_sub_sub1

Get latitudes

In [521]:
latitudes1 = [dict_sample_lat[s] for s in samples_sub_sub1]
latitudes1

Get G3PA PM

In [522]:
bool_ass = (table.assembly == "G3.UW.PA")

table_sub = table[bool_ass]
samples = table_sub['sample'].values
np.unique(samples).shape, np.unique(samples), table_sub[:5]

Filter by size

In [523]:
table_sub_sub2 = table_sub[table_sub['size'] == '3um']
samples_sub_sub2 = table_sub_sub2['sample'].values

table_sub_sub2.shape, samples_sub_sub2

Load metadata table

In [524]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_pa_pm_metat'
fn_metadata = f'{dir_diel}/sample_metadata.csv'

metadata = pd.read_csv(fn_metadata)
metadata.shape, metadata[:3]

Map alias to latitude

In [525]:
dict_sample_lat = {}
for i, row in metadata.iterrows():
    sample = row.Alias1.split(' ')[0]
    if sample:
        dict_sample_lat[sample] = row.Latitude

dict_sample_lat

Get latitudes

In [526]:
latitudes2 = [dict_sample_lat[s] for s in samples_sub_sub2]
latitudes2

Get colors

In [527]:
latitudes = latitudes1 + latitudes2

cmap_name = 'plasma_r'
cmap = plt.get_cmap(cmap_name)
mn, mx = np.min(latitudes), np.max(latitudes)
lat_norm = (np.array(latitudes) - mn) / (mx - mn)
colors = [cmap(l) for l in lat_norm]
len(colors)

Get time of day as markerstyle

In [528]:
markers = ['o']*len(latitudes1) + ['x']*len(latitudes2)
dict_time_marker = {'AM':'o','PM':'x'}

Get umap

In [531]:
# Stack the count columns (position 9 onwards) of both subsets, first subset
# on top, so the rows line up with `latitudes` and `markers`
vals1 = table_sub_sub1.iloc[:, 9:].values
vals2 = table_sub_sub2.iloc[:, 9:].values

vals = np.vstack([vals1, vals2])

# Fixed seed so the stochastic UMAP layout is reproducible between runs
reducer = umap.UMAP(n_neighbors=11, metric='cosine', random_state=42)
embedding = reducer.fit_transform(vals)

Plot UMAP

In [532]:
gradient = np.linspace(0,1,256)
np.vstack([gradient]*2).shape

In [533]:
plot_umap(
    embedding,
    markersize=50,
    alpha=1,
    colors=colors,
    markerstyle=markers
)


names_mlegend = list(dict_time_marker.keys())
markers_mlegend = list(dict_time_marker.values())
marker_legend(
    names_mlegend,
    markers_mlegend,
    ylabel="Experiment",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)


# Color-bar legend for the latitude color scale
horizontal = False
scalevals = latitudes
dims=[1,1.5]
ft=8

# 256 evenly spaced colormap positions; pixel indices therefore run 0..255
gradient = np.linspace(0,1,256)
fig, ax = general_plot(dims=dims, ft=ft)
for i in ax.spines:
    ax.spines[i].set_visible(False)
if horizontal:
    gradient = np.vstack([gradient]*2)
    # Ticks at the first and last pixel, labelled with the range of the
    # plotted scale values (not of the count matrix `vals`)
    ax.set_xticks([0,255])
    ax.set_xticklabels([np.min(scalevals),np.max(scalevals)])
    ax.set_yticks([])
    ax.set_xlabel('Latitude')
else:
    gradient = np.hstack([gradient[:,None]]*2)
    ax.set_yticks([0,255])
    ax.set_yticklabels([np.min(scalevals),np.max(scalevals)])
    ax.set_xticks([])
    ax.set_ylabel('Latitude')
                    
ax.imshow(gradient, cmap=cmap_name)
ax.invert_xaxis()
# imshow puts row 0 at the top; inverting moves the minimum to the bottom
ax.invert_yaxis()




### 0.2um

Get G3PA

In [534]:
bool_ass = (table.assembly == "G3PA")

table_sub = table[bool_ass]
samples = table_sub['sample'].values
np.unique(samples).shape, np.unique(samples)

Load metadata table

In [535]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_pa_metat'
fn_metadata = f'{dir_diel}/sample_metadata.csv'

metadata = pd.read_csv(fn_metadata)
metadata.shape, metadata[:3]

Map alias to size and latitude

In [536]:
# Alias2 -> latitude and Alias2 -> filter size, one entry per metadata row
dict_sample_lat = {}
dict_sample_size = {}
for i, row in metadata.iterrows():
    sample = row.Alias2
    # An empty CSV field is read as NaN, and NaN is truthy, so missing
    # aliases have to be excluded explicitly before the emptiness check.
    if pd.notna(sample) and sample:
        dict_sample_lat[sample] = row.Latitude
        dict_sample_size[sample] = row.Filter

dict_sample_lat, dict_sample_size

Get size 0.2um

In [537]:
bool_size = [True if dict_sample_size[s] == 0.2 else False for s in samples]

table_sub_sub1 = table_sub[bool_size]
samples_sub_sub1 = table_sub_sub1['sample'].values

table_sub_sub1.shape, samples_sub_sub1

Get latitudes

In [538]:
latitudes1 = [dict_sample_lat[s] for s in samples_sub_sub1]
latitudes1

Get G3PA PM

In [539]:
bool_ass = (table.assembly == "G3.UW.PA")

table_sub = table[bool_ass]
samples = table_sub['sample'].values
np.unique(samples).shape, np.unique(samples), table_sub[:5]

Filter by size

In [540]:
table_sub_sub2 = table_sub[table_sub['size'] == '0.2um']
samples_sub_sub2 = table_sub_sub2['sample'].values

table_sub_sub2.shape, samples_sub_sub2

Load metadata table

In [541]:
dir_diel = '../repo-armbrust-metat/gradients3/g3_uw_pa_pm_metat'
fn_metadata = f'{dir_diel}/sample_metadata.csv'

metadata = pd.read_csv(fn_metadata)
metadata.shape, metadata[:3]

Map alias to latitude

In [542]:
dict_sample_lat = {}
for i, row in metadata.iterrows():
    sample = row.Alias1.split(' ')[0]
    if sample:
        dict_sample_lat[sample] = row.Latitude

dict_sample_lat

Get latitudes

In [543]:
latitudes2 = [dict_sample_lat[s] for s in samples_sub_sub2]
latitudes2

Get colors

In [544]:
latitudes = latitudes1 + latitudes2

cmap_name = 'plasma_r'
cmap = plt.get_cmap(cmap_name)
mn, mx = np.min(latitudes), np.max(latitudes)
lat_norm = (np.array(latitudes) - mn) / (mx - mn)
colors = [cmap(l) for l in lat_norm]
len(colors)

Get time of day as markerstyle

In [545]:
markers = ['o']*len(latitudes1) + ['x']*len(latitudes2)
dict_time_marker = {'AM':'o','PM':'x'}

Get umap

In [546]:
# Stack the count columns (position 9 onwards) of both subsets, first subset
# on top, so the rows line up with `latitudes` and `markers`
vals1 = table_sub_sub1.iloc[:, 9:].values
vals2 = table_sub_sub2.iloc[:, 9:].values

vals = np.vstack([vals1, vals2])

# Fixed seed so the stochastic UMAP layout is reproducible between runs
reducer = umap.UMAP(n_neighbors=11, metric='cosine', random_state=42)
embedding = reducer.fit_transform(vals)

Plot UMAP

In [547]:
plot_umap(
    embedding,
    markersize=50,
    alpha=1,
    colors=colors,
    markerstyle=markers
)


names_mlegend = list(dict_time_marker.keys())
markers_mlegend = list(dict_time_marker.values())
marker_legend(
    names_mlegend,
    markers_mlegend,
    ylabel="Experiment",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

## G5

In [548]:
bool_ass = table.assembly == "G5"

table_sub = table[bool_ass]
samples_sub = table_sub['sample']
samples_sub

Get mix and RR

In [549]:
bool_mix = (table_sub['sample'] == 'Mix1') | (table_sub['sample'] == 'Mix2')
bool_RR = (table_sub['sample'] == 'RR1') | (table_sub['sample'] == 'RR2')

table_RR = table_sub[bool_RR]
table_mix = table_sub[bool_mix]

table_RR.shape, table_mix.shape


### RR

Get ammendments as color

In [550]:
ammendments = table_RR['ammendment'].values
colors, dict_amm_col = get_colors_from_stringlist(ammendments, order="", cmap_name="tab10")


Get RR location as marker

In [551]:
rr_loc = table_RR['sample'].values
dict_rr_mark = {'RR1':'o','RR2':'x'}
markers = [dict_rr_mark[r] for r in rr_loc]

Get Umap

In [554]:
# Count columns start at position 9
vals = table_RR.iloc[:, 9:].values

# Fixed seed so the stochastic UMAP layout is reproducible between runs.
# NOTE(review): unlike the other sections, no metric='cosine' is set here, so
# UMAP's default metric is used - confirm this is intended.
reducer = umap.UMAP(n_neighbors=11, random_state=42)
embedding = reducer.fit_transform(vals)

Plot UMAP

In [555]:
plot_umap(
    embedding,
    figsize=(5,5),
    markersize=50,
    alpha=1,
    colors=colors,
    markerstyle=markers
)


names_legend = list(dict_amm_col.keys())
colors_legend = list(dict_amm_col.values())
taxon_legend(names_legend, colors_legend, dims=(1, 2), ylabel="Ammendment", ft=8)

names_mlegend = list(dict_rr_mark.keys())
markers_mlegend = list(dict_rr_mark.values())
marker_legend(
    names_mlegend,
    markers_mlegend,
    ylabel="Experiment",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

### Mix

Get ammendments as color

In [556]:
ammendments = table_mix['ammendment'].values
colors, dict_amm_col = get_colors_from_stringlist(ammendments, order="", cmap_name="tab10")


Get Mix location as marker

In [557]:
rr_loc = table_mix['sample'].values
dict_rr_mark = {'Mix1':'o','Mix2':'x'}
markers = [dict_rr_mark[r] for r in rr_loc]

Get Umap

In [560]:
# Count columns start at position 9
vals = table_mix.iloc[:, 9:].values

# Fixed seed so the stochastic UMAP layout is reproducible between runs.
# NOTE(review): unlike the other sections, no metric='cosine' is set here, so
# UMAP's default metric is used - confirm this is intended.
reducer = umap.UMAP(n_neighbors=11, random_state=42)
embedding = reducer.fit_transform(vals)

Plot UMAP

In [561]:
plot_umap(
    embedding,
    figsize=(5,5),
    markersize=50,
    alpha=1,
    colors=colors,
    markerstyle=markers
)


names_legend = list(dict_amm_col.keys())
colors_legend = list(dict_amm_col.values())
taxon_legend(names_legend, colors_legend, dims=(1, 2), ylabel="Ammendment", ft=8)

names_mlegend = list(dict_rr_mark.keys())
markers_mlegend = list(dict_rr_mark.values())
marker_legend(
    names_mlegend,
    markers_mlegend,
    ylabel="Experiment",
    dims=(1, 1),
    ft=8,
    markersize=50,
    text_shift_hv=(1, 0),
)

## KOs as points

Invert table

Take log

Spearman correlation matrix to distance metric