# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, NMF
from skbio.stats.ordination import pcoa
from skbio.stats.distance import DissimilarityMatrix
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl
from scipy.spatial.distance import pdist, squareform

from scripts.lib.plotting import load_style
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering


# Load the 'paper' plotting style and pull out the entries used in this notebook.
loaded_style = load_style('paper')
savefig, fullwidth, halfwidth = (
    loaded_style[key] for key in ('savefig', 'fullwidth', 'halfwidth')
)

# SQLite connection used by the pd.read_sql queries below.
con = sqlite3.connect('data/core.muri2.2.denorm.db')
#sns.set_context('notebook')

def idxwhere(x):
    """Return the index labels of the boolean Series ``x`` where it is True."""
    selected = x.loc[x]
    return selected.index

def ident(x):
    """Identity function: hand back the argument unchanged."""
    return x

## Metadata

In [None]:
# Genome subgroups that are included in the ordination plots.
subgroups = ['ormerod-plant', 'ormerod-host', 'ormerod-starch', 'public', 'here']

# Point colour per subgroup.
color_map = {
    'ormerod-starch': 'blue',
    'ormerod-host': 'purple',
    'ormerod-plant': 'green',
    'public': 'grey',
    'here': 'black',
}

# Point size and marker per subgroup: 'here' gets a larger triangle,
# every other subgroup a small circle.
size_map = {name: (20 if name == 'here' else 10) for name in subgroups}
marker_map = {name: ('^' if name == 'here' else 'o') for name in subgroups}

#mpl.rcParams['text.usetex'] = True

In [None]:
# Genome metadata: `genome` joined with `checkm` and `genome_group`,
# restricted to genome group 'muri2' and indexed by genome_id.
mag = pd.read_sql(
    sql="""
SELECT *
FROM genome
JOIN checkm USING (genome_id)
JOIN genome_group USING (genome_id)
WHERE genome_group_id = 'muri2'
    """,
    con=con,
    index_col='genome_id',
)

# Two-letter abbreviations used as plot labels for named reference genomes.
rename_map = {
    'Barnesiella_viscericola_DSM_18177': 'Bv',
    'Bacteroides_ovatus_ATCC_8483': 'Bo',
    'Bacteroides_thetaiotaomicron_VPI5482': 'Bt',
    'Porphyromonas_gingivalis_ATCC_33277': 'Pg',
    'Homeothermus_arabinoxylanisolvens': 'Ha',
    'Muribaculum_intestinale_DSM_28989T': 'Mi',
    'Duncaniella_muris_DSM_103720T': 'Dm',
    'Duncaniella_freteri_DSM_108168T': 'Df',
    'Duncaniella_dubosii_DSM_107170T': 'Dd',
    'Paramuribaculum_intestinale_DSM_100749T': 'Pi',
    'Amulumruptor_caecigallinarius': 'Ac',
}

# Genome ids listed for italic rendering (presumably species names; not
# referenced elsewhere in the visible part of this notebook).
italic_list = [
    'Barnesiella_viscericola_DSM_18177',
    'Bacteroides_ovatus_ATCC_8483',
    'Bacteroides_thetaiotaomicron_VPI5482',
    'Porphyromonas_gingivalis_ATCC_33277',
    'Homeothermus_arabinoxylanisolvens',
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]

def rename_mag(genome_id):
    """Return the short label for ``genome_id`` from ``rename_map``.

    Ids without an entry in ``rename_map`` are returned unchanged.
    """
    return rename_map.get(genome_id, genome_id)

In [None]:
# Full `opf_to_architecture` table indexed by opf_id; squeeze() collapses the
# result to a Series when only one data column remains.
opf_to_architecture = pd.read_sql(
    sql="""
SELECT * FROM opf_to_architecture;
    """,
    con=con,
    index_col='opf_id',
).squeeze()

# Description text for each CAZy domain, indexed by func_id (= domain_id).
cazy_function = pd.read_sql(
    sql="""
SELECT domain_id AS func_id, cazy_description AS description
FROM cazy_domain
                       """,
    index_col=['func_id'],
    con=con,
)

# PROTOTYPE

In [None]:
def plot_pca_ordination(d, meta, xy=('PC1', 'PC2'),
                    marker=None, color=None, size=None, annot=None,
                    derep_kwargs=dict(linkage='complete', distance_threshold=0.0),
                    annotation_list=[], ax=None,
                    scatter_kwargs=None,
                    annotate_kwargs=None):
    """Scatter-plot a PCA ordination of the rows of ``d``.

    The PCA is fitted on a dereplicated copy of ``d``: non-empty rows are
    clustered on their pairwise correlation distance and every cluster is
    replaced by its mean row, while all all-zero rows form one extra group.
    Every original row of ``d`` is then projected onto the fitted components.

    Parameters
    ----------
    d : pandas.DataFrame
        Feature table, one row per item to plot.
    meta : pandas.DataFrame
        Metadata joined onto the ordination by index.
    xy : tuple of str
        Names of the two components to plot (columns are named 'PC1', 'PC2', ...).
    marker, color, size, annot : callable, optional
        Each receives the joined ordination/metadata frame (they are handed to
        ``DataFrame.assign``) and returns the marker, colour, size or label
        text for every row. Defaults: 'o', 'black', 10 and '' (no label).
    derep_kwargs : dict
        Extra keyword arguments for ``AgglomerativeClustering``. The default
        dict is only unpacked, never mutated.
    annotation_list : sequence
        Accepted for signature parity with the other ordination helpers but
        not used here; labels come from ``annot``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    scatter_kwargs, annotate_kwargs : dict, optional
        Extra keyword arguments for ``ax.scatter`` / ``ax.annotate``.

    Returns
    -------
    (ax, fit, d0)
        The axes, the fitted PCA object and the plotted data frame.
    """
    x, y = xy
    if ax is None:
        _, ax = plt.subplots()
    if marker is None:
        marker = lambda m: 'o'
    if color is None:
        color = lambda m: 'black'
    if size is None:
        size = lambda m: 10
    if annot is None:
        annot = lambda m: ''
    if scatter_kwargs is None:
        scatter_kwargs = {}
    if annotate_kwargs is None:
        annotate_kwargs = {}

    # Fuzzy dereplication of genomes.
    # All-zero rows are left out of the clustering and grouped separately below.
    is_empty = d.sum(1) == 0
    d_noempty = d[~is_empty]

    dmat = pd.DataFrame(
        squareform(pdist(d_noempty, metric='correlation')),
        index=d_noempty.index,
        columns=d_noempty.index,
    )
    # NOTE: newer scikit-learn releases renamed `affinity` to `metric`; try the
    # new keyword first and fall back to the old one on older installations.
    try:
        clusterer = AgglomerativeClustering(
            n_clusters=None, metric='precomputed', **derep_kwargs)
    except TypeError:
        clusterer = AgglomerativeClustering(
            n_clusters=None, affinity='precomputed', **derep_kwargs)
    labels = clusterer.fit_predict(dmat)

    # Build the cluster labels in the row order of `d`. The groupby below is
    # positional, so the labels must line up with `d` row by row; empty rows
    # share the extra cluster -1.
    clust = np.full(len(d), -1)
    clust[(~is_empty).to_numpy()] = labels

    # Fit on the cluster means, then project every original row.
    d_fit = d.groupby(clust).mean()
    fit = PCA().fit(d_fit)
    ordin = pd.DataFrame(fit.transform(d), index=d.index)
    ordin.columns = [f'PC{i}' for i in ordin.columns + 1]
    perc_explained = pd.Series(fit.explained_variance_ratio_, index=ordin.columns)

    d0 = (
        ordin
        .join(meta)
        .assign(
            __color=color,
            __size=size,
            __marker=marker,
            __annot=annot,
        )
    )
    # One scatter call per marker style, since scatter takes a single marker.
    for marker_style, d1 in d0.groupby('__marker'):
        ax.scatter(
            x, y,
            data=d1,
            color='__color',
            s='__size',
            marker=marker_style,
            **scatter_kwargs,
            label='__nolegend__',
        )

    for _x, _y, label in zip(d0[x], d0[y], d0['__annot']):
        ax.annotate(label,
                    xy=(_x, _y),
                    va='center', ha='center', **annotate_kwargs)

    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
    return ax, fit, d0

def plot_nmf_ordination(d, meta, nfactors=2, xy=('PC1', 'PC2'),
                    marker=None, color=None, size=None,
                    derep_kwargs=dict(linkage='complete', distance_threshold=0.0),
                    annotation_list=[], ax=None,
                    scatter_kwargs=None,
                    annotate_kwargs=None):
    """Scatter-plot an NMF ordination of the rows of ``d``.

    Parameters
    ----------
    d : pandas.DataFrame
        Feature table, one row per item to plot; NMF is fitted on it directly.
    meta : pandas.DataFrame
        Metadata joined onto the ordination by index.
    nfactors : int
        Number of NMF components.
    xy : tuple of str
        Names of the two factors to plot (columns are named 'PC1', 'PC2', ...).
    marker, color, size : callable, optional
        Each is called with ``meta`` and returns the marker, colour or size
        for every row. Defaults: 'o', 'black' and 10.
    derep_kwargs : dict
        Accepted for signature parity with ``plot_pca_ordination`` but unused:
        the fit has always been done on ``d`` as given, without dereplication.
    annotation_list : sequence
        Index labels to annotate with their own id.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    scatter_kwargs, annotate_kwargs : dict, optional
        Extra keyword arguments for ``ax.scatter`` / ``ax.annotate``.

    Returns
    -------
    (ax, fit)
        The axes and the fitted NMF object.
    """
    x, y = xy
    if ax is None:
        _, ax = plt.subplots()
    if marker is None:
        marker = lambda m: 'o'
    if color is None:
        color = lambda m: 'black'
    if size is None:
        size = lambda m: 10
    if scatter_kwargs is None:
        scatter_kwargs = {}
    if annotate_kwargs is None:
        annotate_kwargs = {}

    # Perform transformation.
    fit = NMF(n_components=nfactors, max_iter=1000).fit(d)
    ordin = pd.DataFrame(fit.transform(d), index=d.index)
    ordin.columns = [f'PC{i}' for i in ordin.columns + 1]

    d0 = (
        ordin
        .join(meta)
        .assign(
            __color=color(meta),
            __size=size(meta),
            __marker=marker(meta),
        )
    )
    # One scatter call per marker style, since scatter takes a single marker.
    for marker_style, d1 in d0.groupby('__marker'):
        ax.scatter(
            x, y,
            data=d1,
            color='__color',
            s='__size',
            marker=marker_style,
            **scatter_kwargs,
            label='__nolegend__',
        )

    for genome_id in annotation_list:
        ax.annotate(genome_id,
                    xy=(ordin.loc[genome_id, x], ordin.loc[genome_id, y]),
                    va='center', ha='center', **annotate_kwargs)
    # NMF provides no explained-variance ratio, so the axes carry names only.
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    return ax, fit


def plot_pcoa_ordination(d, meta, nfactors=3, xy=('PC1', 'PC2'),
                    marker=None, color=None, size=None,
                    derep_kwargs=dict(linkage='complete', distance_threshold=0.0),
                    annotation_list=[], ax=None,
                    scatter_kwargs=None,
                    annotate_kwargs=None):
    """Scatter-plot a PCoA ordination of the rows of ``d``.

    All-zero rows are dropped, pairwise Jaccard distances are computed between
    the remaining rows and the result is ordinated with ``skbio`` ``pcoa``.

    Parameters
    ----------
    d : pandas.DataFrame
        Feature table, one row per item to plot.
    meta : pandas.DataFrame
        Metadata joined onto the ordination by index.
    nfactors : int
        Accepted for signature parity with the other helpers but unused.
    xy : tuple of str
        Names of the two axes to plot, as named in the PCoA sample table.
    marker, color, size : callable, optional
        Each is called with ``meta`` and returns the marker, colour or size
        for every row. Defaults: 'o', 'black' and 10.
    derep_kwargs : dict
        Accepted for signature parity with the other helpers but unused.
    annotation_list : sequence
        Index labels to annotate with their own id. Labels missing from the
        ordination (e.g. rows dropped for being all zero) are skipped.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    scatter_kwargs, annotate_kwargs : dict, optional
        Extra keyword arguments for ``ax.scatter`` / ``ax.annotate``.

    Returns
    -------
    (ax, fit)
        The axes and the PCoA result object.
    """
    x, y = xy
    if ax is None:
        _, ax = plt.subplots()
    if marker is None:
        marker = lambda m: 'o'
    if color is None:
        color = lambda m: 'black'
    if size is None:
        size = lambda m: 10
    if scatter_kwargs is None:
        scatter_kwargs = {}
    if annotate_kwargs is None:
        annotate_kwargs = {}

    # Drop entries with only zeros.
    d_nozeros = d.drop(idxwhere(d.sum(1) == 0))

    dmat = DissimilarityMatrix(pdist(d_nozeros,
                                     metric='jaccard'),
                               ids=d_nozeros.index)

    # Perform transformation.
    fit = pcoa(dmat)
    ordin = fit.samples
    perc_explained = fit.proportion_explained

    d0 = (
        ordin
        .join(meta)
        .assign(
            __color=color(meta),
            __size=size(meta),
            __marker=marker(meta),
        )
    )
    # One scatter call per marker style, since scatter takes a single marker.
    for marker_style, d1 in d0.groupby('__marker'):
        ax.scatter(
            x, y,
            data=d1,
            color='__color',
            s='__size',
            marker=marker_style,
            **scatter_kwargs,
            label='__nolegend__',
        )

    for genome_id in annotation_list:
        # Rows dropped above have no coordinates, so they cannot be labelled.
        if genome_id not in ordin.index:
            continue
        ax.annotate(genome_id,
                    xy=(ordin.loc[genome_id, x], ordin.loc[genome_id, y]),
                    va='center', ha='center', **annotate_kwargs)
    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
    return ax, fit

## GH Domains

### Raw

#### No other filters

In [None]:
# GH domain tallies per genome and CAZy family, as in Ormerod: any subfamily
# suffix after '_' is stripped, so e.g. GH13_1 and GH13_2 both count as GH13.
# Only hits with score > 5 are counted; no localization or PUL filter.
gh_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    gh_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    xy=('PC1', 'PC2'),
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported

In [None]:
# GH domain tallies per genome and CAZy family (subfamily suffix stripped),
# restricted to features whose localization is 'OM', 'IM' or 'PP' (exported).
gh_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    gh_export_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported and PUL localized

In [None]:
# GH domain tallies per genome and CAZy family (subfamily suffix stripped),
# restricted to features that are both exported (localization 'OM', 'IM' or
# 'PP') and listed in closest_PUL_susC with distance < 10000.
gh_pul_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the cube-root-transformed tallies (note: cbrt here, sqrt elsewhere).
plot_pca_ordination(
    gh_pul_export_count[mag['subgroup'].isin(subgroups)].apply(np.cbrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

### OPF clusters

#### No other filters

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH domain hit
# with score > 5; no localization or PUL filter.
opf_gh_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    opf_gh_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH domain hit
# with score > 5, restricted to exported features (localization 'OM', 'IM'
# or 'PP').
opf_gh_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    opf_gh_export_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported and PUL localized

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH domain hit
# with score > 5, restricted to features that are both exported (localization
# 'OM', 'IM' or 'PP') and listed in closest_PUL_susC with distance < 10000.
opf_gh_pul_export_count = (
    pd.read_sql(
        """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con, index_col=['genome_id', 'func_id']
        )
        .tally
        .unstack('func_id', fill_value=0)
        .reindex(mag.index)  # one row per genome in `mag`
        .fillna(0)  # genomes without any hit become all-zero rows
            )

# plot_pca_ordination returns (ax, fit, d0); only the first two are kept.
# Unpacking the full 3-tuple into two names raised ValueError.
ax, fit = plot_pca_ordination(
    opf_gh_pul_export_count.loc[mag.subgroup.isin(subgroups)].apply(np.sqrt),
    xy=('PC1', 'PC2'),
    meta=mag,
    color=lambda m: m.subgroup.map(color_map),
    size=lambda m: m.subgroup.map(size_map),
    annotation_list=idxwhere(mag.subgroup.isin(['here'])),
    scatter_kwargs=dict(alpha=0.75, linewidths=0),
    annotate_kwargs=dict(fontsize=8),
)[:2]

## GH+CBM Domains

### Raw

#### No other filters

In [None]:
# GH and CBM domain tallies per genome and CAZy family, as in Ormerod: any
# subfamily suffix after '_' is stripped (e.g. GH13_1 and GH13_2 -> GH13).
# Only hits with score > 5 are counted; no localization or PUL filter.
cazy_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    cazy_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported

In [None]:
# GH and CBM domain tallies per genome and CAZy family (subfamily suffix
# stripped), restricted to exported features (localization 'OM', 'IM' or 'PP').
cazy_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    cazy_export_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### In PUL

In [None]:
# GH and CBM domain tallies per genome and CAZy family (subfamily suffix
# stripped), restricted to features listed in closest_PUL_susC with
# distance < 10000; no localization filter.
cazy_pul_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    cazy_pul_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported and PUL localized

In [None]:
# GH and CBM domain tallies per genome and CAZy family (subfamily suffix
# stripped), restricted to features that are both exported (localization
# 'OM', 'IM' or 'PP') and listed in closest_PUL_susC with distance < 10000.
cazy_pul_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    cazy_pul_export_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    xy=('PC1', 'PC2'),
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

### OPF clusters

#### No other filters

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH or CBM domain
# hit with score > 5; no localization or PUL filter.
opf_cazy_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    opf_cazy_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH or CBM domain
# hit with score > 5, restricted to exported features (localization 'OM',
# 'IM' or 'PP').
opf_cazy_export_count = (
    pd.read_sql(
        sql="""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con,
        index_col=['genome_id', 'func_id'],
    )['tally']
    .unstack(level='func_id', fill_value=0)
    .reindex(index=mag.index)  # one row per genome in `mag`
    .fillna(value=0)  # genomes without any hit become all-zero rows
)

# PCA of the square-root-transformed tallies for the selected subgroups.
plot_pca_ordination(
    opf_cazy_export_count[mag['subgroup'].isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m['subgroup'].map(color_map),
    size=lambda m: m['subgroup'].map(size_map),
    annotation_list=idxwhere(mag['subgroup'].isin(['here'])),
    scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
    annotate_kwargs={'fontsize': 8},
)

#### Exported and PUL localized

In [None]:
# Tallies per genome and OPF (opf_id) of features carrying a GH or CBM domain
# hit with score > 5, restricted to features that are both exported
# (localization 'OM', 'IM' or 'PP') and listed in closest_PUL_susC with
# distance < 10000.
opf_cazy_pul_export_count = (
    pd.read_sql(
        """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con, index_col=['genome_id', 'func_id']
        )
        .tally
        .unstack('func_id', fill_value=0)
        .reindex(mag.index)  # one row per genome in `mag`
        .fillna(0)  # genomes without any hit become all-zero rows
            )

# plot_pca_ordination returns (ax, fit, d0); only the first two are kept.
# Unpacking the full 3-tuple into two names raised ValueError.
ax, fit = plot_pca_ordination(
    opf_cazy_pul_export_count.loc[mag.subgroup.isin(subgroups)].apply(np.sqrt),
    xy=('PC1', 'PC2'),
    meta=mag,
    color=lambda m: m.subgroup.map(color_map),
    size=lambda m: m.subgroup.map(size_map),
    annotation_list=idxwhere(mag.subgroup.isin(['here'])),
    scatter_kwargs=dict(alpha=0.75, linewidths=0),
    annotate_kwargs=dict(fontsize=8),
)[:2]

# Figure

In [None]:
# Reference genomes that receive a text label (their short name from
# rename_map) in every panel, in addition to all 'here' genomes.
label_list = [
    'Homeothermus_arabinoxylanisolvens',
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]

# One (panel title, count table, components to plot) entry per panel,
# top to bottom. Commented-out entries are alternative panels.
dd = [
    ('CAZy domains (all)', cazy_count, ('PC1', 'PC2')),
#     ('GH and CBM domains\n(with SP)', cazy_export_count, ('PC1', 'PC2')),
#     ('GH and CBM domains\n(in PUL)', cazy_pul_count, ('PC1', 'PC2')),
    ('CAZy domains (with SP, in PUL)', cazy_pul_export_count, ('PC1', 'PC2')),
#    ('OPFs containing CAZy domains (all)', opf_cazy_count, ('PC1', 'PC2')),
    ('OPFs containing CAZy domains (with SP)', opf_cazy_export_count, ('PC1', 'PC2')),
]

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(halfwidth, halfwidth * 2.5))
axs = np.asarray(axs)

for ax, (title, d, xy) in zip(axs.flatten(), dd):
    # Untransformed counts are plotted here (identity transform), with a
    # looser dereplication threshold than the function default.
    ax, pca, d0 = plot_pca_ordination(
        d[mag['subgroup'].isin(subgroups)].apply(lambda x: x),
        meta=mag,
        xy=xy,
        ax=ax,
        derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
        color=lambda m: m['subgroup'].map(color_map),
        size=lambda m: m['subgroup'].map(size_map),
        # Label text: short name where one exists, blank for every genome
        # that is neither in label_list nor in the 'here' subgroup.
        annot=lambda m: (
            m.index.to_series()
            .replace(rename_map)
            .where(
                m.index.to_series().isin(label_list) | m['subgroup'].isin(['here']),
                '',
            )
        ),
        scatter_kwargs={'alpha': 0.75, 'linewidths': 0},
        annotate_kwargs={'fontsize': 8},
    )
    ax.set_title(title)

# Empty scatters on the last panel serve only as legend handles.
for subgroup in ('public', 'ormerod-starch', 'ormerod-host', 'ormerod-plant', 'here'):
    ax.scatter(
        [], [],
        label=subgroup,
        marker=marker_map[subgroup],
        c=color_map[subgroup],
        s=size_map[subgroup],
    )
ax.legend(loc='upper right')
fig.tight_layout()
fig.savefig('fig/cazy_ordination.pdf')