# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, NMF
from skbio.stats.ordination import pcoa
from skbio.stats.distance import DissimilarityMatrix
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl
from scipy.spatial.distance import pdist, squareform

from scripts.lib.plotting import load_style
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering


# Load the shared 'paper' plotting style and pull out the entries that the
# rest of the notebook refers to by name.
loaded_style = load_style('paper')
savefig, fullwidth, halfwidth = (
    loaded_style[key] for key in ('savefig', 'fullwidth', 'halfwidth')
)

# Single SQLite connection reused by every query in this notebook.
con = sqlite3.connect('data/core.muri2.2.denorm.db')

def idxwhere(x):
    """Return the index labels at which the boolean Series *x* is True."""
    selected = x.loc[x]
    return selected.index

def ident(x):
    """Identity function: give back the argument unchanged."""
    unchanged = x
    return unchanged

In [None]:
def plot_pca_ordination(
    d, meta, xy=('PC1', 'PC2'),
    marker=None, color=None, size=None, annot=None, zorder=None, alpha=None,
    derep_kwargs=None,
    ax=None,
    scatter_kwargs=None,
    textoffset=0.0,
    annot_kwargs=None,
    show_feats=(), feat_scale=1.0,
    feat_arrow_kwargs=None,
    feat_label_kwargs=None,
):
    """Fit a PCA on dereplicated rows of *d* and scatter-plot every row.

    Rows of *d* are first clustered (agglomerative clustering on pairwise
    cosine distances); the PCA is fit on the per-cluster mean profiles so
    that groups of near-identical rows do not dominate the axes.  All
    original rows are then projected into that space and plotted.

    Parameters
    ----------
    d : pandas.DataFrame
        Observations in rows, features in columns.
    meta : pandas.DataFrame
        Per-observation metadata, joined onto the ordination by index.
    xy : tuple of str
        Names of the two components to plot (``'PC1'``, ``'PC2'``, ...).
    marker, color, size, annot, zorder, alpha : callable or scalar, optional
        Passed to ``DataFrame.assign``: a callable receives the joined
        ordination/metadata table and returns one value per row.  Defaults
        are ``'o'``, ``'black'``, ``10``, ``''``, ``1`` and ``1``.
    derep_kwargs : dict, optional
        Extra keyword arguments for ``AgglomerativeClustering``.  Defaults to
        ``dict(linkage='complete', distance_threshold=0.0)``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    scatter_kwargs : dict, optional
        Extra keyword arguments for every ``ax.scatter`` call.
    textoffset : float
        Offset (in points, both directions) of each row annotation.
    annot_kwargs : dict, optional
        Extra keyword arguments for the row annotations.
    show_feats : sequence
        Feature (column) names whose loadings are drawn from the origin.
    feat_scale : float
        Multiplier applied to the loadings before drawing.
    feat_arrow_kwargs, feat_label_kwargs : dict, optional
        Keyword arguments for the loading line and its text label.

    Returns
    -------
    ax, fit, d0, comps
        The axes, the fitted PCA, the plotted table (scores + metadata +
        aesthetic columns) and the loadings (features x components).
    """
    x, y = xy
    if ax is None:
        _, ax = plt.subplots()

    # Fresh objects per call instead of shared mutable defaults.
    if derep_kwargs is None:
        derep_kwargs = dict(linkage='complete', distance_threshold=0.0)
    if scatter_kwargs is None:
        scatter_kwargs = {}
    if annot_kwargs is None:
        annot_kwargs = {}
    if feat_arrow_kwargs is None:
        feat_arrow_kwargs = dict(arrowprops=dict(arrowstyle="-", lw=1, color='k'))
    if feat_label_kwargs is None:
        feat_label_kwargs = {}

    # `DataFrame.assign` accepts scalars as well as callables, so constant
    # defaults need no lambda wrapper.
    if marker is None:
        marker = 'o'
    if color is None:
        color = 'black'
    if size is None:
        size = 10
    if annot is None:
        annot = ''
    if zorder is None:
        zorder = 1
    if alpha is None:
        alpha = 1

    # Fuzzy dereplication of genomes.
    # All-zero rows are set aside (cosine distance is undefined for them)
    # and share the dedicated cluster label -1.
    is_empty = (d.sum(axis=1) == 0).to_numpy()
    d_noempty = d.loc[~is_empty]
    # Labels are written back by row position so they stay aligned with `d`
    # even when empty rows are interleaved with non-empty ones.
    labels = np.full(len(d), -1, dtype=int)
    if len(d_noempty) > 1:
        dmat = squareform(pdist(d_noempty, metric='cosine'))
        # NOTE(review): newer scikit-learn releases renamed `affinity` to
        # `metric`; confirm against the installed version.
        labels[~is_empty] = AgglomerativeClustering(
            n_clusters=None, affinity='precomputed', **derep_kwargs,
        ).fit_predict(dmat)
    elif len(d_noempty) == 1:
        # Clustering needs at least two samples; a lone row is its own cluster.
        labels[~is_empty] = 0

    # Fit on cluster means, then project every original row.
    d_fit = d.groupby(labels).mean()
    fit = PCA().fit(d_fit)
    ordin = pd.DataFrame(fit.transform(d), index=d.index)
    comps = pd.DataFrame(fit.components_.T, index=d.columns)
    ordin.columns = comps.columns = [f'PC{i + 1}' for i in range(ordin.shape[1])]
    perc_explained = pd.Series(fit.explained_variance_ratio_, index=ordin.columns)

    d0 = (
        ordin
        .join(meta)
        .assign(
            __color=color,
            __size=size,
            __marker=marker,
            __annot=annot,
            __zorder=zorder,
            __alpha=alpha,
        )
    )

    # marker/zorder/alpha are per-call arguments of `scatter`, so one call is
    # issued per distinct combination.  Rows with a missing value in any of
    # the three are dropped by groupby and therefore not drawn.
    grouped = d0.groupby(['__marker', '__zorder', '__alpha'])
    for (grp_marker, grp_zorder, grp_alpha), grp in grouped:
        ax.scatter(
            x, y,
            data=grp,
            color='__color',
            s='__size',
            marker=grp_marker, zorder=grp_zorder, alpha=grp_alpha,
            **scatter_kwargs,
            label='__nolegend__',
        )

    # Loadings of the requested features, drawn from the origin.
    for feat in show_feats:
        _x = feat_scale * comps.loc[feat, x]
        _y = feat_scale * comps.loc[feat, y]
        ax.annotate("", xy=(_x, _y), xytext=(0, 0), **feat_arrow_kwargs, annotation_clip=False)
        ax.annotate(feat, xy=(_x, _y), ha='center', va='center', **feat_label_kwargs)

    # Text labels for rows with a non-empty annotation.
    for _x, _y, label in zip(d0[x], d0[y], d0['__annot']):
        if pd.isna(label) or not label:
            continue
        ax.annotate(label,
                    xy=(_x, _y),
                    xytext=(textoffset, textoffset), textcoords='offset points',
                    va='center', ha='center', **annot_kwargs)

    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
    return ax, fit, d0, comps

## Metadata

In [None]:
# Per-subgroup aesthetics used by the ordination plots.
color_map = {
    'ormerod-starch': 'blue',
    'ormerod-host': 'purple',
    'ormerod-plant': 'green',
    'public': 'grey',
    'here': 'black',
}
alpha_map = {
    'ormerod-starch': 0.5,
    'ormerod-host': 0.5,
    'ormerod-plant': 0.5,
    'public': 0.25,
    'here': 0.9,
}
# Subgroups that are included in the plots.
subgroups = [
    'ormerod-plant',
    'ormerod-host',
    'ormerod-starch',
    'here',
    'public',
]
size_map = {
    'ormerod-plant': 10,
    'ormerod-host': 10,
    'ormerod-starch': 10,
    'public': 10,
    'here': 20,
}
marker_map = {
    'ormerod-plant': 'o',
    'ormerod-host': 'o',
    'ormerod-starch': 'o',
    'public': 'o',
    'here': '^',
}
zorder_map = {
    'ormerod-plant': 1,
    'ormerod-host': 1,
    'ormerod-starch': 1,
    'public': 0,
    'here': 2,
}
# Genome ids that always receive a text label in the ordinations.
label_list = [
    'Homeothermus_arabinoxylanisolvens',
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
# Optional LaTeX text rendering (left disabled):
#mpl.rcParams['text.usetex'] = True

In [None]:
# Genome-level metadata (genome + checkm + genome_group) for the 'muri2'
# genome group, indexed by genome_id.
mag = pd.read_sql(
    """
SELECT *
FROM genome
JOIN checkm USING (genome_id)
JOIN genome_group USING (genome_id)
WHERE genome_group_id = 'muri2'
    """,
    con=con,
    index_col='genome_id',
)

# Two-letter abbreviations used when labelling reference genomes in plots.
rename_map = {
    'Barnesiella_viscericola_DSM_18177': 'Bv',
    'Bacteroides_ovatus_ATCC_8483': 'Bo',
    'Bacteroides_thetaiotaomicron_VPI5482': 'Bt',
    'Porphyromonas_gingivalis_ATCC_33277': 'Pg',
    'Homeothermus_arabinoxylanisolvens': 'Ha',
    'Muribaculum_intestinale_DSM_28989T': 'Mi',
    'Duncaniella_muris_DSM_103720T': 'Dm',
    'Duncaniella_freteri_DSM_108168T': 'Df',
    'Duncaniella_dubosii_DSM_107170T': 'Dd',
    'Paramuribaculum_intestinale_DSM_100749T': 'Pi',
    'Amulumruptor_caecigallinarius': 'Ac',
}

# Holds exactly the genome ids that have an abbreviation, in the same order.
italic_list = list(rename_map)


def rename_mag(genome_id):
    """Return the abbreviation for *genome_id*, or *genome_id* itself if it has none."""
    return rename_map.get(genome_id, genome_id)

In [None]:
# Lookup table from opf_id to its architecture entry; squeezed so that a
# single-column result becomes a Series.
opf_to_architecture = pd.read_sql(
    """
SELECT * FROM opf_to_architecture;
    """,
    con=con,
    index_col='opf_id',
)
opf_to_architecture = opf_to_architecture.squeeze()

## Domains

### No other filters

In [None]:
# Per-genome tallies of GH / CBM CAZy families, counted as in Ormerod:
# subfamilies are collapsed (e.g. GH13_1 and GH13_2 both count as GH13).
cazy_count = pd.read_sql(
    """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Pivot to a genome x family matrix; genomes absent from the query result
# become all-zero rows.
cazy_count = (
    cazy_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)

# Ordination of the square-root transformed tallies.
ax, pca, d0, comps = plot_pca_ordination(
    cazy_count.loc[mag.subgroup.isin(subgroups)].apply(lambda col: np.sqrt(col)),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=['GH13', 'GH43', 'GH20'],
    feat_scale=10.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    # Label only the listed reference genomes and the 'here' subgroup.
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

In [None]:
# Ten features with the largest PC2 loadings in the current `comps` table.
comps['PC2'].sort_values(ascending=False).head(10)

In [None]:
# All PC2 loadings in the current `comps` table, smallest first.
comps['PC2'].sort_values(ascending=True)

In [None]:
# Distribution of the per-genome total of CAZy domain tallies within each
# subgroup, leaving out the 'outgroup' genomes.
sns.violinplot(
    data=mag.assign(
        total_cazy_domain_proteins=cazy_count.sum(axis=1),
    ).loc[mag.subgroup != 'outgroup'],
    x='subgroup',
    y='total_cazy_domain_proteins',
    palette=color_map,
)

### Exported

In [None]:
# GH / CBM family tallies restricted to features whose localization is
# 'OM', 'IM' or 'PP'.
cazy_export_count = pd.read_sql(
    """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Genome x family matrix, zero-filled for genomes without any hit.
cazy_export_count = (
    cazy_export_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)

d = cazy_export_count

# Ordination of the untransformed tallies (the apply is an identity
# placeholder for a transform).
ax, pca, d0, comps = plot_pca_ordination(
    d.loc[mag.subgroup.isin(subgroups)].apply(lambda col: col),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=['GH13'],
    feat_scale=10.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

### In PUL

In [None]:
# GH / CBM family tallies restricted to features listed in closest_PUL_susC
# with distance < 10000 (no localization filter in this cell).
cazy_pul_count = pd.read_sql(
    """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Genome x family matrix, zero-filled for genomes without any hit.
cazy_pul_count = (
    cazy_pul_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)

d = cazy_pul_count

# Ordination of the untransformed tallies; no feature loadings are drawn.
ax, pca, d0, comps = plot_pca_ordination(
    cazy_pul_count.loc[mag.subgroup.isin(subgroups)].apply(lambda col: col),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=[],
    feat_scale=20.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

### Exported and PUL localized

In [None]:
# GH / CBM family tallies restricted to features that are both listed in
# closest_PUL_susC with distance < 10000 and localized to 'OM', 'IM' or 'PP'.
cazy_pul_export_count = pd.read_sql(
    """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Genome x family matrix, zero-filled for genomes without any hit.
cazy_pul_export_count = (
    cazy_pul_export_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)

d = cazy_pul_export_count

# Ordination of the square-root transformed tallies.
ax, pca, d0, comps = plot_pca_ordination(
    d.loc[mag.subgroup.isin(subgroups)].apply(lambda col: np.sqrt(col)),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=['GH13', 'GH43', 'CBM44'],
    feat_scale=10.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

In [None]:
# Ten features with the smallest PC1 loadings (shown in descending order).
comps['PC1'].sort_values(ascending=False).tail(10)

In [None]:
# Ten features with the smallest PC2 loadings (shown in descending order).
comps['PC2'].sort_values(ascending=False).tail(10)

## OPF clusters

### No other filters

In [None]:
# Per-genome tallies of OPFs (opf_id) whose features carry a GH / CBM CAZy
# domain with score > 5; no localization or PUL filter is applied.

opf_cazy_count = (
    pd.read_sql(
        """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
        con=con, index_col=['genome_id', 'func_id']
        )
        .tally
        .unstack('func_id', fill_value=0)
        .reindex(mag.index)
        .fillna(0)
            )

# The call now uses the keyword names that plot_pca_ordination defines
# (`annot`, `annot_kwargs`).  Point transparency goes through `alpha=` rather
# than `scatter_kwargs`, because the function already passes `alpha` to
# `ax.scatter` and a second one would collide.
plot_pca_ordination(
    opf_cazy_count.loc[mag.subgroup.isin(subgroups)].apply(np.sqrt),
    meta=mag,
    color=lambda m: m.subgroup.map(color_map),
    size=lambda m: m.subgroup.map(size_map),
    alpha=lambda m: 0.75,
    # Label only the genomes of the 'here' subgroup, by genome id.
    annot=lambda m: m.index.to_series().where(m.subgroup.isin(['here']), ''),
    scatter_kwargs=dict(linewidths=0),
    annot_kwargs=dict(fontsize=8),
)

### Exported

In [None]:
# Per-genome tallies of OPFs whose features carry a GH / CBM CAZy domain,
# restricted to features localized to 'OM', 'IM' or 'PP'.
opf_cazy_export_count = pd.read_sql(
    """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Genome x OPF matrix, zero-filled for genomes without any hit.
opf_cazy_export_count = (
    opf_cazy_export_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)

d = opf_cazy_export_count

# Ordination of the untransformed tallies.
ax, pca, d0, comps = plot_pca_ordination(
    d.loc[mag.subgroup.isin(subgroups)].apply(lambda col: col),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=['Opf01909', 'Opf00965', 'Opf02013', 'Opf00428'],
    feat_scale=10.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

In [None]:
# All PC1 loadings in the current `comps` table, smallest first.
comps['PC1'].sort_values()

In [None]:
# Architecture entry recorded for Opf01132.
opf_to_architecture.loc['Opf01132']

In [None]:
# Architecture entries of the ten OPFs with the smallest PC2 loadings.
opf_to_architecture.loc[comps['PC2'].sort_values().head(10).index]

In [None]:
# Architecture entries for a hand-picked set of OPFs.
opf_to_architecture.loc[['Opf01909', 'Opf01768', 'Opf02388', 'Opf03190', 'Opf00965', 'Opf03199', 'Opf02584']]

In [None]:
# Per-genome tallies (columns of the current `d`) for a hand-picked set of OPFs.
d[['Opf01909', 'Opf01768', 'Opf02388', 'Opf03190', 'Opf00965', 'Opf03199', 'Opf02584', 'Opf02888']]

In [None]:
# OPFs present (tally > 0) in both B1A and B2 while absent (tally == 0) from
# every one of B3 through B8.
opf_of_interest = idxwhere(
    d.loc[['B1A', 'B2']].gt(0).all()
    & d.loc[['B3', 'B4', 'B5', 'B6', 'B7', 'B8']].eq(0).all()
)

In [None]:
# Per-genome tallies for the OPFs selected in `opf_of_interest`.
d[opf_of_interest]

In [None]:
# Architecture entries for the OPFs selected in `opf_of_interest`.
opf_to_architecture.loc[opf_of_interest]

In [None]:
# Twenty features with the largest PC2 loadings in the current `comps` table.
comps['PC2'].sort_values(ascending=False).head(20)

In [None]:
# Architecture entries for a second hand-picked set of OPFs.
opf_to_architecture.loc[['Opf00428', 'Opf02379', 'Opf00076', 'Opf01060', 'Opf02135', 'Opf01108']]

In [None]:
# Smallest completeness value across genomes, divided by 100
# (presumably converting a percentage to a fraction; confirm the units).
(mag.completeness / 100).min()

### Exported and PUL localized

In [None]:
# Per-genome tallies of OPFs whose features carry a GH / CBM CAZy domain,
# restricted to features that are both listed in closest_PUL_susC with
# distance < 10000 and localized to 'OM', 'IM' or 'PP'.
opf_cazy_pul_export_count = pd.read_sql(
    """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE score > 5.
        AND (cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%')        
      )
JOIN feature USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id FROM closest_PUL_susC WHERE distance < 10000) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
        """,
    index_col=['genome_id', 'func_id'],
    con=con,
)
# Genome x OPF matrix, zero-filled for genomes without any hit.
opf_cazy_pul_export_count = (
    opf_cazy_pul_export_count['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .fillna(0)
)


d = opf_cazy_pul_export_count

# Ordination of the untransformed tallies.
ax, pca, d0, comps = plot_pca_ordination(
    d.loc[mag.subgroup.isin(subgroups)].apply(lambda col: col),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.1},
    show_feats=['Opf02000'],
    feat_scale=10.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

In [None]:
# Ten features with the largest PC2 loadings in the current `comps` table.
comps['PC2'].sort_values(ascending=False).head(10)

In [None]:
# Architecture entries for a hand-picked set of OPFs.
opf_to_architecture.loc[['Opf02584', 'Opf03199', 'Opf02000', 'Opf02013', 'Opf04335', 'Opf04912', 'Opf01862']]

In [None]:
# All PC1 loadings in the current `comps` table, smallest first.
comps['PC1'].sort_values()

In [None]:
# Row of the current `d` matrix for Homeothermus_arabinoxylanisolvens.
d.loc['Homeothermus_arabinoxylanisolvens']

# Figure

In [None]:
# Two-panel ordination figure written to fig/cazy_ordination.pdf.
# Each entry of `dd` is:
#   (panel title, count matrix, components to plot, features to draw,
#    flip x axis?, flip y axis?)
dd = [
    ('CAZy domains (all)', cazy_count, ('PC1', 'PC2'), ['GH13', 'GH20', 'GH43'], False, True),
    ('OPFs containing CAZy domains (exported)', opf_cazy_export_count, ('PC1', 'PC2'), ['Opf01909', 'Opf00965', 'Opf01132', 'Opf00428'], False, False),
]

# One panel per entry, stacked vertically.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(halfwidth, halfwidth * 1.5))
axs = np.asarray(axs)

# NOTE: this loop rebinds the notebook-level names `d`, `ax`, `pca`, `d0` and
# `comps`; after it finishes they refer to the LAST panel.
for (title, d, xy, show_feats, flipx, flipy), ax in zip(dd, axs.flatten()):
    ax, pca, d0, comps = plot_pca_ordination(
        d.loc[mag.subgroup.isin(subgroups)].apply(lambda x: x),
        derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.2},
        xy=xy,
        meta=mag,
        show_feats=show_feats, feat_scale=20.0,
        feat_arrow_kwargs=dict(arrowprops=dict(arrowstyle="->", lw=0.5, color='rebeccapurple', shrinkA=0, shrinkB=0)),
        feat_label_kwargs=dict(color='rebeccapurple', fontsize=5),
        color=lambda m: m.subgroup.map(color_map),
        size=lambda m: m.subgroup.map(size_map),
        marker=lambda m: m.subgroup.map(marker_map),
        zorder=lambda m: m.subgroup.map(zorder_map),
        alpha=lambda m: m.subgroup.map(alpha_map),
        # Label only the listed reference genomes and the 'here' subgroup.
        annot=lambda m: (
            m.index.to_series().replace(rename_map)
            .where(
                (
                    m.index.to_series().isin(label_list)
                    | m.subgroup.isin(['here'])
                ),
                ''
            )
        ),
        scatter_kwargs=dict(linewidths=0),
        annot_kwargs=dict(fontsize=8,
#                           arrowprops=dict(lw=0.3, color='k', arrowstyle='->'),
#                           bbox=dict(pad=-3, facecolor="none", edgecolor="none")
                         ),
        textoffset=4.5,
        ax=ax
    )
    ax.set_title(title)
    # Axis flips only change the orientation of the panel.
    if flipx:
        ax.invert_xaxis()
    if flipy:
        ax.invert_yaxis()
    
# Legend entries are empty scatter series carrying only the subgroup style.
# They are added to `ax`, which at this point is the last panel of the loop.
legend_map = {'public': 'public', 'ormerod-starch': '$\\alpha$-glucan',
              'ormerod-host': 'host-glycan', 'ormerod-plant': 'plant-glycan',
              'here': 'novel'}
for subgroup in subgroups:
    ax.scatter([], [], s=size_map[subgroup], c=color_map[subgroup], marker=marker_map[subgroup], label=legend_map[subgroup])

# Layout is computed before the legend is attached.
fig.tight_layout()

ax.legend(ncol=2, bbox_to_anchor=(1, 1))
fig.savefig('fig/cazy_ordination.pdf', bbox_inches='tight', )

In [None]:
# Stand-alone ordination of the exported-OPF tallies with a larger set of
# feature loadings drawn; refreshes `ax`, `pca`, `d0` and `comps`.
ax, pca, d0, comps = plot_pca_ordination(
    opf_cazy_export_count.loc[mag.subgroup.isin(subgroups)].apply(lambda col: col),
    meta=mag,
    xy=('PC1', 'PC2'),
    derep_kwargs={'linkage': 'complete', 'distance_threshold': 0.2},
    show_feats=[
        'Opf01909',
        'Opf02388',
        'Opf03190',
        'Opf00965',
        'Opf02205',
        'Opf01338',
    ],
    feat_scale=20.0,
    color=lambda tbl: tbl.subgroup.map(color_map),
    size=lambda tbl: tbl.subgroup.map(size_map),
    marker=lambda tbl: tbl.subgroup.map(marker_map),
    zorder=lambda tbl: tbl.subgroup.map(zorder_map),
    alpha=lambda tbl: tbl.subgroup.map(alpha_map),
    annot=lambda tbl: (
        tbl.index.to_series()
        .replace(rename_map)
        .where(
            tbl.index.to_series().isin(label_list) | tbl.subgroup.isin(['here']),
            '',
        )
    ),
    scatter_kwargs={'linewidths': 0},
    annot_kwargs={'fontsize': 8},
    textoffset=0.03,
)

In [None]:
# PC2 loadings joined with each OPF's architecture entry, sorted by loading.
comps['PC2'].to_frame().join(opf_to_architecture).sort_values('PC2')

In [None]:
# PC1 loadings joined with each OPF's architecture entry, sorted by loading.
comps['PC1'].to_frame().join(opf_to_architecture).sort_values('PC1')

In [None]:
# Per subgroup: number of genomes with a non-zero tally for each of the two
# OPFs, alongside the total number of genomes in that subgroup.
(opf_cazy_export_count[['Opf01909', 'Opf00965']] > 0).groupby(mag.subgroup).sum().assign(total=mag.subgroup.value_counts())

In [None]:
# Tallies of the two OPFs for the genomes of the 'here' subgroup only.
(opf_cazy_export_count[['Opf01909', 'Opf00965']])[mag.subgroup == 'here']

In [None]:
# Architecture entries for three OPFs of interest.
opf_to_architecture.loc[['Opf01909', 'Opf00965', 'Opf00428']]

In [None]:
# CAZy domain hits (score > 10) on every feature assigned to Opf00428,
# indexed by (feature_id, domain_id).
opf00428_domains = pd.read_sql(
    """
SELECT feature_id, domain_id, score, domain_start, domain_stop
FROM feature_details
LEFT JOIN (SELECT * FROM feature_x_cazy_domain) USING (feature_id)
WHERE opf_id IN ('Opf00428')
  AND score > 10
""",
    index_col=['feature_id', 'domain_id'],
    con=con,
)

# Feature x domain heatmap of scores, with a square-root colour scale.
sns.heatmap(
    opf00428_domains['score'].unstack(fill_value=0),
    norm=mpl.colors.PowerNorm(0.5),
)

In [None]:
# Reshape to one row per feature with (domain_id, start/stop) columns, then
# keep only the GH20, CBM32 and GH138 domains.
opf00428_domains[['domain_start', 'domain_stop']].stack().unstack([-2, -1])[['GH20', 'CBM32', 'GH138']]#.unstack('domain_stop')

In [None]:
# GH20 score per Opf00428 feature (0 where the domain is absent), ascending.
opf00428_domains.score.unstack('domain_id', fill_value=0)['GH20'].sort_values()

In [None]:
# CAZy domain hits (score > 10) on every feature assigned to Opf01132,
# indexed by (feature_id, domain_id).
opf01132_domains = pd.read_sql(
    """
SELECT feature_id, domain_id, score, domain_start, domain_stop
FROM feature_details
LEFT JOIN (SELECT * FROM feature_x_cazy_domain) USING (feature_id)
WHERE opf_id IN ('Opf01132')
  AND score > 10
""",
    index_col=['feature_id', 'domain_id'],
    con=con,
)

# Feature x domain heatmap with a square-root colour scale; when a feature
# has several hits for the same domain, the best score is shown.
sns.heatmap(
    opf01132_domains['score']
    .groupby(level=['feature_id', 'domain_id'])
    .max()
    .unstack(fill_value=0),
    norm=mpl.colors.PowerNorm(0.5),
)

In [None]:
# Reshape to one row per feature with (domain_id, start/stop) columns.
opf01132_domains[['domain_start', 'domain_stop']].stack().unstack([-2, -1])#.unstack('domain_stop')

In [None]:
# Five features with the smallest PC2 loadings in the current `comps` table.
comps['PC2'].sort_values(ascending=True).head()

In [None]:
# Ten features with the largest PC2 loadings in the current `comps` table.
comps['PC2'].sort_values(ascending=False).head(10)

In [None]:
# Per-genome tally of Opf00428 in the current `d` matrix.
d['Opf00428']

# Features distinguishing responders

In [None]:
# Features from genomes of the 'here' subgroup that are localized to OM or IM
# and carry a GH13* or GH66* domain with score > 5, together with their
# coverage_ratio where one is recorded (LEFT JOIN).
pd.read_sql("""
SELECT DISTINCT genome_id, feature_id, localization, product_description, domain_id, score, subgroup, coverage_ratio
FROM feature_details
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN genome_group USING (genome_id)
LEFT JOIN variant_cross_coverage USING (genome_id, feature_id)
WHERE localization IN ('OM', 'IM')
  AND (domain_id LIKE 'GH13%' OR domain_id LIKE 'GH66%')
  AND score > 5
  AND subgroup = 'here'
  ;
""", con=con)

In [None]:
# First 20 features assigned to Opf01405, with genome, localization, product
# description, subgroup and (where recorded) coverage_ratio.
pd.read_sql("""
SELECT DISTINCT genome_id, feature_id, localization, product_description, subgroup, coverage_ratio
FROM feature_details
JOIN sequence USING (sequence_id)
JOIN genome_group USING (genome_id)
LEFT JOIN variant_cross_coverage USING (genome_id, feature_id)
  WHERE opf_id = 'Opf01405'
  ;
""", con=con).head(20)

In [None]:
# All features annotated with KO K21575, with their OPF, localization,
# product description, subgroup and (where recorded) coverage_ratio.
pd.read_sql("""
SELECT DISTINCT genome_id, feature_id, opf_id, ko_id, localization, product_description, subgroup, coverage_ratio
FROM feature_details
JOIN sequence USING (sequence_id)
JOIN genome_group USING (genome_id)
LEFT JOIN feature_x_ko USING (feature_id)
LEFT JOIN variant_cross_coverage USING (genome_id, feature_id)
  WHERE ko_id = 'K21575'
  ;
""", con=con)