# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl
from scipy.spatial.distance import pdist, squareform

from scripts.lib.plotting import load_style

# Load the 'paper' plotting style and pull out the entries used in this notebook.
loaded_style = load_style('paper')
savefig, fullwidth, halfwidth = (
    loaded_style[key] for key in ('savefig', 'fullwidth', 'halfwidth')
)

# Connection to the SQLite database queried by every section below.
con = sqlite3.connect('data/core.muri.2.denorm.db')
#sns.set_context('notebook')

def idxwhere(x):
    """Return the index labels of `x` at which `x` is True.

    `x` is used as its own indexer, so it is expected to be a boolean
    pandas Series.
    """
    selected = x.loc[x]
    return selected.index

## Metadata

In [None]:
# Scatter colour for each Ormerod guild.
color_map = dict(
    starch='blue',
    host='purple',
    plant='green',
)

#mpl.rcParams['text.usetex'] = True

In [None]:
# Genome metadata indexed by genome_id, with three genomes removed up front.
excluded_genomes = ['Paramuribaculum_intestinale_DSM_100764', 'Zag1', 'Zag10']
mag = pd.read_table('meta/genome.tsv', index_col='genome_id').drop(excluded_genomes)

# Short display labels for selected genome IDs.
rename_map = {
    'Barnesiella_viscericola_DSM_18177': 'Bv',
    'Bacteroides_ovatus_ATCC_8483': 'Bo',
    'Bacteroides_thetaiotaomicron_VPI5482': 'Bt',
    'Porphyromonas_gingivalis_ATCC_33277': 'Pg',
    'Homeothermus_arabinoxylanisolvens': 'Ha',
    'Muribaculum_intestinale_DSM_28989T': 'Mi',
    'Duncaniella_muris_DSM_103720T': 'Dm',
    'Duncaniella_freteri_DSM_108168T': 'Df',
    'Duncaniella_dubosii_DSM_107170T': 'Dd',
    'Paramuribaculum_intestinale_DSM_100749T': 'Pi',
    'Amulumruptor_caecigallinarius': 'Ac',
}

# Genome IDs whose plot labels are drawn in italics.
italic_list = [
    'Barnesiella_viscericola_DSM_18177',
    'Bacteroides_ovatus_ATCC_8483',
    'Bacteroides_thetaiotaomicron_VPI5482',
    'Porphyromonas_gingivalis_ATCC_33277',
    'Homeothermus_arabinoxylanisolvens',
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]

def rename_mag(genome_id):
    """Return the short label for `genome_id`, or `genome_id` itself if it has none."""
    return rename_map.get(genome_id, genome_id)

# COGs

In [None]:
# Tally features per (genome, COG), then pivot to a genome x COG count matrix.
cog_tally = pd.read_sql("""
SELECT genome_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_cog USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
# Align rows to the genomes in `mag`; rows left with NaN by the reindex are dropped.
data = (
    cog_tally['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
    .dropna()
)

# COG annotations, restricted to the COGs that appear as columns of `data`.
cog_annotation = pd.read_sql("""
SELECT cog_id AS func_id, function_category, description
FROM cog
                       """, con=con, index_col=['func_id'])
function = cog_annotation.loc[data.columns]

In [None]:
# Count functions detected (tally > 0) in only B1A, only B1B, or in both.
# Vectorized boolean logic replaces a row-wise `apply` that built one Series
# per function; the resulting counts are the same.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_A, in_B = b1_present.loc['B1A'], b1_present.loc['B1B']
pd.Series({ 'just_A': (in_A & ~in_B).sum()
          , 'just_B': (in_B & ~in_A).sum()
          , 'both': (in_A & in_B).sum()
          })

## Ordination

### Ormerod

In [None]:
# COGs used as features for this ordination.
feats = [
    'COG3507',
#    'COG3866',
    'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119',
]

# Square-root transform the tallies, fit the PCA on the Ormerod genomes only,
# then project every genome onto those axes.
d = data[feats].apply(np.sqrt)
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
pc_names = ['PC{}'.format(i + 1) for i in range(fit.n_components_)]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=d1, c=color_map[guild], label=guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### Carbs

In [None]:
# Features: COGs whose function category is 'G'.
feats = function[function.function_category == 'G'].index

# Square-root transform the tallies, fit the PCA on the Ormerod genomes only,
# then project every genome onto those axes.
d = data[feats].apply(np.sqrt)
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
pc_names = ['PC{}'.format(i + 1) for i in range(fit.n_components_)]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### All

In [None]:
# Features: COGs detected (tally > 0) in more than two genomes.
n_genomes_with = (data > 0).sum()
feats = data.columns[n_genomes_with > 2]

# Square-root transform the tallies, fit the PCA on the Ormerod genomes only,
# then project every genome onto those axes.
d = data[feats].apply(np.sqrt)
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
pc_names = ['PC{}'.format(i + 1) for i in range(fit.n_components_)]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

## Frequencies

In [None]:
# Per-function presence summary (presence = tally > 0).
present = data > 0

# 'all': number of genomes carrying each function.
func_summ = {'all': present.sum()}

# One column per Ormerod guild: fraction of that guild's genomes carrying the function.
for ormerod_guild in mag.ormerod_guild.dropna().unique():
    in_guild = mag.ormerod_guild == ormerod_guild
    func_summ[ormerod_guild] = present.loc[in_guild].mean()

# One column per genome type: number of genomes of that type carrying the function.
for genome_type in mag.genome_type.dropna().unique():
    of_type = mag.genome_type == genome_type
    func_summ[genome_type] = present.loc[of_type].sum()

# Attach the function annotations.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions detected in all of B1A, B1B and B2; sorted by the 'here' column
# (ascending), then the 'starch' column (descending); first ten rows shown.
in_all_three = (data.loc[['B1A', 'B1B', 'B2']] > 0).all()
(
    func_summary[in_all_three]
    .sort_values(['here', 'starch'], ascending=[True, False])
    .head(10)
)

# OPFs

In [None]:
# Tally features per (genome, OPF), then pivot to a genome x OPF count matrix.
opf_tally = pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
# Align rows to the genomes in `mag`; genomes absent from the query become NaN rows.
data = (
    opf_tally['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
)

# OPF annotations (architecture plus any linked KO / COG), restricted to the
# OPFs that appear as columns of `data`.
opf_annotation = pd.read_sql("""
SELECT
    opf_id AS func_id
  , architecture
  , ko_id
  , ko.description AS ko_description
  , cog_id
  , cog.description AS cog_description
  , function_category AS cog_category
FROM opf_to_architecture
LEFT JOIN opf_to_ko USING (opf_id)
LEFT JOIN ko USING (ko_id)
LEFT JOIN opf_to_cog USING (opf_id)
LEFT JOIN cog USING (cog_id)
                       """, con=con, index_col=['func_id'])
function = opf_annotation.loc[data.columns]

In [None]:
# Display the OPF annotation table built in the previous cell.
function

In [None]:
# Count functions detected (tally > 0) in only B1A, only B1B, or in both.
# Vectorized boolean logic replaces a row-wise `apply` that built one Series
# per function; the resulting counts are the same.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_A, in_B = b1_present.loc['B1A'], b1_present.loc['B1B']
pd.Series({ 'just_A': (in_A & ~in_B).sum()
          , 'just_B': (in_B & ~in_A).sum()
          , 'both': (in_A & in_B).sum()
          })

## Ordination

### Carb COG OPFs

In [None]:
# Features: OPFs whose linked COG category is 'G'.
feats = function[function.cog_category.isin(['G'])].index

# Square-root transform the tallies and drop rows containing NaN, fit the PCA
# on the Ormerod genomes only, then project every remaining genome.
d = data[feats].apply(np.sqrt).dropna()
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
pc_names = ['PC{}'.format(i + 1) for i in range(fit.n_components_)]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.05, 0.05
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### SusC/SusD/SusEF OPFs

In [None]:
# Distinct OPF ids of the features listed in the susC table.
susC_OPF_list = list(pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susC USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id'])

# Distinct OPF ids of the features listed in the susD table.
susD_OPF_list = list(pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susD USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id'])

# Distinct OPF ids of the features listed in the susEF table.
susEF_OPF_list = list(pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susEF USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id'])

In [None]:
# Features: annotated OPFs that are SusC, SusD or SusEF OPFs.
# Sorted so the feature (column) order is the same on every run; iterating a
# set of strings is not order-stable across interpreter sessions.
feats = sorted(set(function.index) & set(susC_OPF_list + susD_OPF_list + susEF_OPF_list))

# Square-root transform the tallies. Rows containing NaN (genomes added by
# reindexing `data` to `mag.index` that have no tallies) are dropped because
# PCA rejects NaN input; this matches the `.dropna()` already used for the
# carbohydrate-COG OPF ordination.
d = data[feats].apply(np.sqrt).dropna()

# Fit the PCA on the Ormerod genomes only, then project every remaining genome.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### GH Containing OPFs

In [None]:
# Distinct OPF ids having at least one feature with a CAZy domain id starting with 'GH'.
gh_OPF_list = list(pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_to_opf
LEFT JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE opf_id NOT NULL
  AND domain_id LIKE 'GH%'
""", con=con).dropna()['opf_id'])

In [None]:
# Features: annotated OPFs that contain a GH domain.
# Sorted so the feature (column) order is the same on every run; iterating a
# set of strings is not order-stable across interpreter sessions.
feats = sorted(set(function.index) & set(gh_OPF_list))

# Square-root transform the tallies. Rows containing NaN (genomes added by
# reindexing `data` to `mag.index` that have no tallies) are dropped because
# PCA rejects NaN input; this matches the `.dropna()` already used for the
# carbohydrate-COG OPF ordination.
d = data[feats].apply(np.sqrt).dropna()

# Fit the PCA on the Ormerod genomes only, then project every remaining genome.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### All OPFs

In [None]:
# Features: OPFs detected (tally > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]

# Square-root transform the tallies. Rows containing NaN (genomes added by
# reindexing `data` to `mag.index` that have no tallies) are dropped because
# PCA rejects NaN input; this matches the `.dropna()` already used for the
# carbohydrate-COG OPF ordination.
d = data[feats].apply(np.sqrt).dropna()

# Fit the PCA on the Ormerod genomes only, then project every remaining genome.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Per-function presence summary (presence = tally > 0).
# 'all': number of genomes carrying each function.
func_summ = {'all': (data > 0).sum()}

# One column per Ormerod guild: fraction of that guild's genomes carrying the function.
for ormerod_guild in mag.ormerod_guild.dropna().unique():
    func_summ[ormerod_guild] = (data.loc[mag.ormerod_guild == ormerod_guild] > 0).mean()

# One column per genome type: number of genomes of that type carrying the function.
for genome_type in mag.genome_type.dropna().unique():
    func_summ[genome_type] = (data.loc[mag.genome_type == genome_type] > 0).sum()

func_summary = pd.DataFrame(func_summ).join(function)

# Flag SusC / SusD / SusEF OPFs. Membership is tested with `isin` rather than
# assigned through `.loc[list, col]`, which raises KeyError as soon as one
# listed OPF is missing from the summary index.
func_summary['susC'] = func_summary.index.isin(susC_OPF_list)
func_summary['susD'] = func_summary.index.isin(susD_OPF_list)
func_summary['susEF'] = func_summary.index.isin(susEF_OPF_list)

In [None]:
# Functions detected in both B1A and B2 whose 'ormerod' summary value exceeds 1;
# sorted by 'here' (ascending) then 'starch' (descending); first ten rows shown.
in_B1A_and_B2 = (data.loc[['B1A', 'B2']] > 0).all()
#             & (data.loc['B1B'] > 0)
(
    func_summary[in_B1A_and_B2 & (func_summary.ormerod > 1)]
    .sort_values(['here', 'starch'], ascending=[True, False])
    .head(10)
)

In [None]:
# Functions detected in both B1A and B2 that are flagged susC or susD;
# sorted by 'here' (ascending) then 'starch' (descending).
in_B1A_and_B2 = (data.loc[['B1A', 'B2']] > 0).all()
#             & (data.loc['B1B'] > 0)
is_susC_or_susD = func_summary.susC | func_summary.susD
(
    func_summary[in_B1A_and_B2 & is_susC_or_susD]
    .sort_values(['here', 'starch'], ascending=[True, False])
)

In [None]:
# Functions detected in both B1A and B2 whose 'ormerod' summary value exceeds 1;
# sorted by 'here' (ascending) then 'starch' (descending); first ten rows shown.
in_B1A_and_B2 = (data.loc[['B1A', 'B2']] > 0).all()
#             & (data.loc['B1B'] > 0)
(
    func_summary[in_B1A_and_B2 & (func_summary.ormerod > 1)]
    .sort_values(['here', 'starch'], ascending=[True, False])
    .head(10)
)

In [None]:
# Functions detected in B1A and B2 but with a zero tally in B1B,
# sorted by 'starch' (descending).
tally_B1A, tally_B1B, tally_B2 = (data.loc[g] for g in ('B1A', 'B1B', 'B2'))
absent_only_from_B1B = (tally_B1A > 0) & (tally_B1B == 0) & (tally_B2 > 0)
(
    func_summary[absent_only_from_B1B]
    .sort_values(['starch'], ascending=[False])
)

# KOs

In [None]:
# Tally features per (genome, KO), then pivot to a genome x KO count matrix.
ko_tally = pd.read_sql("""
SELECT genome_id, ko_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_ko USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
# Align rows to the genomes in `mag`; genomes absent from the query become NaN rows.
data = (
    ko_tally['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)
)

# KO descriptions, restricted to the KOs that appear as columns of `data`.
ko_annotation = pd.read_sql("""
SELECT ko_id AS func_id, description
FROM ko
                       """, con=con, index_col=['func_id'])
function = ko_annotation.loc[data.columns]

In [None]:
# Count functions detected (tally > 0) in only B1A, only B1B, or in both.
# Vectorized boolean logic replaces a row-wise `apply` that built one Series
# per function; the resulting counts are the same.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_A, in_B = b1_present.loc['B1A'], b1_present.loc['B1B']
pd.Series({ 'just_A': (in_A & ~in_B).sum()
          , 'just_B': (in_B & ~in_A).sum()
          , 'both': (in_A & in_B).sum()
          })

## Ordination

### All

In [None]:
# Features: KOs detected (tally > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]

# Square-root transform the tallies. Rows containing NaN (genomes added by
# reindexing `data` to `mag.index` that have no tallies) are dropped because
# PCA rejects NaN input; this matches the `.dropna()` already used for the
# carbohydrate-COG OPF ordination.
d = data[feats].apply(np.sqrt).dropna()

# Fit the PCA on the Ormerod genomes only, then project every remaining genome.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = 0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Per-function presence summary (presence = tally > 0).
present = data > 0

# 'all': number of genomes carrying each function.
func_summ = {'all': present.sum()}

# One column per Ormerod guild: fraction of that guild's genomes carrying the function.
for ormerod_guild in mag.ormerod_guild.dropna().unique():
    in_guild = mag.ormerod_guild == ormerod_guild
    func_summ[ormerod_guild] = present.loc[in_guild].mean()

# One column per genome type: number of genomes of that type carrying the function.
for genome_type in mag.genome_type.dropna().unique():
    of_type = mag.genome_type == genome_type
    func_summ[genome_type] = present.loc[of_type].sum()

# Attach the function annotations.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions detected in both B1A and B2; sorted by 'here' (ascending) then
# 'starch' (descending); first ten rows shown.
in_B1A_and_B2 = (data.loc[['B1A', 'B2']] > 0).all()
#             & (data.loc['B1B'] > 0)
(
    func_summary[in_B1A_and_B2]
    .sort_values(['here', 'starch'], ascending=[True, False])
    .head(10)
)

# GHs

In [None]:
# Tally features per (genome, GH domain), then pivot to a genome x domain count matrix.
data = (pd.read_sql("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE func_id LIKE 'GH%'
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Sum columns that share the same id once the text after the last '_' is
# stripped. Grouping is done on the transposed frame because
# `DataFrame.groupby(axis=1)` is deprecated (pandas 2.1) and removed in 3.0;
# the result is identical.
domain_group = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(domain_group).sum().T

# GH domain ids from the CAZy domain table, restricted to the columns of `data`.
function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

function = function.loc[data.columns]


# Combine domain subfamilies


## Ordination

### All

In [None]:
# Features: GH domains detected (tally > 0) in more than two genomes.
n_genomes_with = (data > 0).sum()
feats = data.columns[n_genomes_with > 2]

# Square-root transform the tallies, fit the PCA on the Ormerod genomes only,
# then project every genome onto those axes.
d = data[feats].apply(np.sqrt)
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
pc_names = ['PC{}'.format(i + 1) for i in range(fit.n_components_)]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels report the share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, one colour per guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine: black triangles, labelled to the right of each marker
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
text_offset_x, text_offset_y = 0.08, 0
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Label selected genomes, italicising those in `italic_list`
text_offset_x, text_offset_y = 0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

In [None]:
# Clustered heatmap of sqrt-transformed GH tallies (domains as rows, genomes as
# columns) for the 'ormerod' and 'here' genomes, keeping domains detected in
# more than one genome.
clustered_genomes = mag.genome_type.isin(['ormerod', 'here'])
recurrent_domains = (data > 0).sum() > 1
heatmap_values = data.loc[clustered_genomes, recurrent_domains].apply(np.sqrt).T
sns.clustermap(
    heatmap_values,
    robust=True,
    figsize=(10, 25),
    xticklabels=1,
    yticklabels=1,
)

## Frequencies

In [None]:
# Per-function presence summary (presence = tally > 0).
present = data > 0

# 'all': number of genomes carrying each function.
func_summ = {'all': present.sum()}

# One column per Ormerod guild: fraction of that guild's genomes carrying the function.
for ormerod_guild in mag.ormerod_guild.dropna().unique():
    in_guild = mag.ormerod_guild == ormerod_guild
    func_summ[ormerod_guild] = present.loc[in_guild].mean()

# One column per genome type: number of genomes of that type carrying the function.
for genome_type in mag.genome_type.dropna().unique():
    of_type = mag.genome_type == genome_type
    func_summ[genome_type] = present.loc[of_type].sum()

# Attach the function annotations.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions detected in both B1A and B2; sorted by 'here' (ascending) then
# 'starch' (descending); first ten rows shown.
in_B1A_and_B2 = (data.loc[['B1A', 'B2']] > 0).all()
#             & (data.loc['B1B'] > 0)
(
    func_summary[in_B1A_and_B2]
    .sort_values(['here', 'starch'], ascending=[True, False])
    .head(10)
)

# Non-cytosolic GHs

In [None]:
# List the distinct localization labels present in feature_localization.
pd.read_sql(
    "SELECT DISTINCT localization FROM feature_localization",
    con=con,
)

In [None]:
# Tally features per (genome, GH domain), counting only features whose
# localization is 'PP', 'OM' or 'IM', then pivot to a genome x domain count matrix.
data = (pd.read_sql("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id LIKE 'GH%'
  AND localization IN ('PP', 'OM', 'IM')
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Sum columns that share the same id once the text after the last '_' is
# stripped. Grouping is done on the transposed frame because
# `DataFrame.groupby(axis=1)` is deprecated (pandas 2.1) and removed in 3.0;
# the result is identical.
domain_group = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(domain_group).sum().T

# GH domain ids from the CAZy domain table, restricted to the columns of `data`.
function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

function = function.loc[data.columns]


# Combine domain subfamilies


## Ordination

### All

In [None]:
# Ordinate genomes on square-root-transformed tallies of the functions that
# have a nonzero tally in more than one genome.
feats = data.columns[(data > 0).sum() > 1]
d = data[feats].apply(np.sqrt)

# The PCA is fit on the 'ormerod' genomes only; all genomes are then projected.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis titles report each component's share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# 'ormerod' genomes, one color per guild.
for ormerod_guild in ['plant', 'host', 'starch']:
    in_guild = is_ormerod & (mag.ormerod_guild == ormerod_guild)
    d1 = ordin[in_guild]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# 'here' genomes: black triangles, each labeled just to the right of its marker.
text_offset_x, text_offset_y = 0.08, 0
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Reference genome: grey marker, kept out of the legend.
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Selected genomes get a right-aligned label; ids in italic_list are italicized.
text_offset_x, text_offset_y = 0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# Non-cytosolic GH containing OPFs

In [None]:
# Per-genome tallies keyed by OPF (func_id = opf_id) for features that carry a
# GH domain and whose localization is 'OM', 'PP' or 'IM'.
# NOTE(review): the join to feature_x_cazy_minimal_domain is not de-duplicated
# by feature, so a feature with several GH domains presumably adds one count
# per domain — confirm against the table's granularity.
exported_gh_opf_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """
data = (
    pd.read_sql(exported_gh_opf_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
)

# Empty annotation table, indexed by the OPF ids found above.
function = pd.DataFrame([], index=data.columns)

# Combine domain subfamilies


## Ordination

### All

In [None]:
# Ordinate genomes on square-root-transformed tallies of the functions that
# have a nonzero tally in more than two genomes; genomes with missing values
# are dropped first.
feats = data.columns[(data > 0).sum() > 2]
d = data[feats].dropna().apply(np.sqrt)

# The PCA is fit on the 'ormerod' genomes only; all genomes are then projected.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis titles report each component's share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# 'ormerod' genomes, one color per guild.
for ormerod_guild in ['plant', 'host', 'starch']:
    in_guild = is_ormerod & (mag.ormerod_guild == ormerod_guild)
    d1 = ordin[in_guild]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# 'here' genomes: black triangles, each labeled just to the right of its marker.
text_offset_x, text_offset_y = 0.08, 0
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Reference genome: grey marker, kept out of the legend.
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Selected genomes get a right-aligned label; ids in italic_list are italicized.
text_offset_x, text_offset_y = 0.08, -0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# Non-cytosolic GHs in PULs

In [None]:
# Per-genome tallies of GH domains for features that have a localization of
# 'OM', 'PP' or 'IM', belong to an OPF, and lie at a distance below 10000 from
# a seed feature listed in the susC table.
data = (pd.read_sql("""
SELECT DISTINCT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN (SELECT feature_id AS seed_id FROM susC) USING (seed_id)
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Align rows with the genome metadata table and drop genomes the query did not return.
data = data.reindex(mag.index).dropna()
# Combine domain subfamilies: drop the text after the last underscore
# (e.g. 'GH13_1' -> 'GH13') and sum the columns that share a name.
# The grouping runs on the transposed frame because DataFrame.groupby(axis=1)
# is deprecated in recent pandas; the result is the same table.
data = data.T.groupby(data.columns.map(lambda x: x.rsplit('_', 1)[0])).sum().T

# Empty annotation table, indexed by the collapsed family ids.
function = pd.DataFrame([], index=data.columns)

# Combine domain subfamilies


## Ordination

### All

In [None]:
# Ordinate genomes on presence/absence (tally > 0) of the functions that have a
# nonzero tally in more than two genomes; genomes with missing values are
# dropped first.
feats = data.columns[(data > 0).sum() > 2]
d = data[feats].dropna().apply(lambda x: x > 0)

# The PCA is fit on the 'ormerod' genomes only; all genomes are then projected.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis titles report each component's share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# 'ormerod' genomes, one color per guild.
for ormerod_guild in ['plant', 'host', 'starch']:
    in_guild = is_ormerod & (mag.ormerod_guild == ormerod_guild)
    d1 = ordin[in_guild]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# 'here' genomes: black triangles, each labeled just to the right of its marker.
text_offset_x, text_offset_y = 0.08, 0
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Reference genome: grey marker, kept out of the legend.
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Selected genomes get a right-aligned label; ids in italic_list are italicized.
text_offset_x, text_offset_y = 0.08, -0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# OPFs in PULs

In [None]:
# Per-genome tallies keyed by OPF for GH-domain features that belong to an OPF
# and lie at a distance below 10000 from a seed feature listed in the susC
# table. The localization filter is commented out inside the query, but the
# inner join to feature_localization is still in effect.
pul_gh_opf_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN (SELECT feature_id AS seed_id FROM susC) USING (seed_id)
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
--  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
data = (
    pd.read_sql(pul_gh_opf_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# Empty annotation table, indexed by the OPF ids found above.
function = pd.DataFrame([], index=data.columns)

# Combine domain subfamilies


## Ordination

### All

In [None]:
# Number of genomes with a nonzero tally for each function.
data.gt(0).sum()

In [None]:
# Ordinate genomes on square-root-transformed tallies of the functions that
# have a nonzero tally in more than two genomes; genomes with missing values
# are dropped first.
feats = data.columns[(data > 0).sum() > 2]
d = data[feats].dropna().apply(np.sqrt)

# The PCA is fit on the 'ormerod' genomes only; all genomes are then projected.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis titles report each component's share of variance; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# 'ormerod' genomes, one color per guild.
for ormerod_guild in ['plant', 'host', 'starch']:
    in_guild = is_ormerod & (mag.ormerod_guild == ormerod_guild)
    d1 = ordin[in_guild]
    ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

# 'here' genomes: black triangles, each labeled just to the right of its marker.
text_offset_x, text_offset_y = 0.08, 0
here_ordin = ordin[is_here]
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1, label='__nolegend__')
for genome_id, coords in here_ordin.iterrows():
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='left', va='center', fontsize=12)

# Reference genome: grey marker, kept out of the legend.
ref_mags = ['Muribaculum_intestinale_DSM_28989T']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1, label='__nolegend__')

# Selected genomes get a right-aligned label; ids in italic_list are italicized.
text_offset_x, text_offset_y = 0.08, -0.1
labeled_mags = ['Muribaculum_intestinale_DSM_28989T', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    textstyle = 'italic' if genome_id in italic_list else 'normal'
    label_xy = (coords[x] + text_offset_x, coords[y] + text_offset_y)
    ax.annotate(rename_mag(genome_id), xy=label_xy,
                ha='right', va='center', fontsize=12, style=textstyle)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# PROTOTYPE Manuscript Figure

In [None]:
# Per-genome tallies keyed by OPF for features that carry a GH domain.
# The domain table is joined directly, so the tally follows the number of
# matching domain rows (the carb_opf_count2 cell describes this as counting
# domains rather than features).
carb_opf_query = """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """
carb_opf_count = (
    pd.read_sql(carb_opf_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# Per-genome tallies of GH domains, one column per domain id.
gh_count = (pd.read_sql("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE domain_id LIKE 'GH%'
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Combine domain subfamilies: drop the text after the last underscore
# (e.g. 'GH13_1' -> 'GH13') and sum the columns that share a name.
# The grouping runs on the transposed frame because DataFrame.groupby(axis=1)
# is deprecated in recent pandas; the result is the same table.
gh_count = gh_count.T.groupby(gh_count.columns.map(lambda x: x.rsplit('_', 1)[0])).sum().T
# Align rows with the genome metadata table and drop genomes the query did not return.
gh_count = gh_count.reindex(mag.index).dropna()

# Per-genome tallies for a fixed list of eight COG ids. Genomes without any of
# these COGs get zeros instead of being dropped.
ormerod_cog_query = """
SELECT genome_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_cog USING (feature_id)
WHERE cog_id IN ('COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119')
GROUP BY genome_id, func_id
                    """
ormerod_cog_count = (
    pd.read_sql(ormerod_cog_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .fillna(0)
)


In [None]:
# Same tally as carb_opf_count, but the GH domain table is first reduced to
# distinct feature ids, so each feature is counted once per OPF instead of
# once per GH domain.
carb_opf2_query = """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature
JOIN sequence USING (sequence_id)
JOIN (SELECT DISTINCT feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
     ) USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE opf_id NOT NULL
GROUP BY genome_id, func_id
                    """
carb_opf_count2 = (
    pd.read_sql(carb_opf2_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# As carb_opf_count2, further limited to features whose localization is
# 'PP', 'OM' or 'IM'. The WHERE clause discards rows without a localization,
# so the LEFT JOIN behaves like an inner join here.
carb_opf3_query = """
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature
JOIN sequence USING (sequence_id)
JOIN (SELECT DISTINCT feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
     ) USING (feature_id)
JOIN feature_to_opf USING (feature_id)
LEFT JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
  AND localization IN ('PP', 'OM', 'IM')
GROUP BY genome_id, func_id
                    """
carb_opf_count3 = (
    pd.read_sql(carb_opf3_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# OPF tallies limited to features that contain a GH domain, have a localization
# of 'OM', 'PP' or 'IM', and lie at a distance below 10000 from a seed feature.
# NOTE(review): the seeds come from the susC table here, whereas later cells
# seed on pul_susC — confirm which table is intended for "PUL susC".
carb_opf4_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id AS seed_id FROM susC) USING (seed_id)
JOIN (SELECT DISTINCT feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
carb_opf_count4 = (
    pd.read_sql(carb_opf4_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# GH domain tallies (one column per domain id) for features that have a
# localization of 'OM', 'PP' or 'IM', belong to an OPF, and lie at a distance
# below 10000 from a seed feature listed in pul_susC.
carb_gh5_query = """
SELECT DISTINCT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id AS seed_id FROM pul_susC) USING (seed_id)
JOIN (SELECT feature_id, domain_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
carb_gh_count5 = (
    pd.read_sql(carb_gh5_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# GH and CBM domain tallies (one column per domain id) for features that have
# a localization of 'OM', 'PP' or 'IM', belong to an OPF, and lie at a distance
# below 10000 from a seed feature listed in pul_susC.
carb_domain5_query = """
SELECT DISTINCT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id AS seed_id FROM pul_susC) USING (seed_id)
JOIN (SELECT feature_id, domain_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
carb_domain_count5 = (
    pd.read_sql(carb_domain5_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# Sanity check for the SQL expression that reduces a CAZy domain id to its
# family: it takes the text before the first underscore, or the whole id when
# there is no underscore. The assertion confirms no family name keeps one.
cazy_family_query = """
SELECT feature_id
     , SUBSTR(domain_id, 0,
              CASE WHEN INSTR(domain_id, '_')
                   THEN INSTR(domain_id, '_')
                   ELSE LENGTH(domain_id) + 1
              END)
            AS cazy_family
FROM feature_x_cazy_minimal_domain
"""
z = pd.read_sql(cazy_family_query, con=con)
# Evaluated but not displayed, because it is not the last expression of the cell.
z.cazy_family.value_counts().head()
assert not any('_' in family for family in z.cazy_family)

In [None]:
# OPF tallies for features that carry a GH or CBM domain, have a localization
# of 'OM', 'PP' or 'IM', and lie at a distance below 10000 from a seed feature
# listed in pul_susC.
# NOTE(review): the domain subquery keeps one row per (feature, domain), so a
# feature with several matching domains presumably counts more than once —
# confirm whether that is intended.
carb_opf6_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id AS seed_id FROM pul_susC) USING (seed_id)
JOIN (SELECT feature_id, domain_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
carb_opf_count6 = (
    pd.read_sql(carb_opf6_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# Keep only the OPFs with a nonzero tally in more than one genome.
in_multiple_mags = (carb_opf_count6 > 0).sum() > 1
carb_opf_count6_multimag = carb_opf_count6.loc[:, in_multiple_mags]

In [None]:
# Glycoside hydrolase tallies as in Ormerod, except that subfamilies such as
# GH13_1 and GH13_2 stay in separate columns.
# Note: this rebinds gh_count, which an earlier cell defined with subfamilies combined.
gh_domain_query = """
SELECT DISTINCT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature
JOIN sequence USING (sequence_id)
JOIN (SELECT DISTINCT feature_id, domain_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
     ) USING (feature_id)
GROUP BY genome_id, func_id
                    """
gh_count = (
    pd.read_sql(gh_domain_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# GH family tallies (subfamily suffix removed inside the query), limited to
# features whose localization is 'OM', 'PP' or 'IM'.
exported_gh_family_query = """
SELECT DISTINCT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM feature
JOIN sequence USING (sequence_id)
JOIN (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
GROUP BY genome_id, func_id
                    """
gh_count5 = (
    pd.read_sql(exported_gh_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# OPF tallies for features that carry a GH-family domain, have a localization
# of 'OM', 'PP' or 'IM', and lie at a distance below 10000 from a seed feature
# listed in pul_susC.
# NOTE(review): the domain subquery is not de-duplicated by feature, so a
# feature with several GH domains presumably counts more than once — confirm.
pul_exported_gh_opf_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT feature_id AS seed_id FROM pul_susC) USING (seed_id)
JOIN (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE distance < 10000
  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """
gh_count4 = (
    pd.read_sql(pul_exported_gh_opf_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# OPF tallies for features that carry a GH-family domain and have a
# localization of 'OM', 'PP' or 'IM' (no distance filter).
# NOTE(review): the domain subquery is not de-duplicated by feature, so a
# feature with several GH domains presumably counts more than once — confirm.
exported_gh_opf_family_query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature
JOIN sequence USING (sequence_id)
JOIN (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      ) USING (feature_id)
JOIN feature_to_opf USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """
gh_count6 = (
    pd.read_sql(exported_gh_opf_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

# POTENTIAL Manuscript Figures

## GH Domain Annotations

In [None]:
# GH family tallies as in Ormerod: the subfamily suffix is removed inside the
# query, so e.g. GH13_1 and GH13_2 fall into the same column. The PUL-distance
# and localization filters are present in the query text but commented out.
all_gh_family_query = """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
-- WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """
gh_count2 = (
    pd.read_sql(all_gh_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# GH family tallies with both filters active: the feature must lie at a
# distance below 10000 from a pul_susC seed and have a localization of
# 'OM', 'IM' or 'PP'.
pul_exported_gh_family_query = """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      )
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """
gh_count3 = (
    pd.read_sql(pul_exported_gh_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# GH family tallies limited to features whose localization is 'OM', 'IM' or
# 'PP'. The PUL-distance filter is present in the query text but commented out.
exported_only_gh_family_query = """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      )
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """
gh_count7 = (
    pd.read_sql(exported_only_gh_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# GH family tallies limited to features that lie at a distance below 10000
# from a pul_susC seed. The localization filter is present in the query text
# but commented out.
pul_only_gh_family_query = """
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%'
      )
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
-- WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """
gh_count8 = (
    pd.read_sql(pul_only_gh_family_query, con=con, index_col=['genome_id', 'func_id'])['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # align rows with the genome metadata table
    .dropna()            # drop genomes the query did not return
)

In [None]:
# Four-panel PCA figure: the same ordination recipe applied to four GH-family
# tally tables (all / exported / in PUL / exported + in PUL).
fig, axs = plt.subplots(2, 2, figsize=(fullwidth, fullwidth))
np.random.seed(10)  # makes the random label offsets drawn below repeatable

# Genomes included in every panel; absent rows are filled with zeros.
mags_list = [
    'GP1', 'GP2', 'GP3', 'GP4',
    'H1', 'H10', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8', 'H9',
    'Homeothermus_arabinoxylanisolvens',
    'K1', 'K10',
    'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
    'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Muribaculum_intestinale_DSM_28989T',
    'B1A', 'B1B', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Amulumruptor_caecigallinarius',
]

# Reference genomes are drawn in grey; they and two extra genomes are labeled
# in addition to the 'here' genomes.
ref_mags = [
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
other_labeled_mags = ['M6', 'Homeothermus_arabinoxylanisolvens']

# Label offsets are this fraction of the 'ormerod' point spread on each axis.
scale = 0.05

is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
labeled_ids = list(mag[is_here].index) + ref_mags + other_labeled_mags

panels = [
    ('A', 'All', 'PC1', 'PC2', gh_count2, axs[0, 0]),
    ('B', 'Exported', 'PC1', 'PC2', gh_count7, axs[0, 1]),
    ('C', 'In PUL', 'PC1', 'PC2', gh_count8, axs[1, 0]),
    ('D', 'Exported + In PUL', 'PC1', 'PC2', gh_count3, axs[1, 1]),
]

for panel, title, x, y, d, ax in panels:
    # Keep families with a nonzero tally in at least two genomes, then
    # restrict to mags_list. Tallies are used untransformed.
    feats = d.columns[(d > 0).sum() >= 2]
    d = d[feats].reindex(mags_list).fillna(0)

    # Fit on the 'ormerod' genomes only, then project every genome.
    fit = PCA().fit(d.loc[is_ormerod])
    ordin = pd.DataFrame(fit.transform(d), index=d.index)
    ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
    variance = fit.explained_variance_
    perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))

    # 'ormerod' genomes, one color per guild.
    for ormerod_guild in ['plant', 'host', 'starch']:
        in_guild = is_ormerod & (mag.ormerod_guild == ormerod_guild)
        d1 = ordin[in_guild]
        ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

    # 'here' genomes: black triangles, kept out of the legend.
    ax.scatter(x, y, data=ordin[is_here],
               color='black', marker='^', alpha=1, label='__nolegend__')

    # Reference genomes: grey markers, kept out of the legend.
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1, label='__nolegend__')

    # Offset scale derived from the spread of the 'ormerod' points.
    ormerod_ordin = ordin[is_ormerod]
    scale_x = (ormerod_ordin[x].max() - ormerod_ordin[x].min()) * scale
    scale_y = (ormerod_ordin[y].max() - ormerod_ordin[y].min()) * scale

    for genome_id in labeled_ids:
        fontstyle = 'italic' if genome_id in italic_list else None
        coords = ordin.loc[genome_id]
        # Random unit-length offset pointing to the right of the marker:
        # y is drawn uniformly from [-1, 1) and x is its non-negative complement.
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        label_xy = (coords[x] + text_offset_x * scale_x,
                    coords[y] + text_offset_y * scale_y)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=label_xy,
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    ax.set_title(title)

axs[0, 0].legend(loc='upper right')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

## OPF Annotations

In [None]:
# Glycoside hydrolase OPFs: per-genome tallies of features that carry a GH
# domain, keyed by the OPF each feature belongs to.
# The GH subquery selects DISTINCT feature ids so that a feature with several
# GH domains is counted once, matching the fix applied for carb_opf_count2.
# The PUL-distance and localization filters stay commented out in the query.
gh_opf_count1 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT DISTINCT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Align rows with the genome metadata table and drop genomes the query did not return.
gh_opf_count1 = gh_opf_count1.reindex(mag.index).dropna()

# Exported glycoside hydrolase OPFs: as gh_opf_count1, limited to features
# whose localization is 'OM', 'IM' or 'PP'.
# The GH subquery selects DISTINCT feature ids so that a feature with several
# GH domains is counted once, matching the fix applied for carb_opf_count2.
# The PUL-distance filter stays commented out in the query.
gh_opf_count2 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT DISTINCT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Align rows with the genome metadata table and drop genomes the query did not return.
gh_opf_count2 = gh_opf_count2.reindex(mag.index).dropna()

# PUL encoded glycoside hydrolase OPFs.
gh_opf_count3 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
      )
JOIN feature_to_opf USING (feature_id)
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filteropf_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
gh_opf_count3 = gh_opf_count3.reindex(mag.index).dropna()

# PUL encoded, exported glycoside hydrolase OPFs.
gh_opf_count4 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%'
      )
JOIN feature_to_opf USING (feature_id)
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filteropf_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
gh_opf_count4 = gh_opf_count4.reindex(mag.index).dropna()

In [None]:
# Four-panel PCA ordination of the GH OPF count tables, one panel per filter.
fig, axs = plt.subplots(2, 2, figsize=(fullwidth, fullwidth))
np.random.seed(10)  # fixes the sequence of label-offset draws below

# Genome groupings shared by every panel.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
ref_mags = [
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
other_labeled_mags = ['M6', 'Homeothermus_arabinoxylanisolvens']
labeled_mags = mag[is_here].index.tolist() + ref_mags + other_labeled_mags

panel_specs = [
    ('A', 'All', gh_opf_count1),
    ('B', 'Exported', gh_opf_count2),
    ('C', 'In PUL', gh_opf_count3),
    ('D', 'Exported + In PUL', gh_opf_count4),
]
axis_label = '{} ({:.0%})'

# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for (panel, title, d), ax in zip(panel_specs, axs.flat):
    x, y = 'PC1', 'PC2'

    # Keep OPFs present in at least two genomes, then cube-root the counts.
    feats = d.columns[d.gt(0).sum().ge(2)]
    d = np.cbrt(d[feats].dropna())

    # The axes are fit on the Ormerod genomes only; every genome in `d` is
    # then projected onto them.
    fit = PCA().fit(d.loc[is_ormerod])
    scores = fit.transform(d)
    ordin = pd.DataFrame(
        scores,
        index=d.index,
        columns=['PC{}'.format(k + 1) for k in range(scores.shape[1])]
    )

    variance = fit.explained_variance_
    perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)
    ax.set_xlabel(axis_label.format(x, perc_explained[x]))
    ax.set_ylabel(axis_label.format(y, perc_explained[y]))

    # Ormerod genomes, one colour (and legend entry) per guild.
    for ormerod_guild in ('plant', 'host', 'starch'):
        d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

    # Genomes of type 'here': black triangles, kept out of the legend.
    ax.scatter(x, y, data=ordin[is_here],
               color='black', marker='^', alpha=1, label='__nolegend__')

    # Reference genomes: grey points, kept out of the legend.
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1, label='__nolegend__')

    # Label offsets are `scale` times the Ormerod range along each axis; with
    # scale = 0 every label is centred on its point.
    scale = 0
    ormerod_ordin = ordin[is_ormerod]
    scale_x = (ormerod_ordin[x].max() - ormerod_ordin[x].min()) * scale
    scale_y = (ormerod_ordin[y].max() - ormerod_ordin[y].min()) * scale

    for genome_id in labeled_mags:
        fontstyle = 'italic' if genome_id in italic_list else None
        coords = ordin.loc[genome_id]
        # Random direction on the right half of the unit circle (x >= 0).
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        label_xy = (coords[x] + text_offset_x * scale_x,
                    coords[y] + text_offset_y * scale_y)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=label_xy,
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    ax.set_title(title)

# Only the first panel carries the guild legend.
axs[0, 0].legend(loc='upper right')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

## Ormerod COG Annotations

In [None]:
# Per-genome tallies of features assigned to eight selected COGs.  Rows are
# reindexed to `mag.index`; genomes the query did not return are filled with 0
# rather than dropped.
cog_tally_sql = """
SELECT genome_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature_to_cog
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_localization USING (feature_id)
WHERE cog_id IN ('COG3507', 'COG3866', 'COG4677',
                 'COG2730', 'COG3693', 'COG0366',
                 'COG3525', 'COG3119')
GROUP BY genome_id, func_id
                    """
cog_tally_long = pd.read_sql(cog_tally_sql, con=con,
                             index_col=['genome_id', 'func_id'])
cog_count1 = (
    cog_tally_long['tally']
    .unstack('func_id', fill_value=0)  # wide: genome_id x cog_id
    .reindex(mag.index)
    .fillna(0)
)


In [None]:
# Single-panel PCA ordination of the selected-COG count table (cog_count1).
# plt.subplots(1, 1) returns one Axes object (not an array), so everything in
# this cell, including the legend, must go through `ax`.
fig, ax = plt.subplots(1, 1, figsize=(halfwidth, halfwidth))
np.random.seed(10)  # fixes the sequence of label-offset draws below

for panel, title, x, y, d, ax in [('A', 'All', 'PC1', 'PC2', cog_count1, ax)]:
    # Every COG column is used as-is: no prevalence filter, no transform.
    feats = d.columns
    d = d[feats].dropna()

    # The axes are fit on the Ormerod genomes only; every genome in `d` is
    # then projected onto them.
    fit = PCA().fit(d.loc[mag.genome_type.isin(['ormerod'])])
    ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')

    perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))

    # Plot Ormerod genomes, one colour (and legend entry) per guild.
    for ormerod_guild in ['plant', 'host', 'starch']:
        d1 = ordin[(mag.genome_type == 'ormerod') & (mag.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1,
                   c=color_map[ormerod_guild], label=ormerod_guild)

    # Plot genomes of type 'here' (black triangles, kept out of the legend).
    ax.scatter(x, y, data=ordin[(mag.genome_type == 'here')],
               color='black', marker='^', alpha=1,
               label='__nolegend__')

    # Plot reference genomes (grey points, kept out of the legend).
    ref_mags = [ 'Muribaculum_intestinale_DSM_28989T'
               , 'Duncaniella_muris_DSM_103720T'
               , 'Duncaniella_freteri_DSM_108168T'
               , 'Duncaniella_dubosii_DSM_107170T'
               , 'Paramuribaculum_intestinale_DSM_100749T'
               , 'Amulumruptor_caecigallinarius'
               ]
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1,
               label='__nolegend__')

    other_labeled_mags = [ 'M6'
                         , 'Homeothermus_arabinoxylanisolvens'
                         ]

    # Label offsets are `scale` times the Ormerod range along each axis; with
    # scale = 0 every label is centred on its point.
    scale = 0
    scale_x = (ordin[(mag.genome_type == 'ormerod')][x].max()
               - ordin[(mag.genome_type == 'ormerod')][x].min()) * scale
    scale_y = (ordin[(mag.genome_type == 'ormerod')][y].max()
               - ordin[(mag.genome_type == 'ormerod')][y].min()) * scale
    for genome_id in list(mag[mag.genome_type == 'here'].index) + ref_mags + other_labeled_mags:
        if genome_id in italic_list:
            fontstyle='italic'
        else:
            fontstyle=None
        coords = ordin.loc[genome_id]
        # Random direction on the right half of the unit circle (x >= 0).
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=(coords[x] + text_offset_x * scale_x, coords[y] + text_offset_y * scale_y),
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    ax.set_title(title)

# Fix: this used to call axs[0,0].legend(...), i.e. the 2x2 axes array left in
# the kernel by an earlier figure cell, so the legend never landed on this
# figure (and the cell failed with NameError when run on its own).
ax.legend(loc='upper right')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

## GH+CBM OPFs

In [None]:
# Build four genome x OPF count tables for features that carry a glycoside
# hydrolase (GH) or carbohydrate-binding module (CBM) CAZy domain
# (domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%').
#
# Every query returns one row per (genome_id, opf_id) with a COUNT(feature_id)
# tally.  The long result is unstacked into a wide table (rows = genome_id,
# columns = opf_id, absent pairs filled with 0) and then reindexed to
# `mag.index`.  Genomes that the query did not return become all-NaN rows under
# the reindex and are removed again by `.dropna()`.
#
# NOTE(review): the inner SELECT on feature_x_cazy_minimal_domain has no
# DISTINCT, so a feature with several matching domain rows (e.g. one GH and one
# CBM) would contribute several rows to COUNT(feature_id) -- confirm whether
# that is intended.

# 1) All GH/CBM OPFs: neither the PUL join nor the localization filter is
#    active.
cazy_opf_count1 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_opf_count1 = cazy_opf_count1.reindex(mag.index).dropna()

# 2) Exported GH/CBM OPFs: only features whose
#    feature_localization.localization is 'OM', 'IM' or 'PP'.
cazy_opf_count2 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
-- JOIN (SELECT DISTINCT feature_id, genome_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_opf_count2 = cazy_opf_count2.reindex(mag.index).dropna()

# 3) PUL-encoded GH/CBM OPFs: only features that feature_distance places at
#    distance < 10000 from a pul_susC seed feature.  genome_id comes from the
#    PUL sub-select here rather than from a top-level feature/sequence join.
#    NOTE(review): feature_localization is inner-joined although the
#    localization filter is commented out (query 1 leaves that join out);
#    presumably this drops features without a feature_localization row --
#    confirm that this is intended.
cazy_opf_count3 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
      )
JOIN feature_to_opf USING (feature_id)
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
--   AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filteropf_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_opf_count3 = cazy_opf_count3.reindex(mag.index).dropna()

# 4) PUL-encoded and exported GH/CBM OPFs: the PUL distance join of query 3
#    combined with the localization filter of query 2.
cazy_opf_count4 = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id
      FROM feature_x_cazy_minimal_domain
      WHERE domain_id LIKE 'GH%' OR domain_id LIKE 'CBM%'
      )
JOIN feature_to_opf USING (feature_id)
JOIN (SELECT DISTINCT feature_id, genome_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE opf_id NOT NULL
  AND localization IN ('OM', 'IM', 'PP')  -- Choose localization filteropf_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_opf_count4 = cazy_opf_count4.reindex(mag.index).dropna()

In [None]:
# Four-panel PCA ordination of GH/CBM OPF presence/absence, one panel per
# filter.
fig, axs = plt.subplots(2, 2, figsize=(fullwidth, fullwidth))
np.random.seed(10)  # fixes the sequence of label-offset draws below

# Genomes included in the ordination; ones missing from a count table are
# added as all-zero rows by the reindex inside the loop.
mags_list = [
    'GP1', 'GP2', 'GP3', 'GP4',
    'H1', 'H10', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8', 'H9',
    'Homeothermus_arabinoxylanisolvens',
    'K1', 'K10',
    'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
    'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Muribaculum_intestinale_DSM_28989T',
    'B1A', 'B1B', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Amulumruptor_caecigallinarius',
]

# Genome groupings shared by every panel.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
ref_mags = [
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
other_labeled_mags = ['M6', 'Homeothermus_arabinoxylanisolvens']
labeled_mags = mag[is_here].index.tolist() + ref_mags + other_labeled_mags

panel_specs = [
    ('A', 'All', cazy_opf_count1),
    ('B', 'Exported', cazy_opf_count2),
    ('C', 'In PUL', cazy_opf_count3),
    ('D', 'Exported + In PUL', cazy_opf_count4),
]
axis_label = '{} ({:.0%})'

# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for (panel, title, d), ax in zip(panel_specs, axs.flat):
    x, y = 'PC1', 'PC2'

    # Keep OPFs present in at least two genomes, restrict/extend the rows to
    # mags_list, and reduce the counts to presence/absence.
    feats = d.columns[d.gt(0).sum().ge(2)]
    d = d[feats].reindex(mags_list).fillna(0).gt(0)

    # The axes are fit on the Ormerod genomes only; every genome in `d` is
    # then projected onto them.
    fit = PCA().fit(d.loc[is_ormerod])
    scores = fit.transform(d)
    ordin = pd.DataFrame(
        scores,
        index=d.index,
        columns=['PC{}'.format(k + 1) for k in range(scores.shape[1])]
    )

    variance = fit.explained_variance_
    perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)
    ax.set_xlabel(axis_label.format(x, perc_explained[x]))
    ax.set_ylabel(axis_label.format(y, perc_explained[y]))

    # Ormerod genomes, one colour (and legend entry) per guild.
    for ormerod_guild in ('plant', 'host', 'starch'):
        d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

    # Genomes of type 'here': black triangles, kept out of the legend.
    ax.scatter(x, y, data=ordin[is_here],
               color='black', marker='^', alpha=1, label='__nolegend__')

    # Reference genomes: grey points, kept out of the legend.
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1, label='__nolegend__')

    # Labels are pushed away from their point by `scale` times the Ormerod
    # range along each axis.
    scale = 0.05
    ormerod_ordin = ordin[is_ormerod]
    scale_x = (ormerod_ordin[x].max() - ormerod_ordin[x].min()) * scale
    scale_y = (ormerod_ordin[y].max() - ormerod_ordin[y].min()) * scale

    for genome_id in labeled_mags:
        fontstyle = 'italic' if genome_id in italic_list else None
        coords = ordin.loc[genome_id]
        # Random direction on the right half of the unit circle (x >= 0).
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        label_xy = (coords[x] + text_offset_x * scale_x,
                    coords[y] + text_offset_y * scale_y)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=label_xy,
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    ax.set_title(title)

# Only the first panel carries the guild legend.
axs[0, 0].legend(loc='upper right')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

## GH+CBM Domain Annotations

In [None]:
# Build four genome x CAZy-family count tables for glycoside hydrolase (GH)
# and carbohydrate-binding module (CBM) domains.  Unlike the OPF tables, the
# grouping key here is `cazy_family`, not opf_id.
#
# `cazy_family` is derived from domain_id with SUBSTR/INSTR on '_' --
# presumably the part of domain_id before the first '_' (the whole domain_id
# when it contains no '_'); verify against SQLite's SUBSTR semantics for a
# start position of 0.
#
# Every query returns one row per (genome_id, cazy_family) with a
# COUNT(feature_id) tally, i.e. one count per matching row of
# feature_x_cazy_minimal_domain.  The long result is unstacked into a wide
# table (rows = genome_id, columns = cazy_family, absent pairs filled with 0)
# and then reindexed to `mag.index`.  Genomes that the query did not return
# become all-NaN rows under the reindex and are removed again by `.dropna()`.

# 1) All GH/CBM domains: neither the PUL join nor the localization filter is
#    active.
cazy_count1 = (pd.read_sql("""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
-- WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_count1 = cazy_count1.reindex(mag.index).dropna()

# 2) Exported GH/CBM domains: only features whose
#    feature_localization.localization is 'OM', 'IM' or 'PP'.
cazy_count2 = (pd.read_sql("""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
-- JOIN (SELECT DISTINCT feature_id
--       FROM (SELECT feature_id AS seed_id FROM pul_susC)
--       JOIN feature_distance USING (seed_id)
--       JOIN feature USING (feature_id)
--       JOIN sequence USING (sequence_id)
--       WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
--      ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_count2 = cazy_count2.reindex(mag.index).dropna()

# 3) PUL-encoded GH/CBM domains: only features that feature_distance places at
#    distance < 10000 from a pul_susC seed feature; no localization join.
cazy_count3 = (pd.read_sql("""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT DISTINCT feature_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
-- JOIN feature_localization USING (feature_id)
-- WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_count3 = cazy_count3.reindex(mag.index).dropna()

# 4) PUL-encoded and exported GH/CBM domains: the PUL distance join of query 3
#    combined with the localization filter of query 2.
cazy_count4 = (pd.read_sql("""
SELECT genome_id, cazy_family AS func_id, COUNT(feature_id) AS tally
FROM (SELECT
          feature_id,
          SUBSTR(domain_id, 0,
                 CASE WHEN INSTR(domain_id, '_')
                      THEN INSTR(domain_id, '_')
                      ELSE LENGTH(domain_id) + 1
                 END) AS cazy_family
      FROM feature_x_cazy_minimal_domain
      WHERE cazy_family LIKE 'GH%' OR cazy_family LIKE 'CBM%'
      )
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN (SELECT DISTINCT feature_id
      FROM (SELECT feature_id AS seed_id FROM pul_susC)
      JOIN feature_distance USING (seed_id)
      JOIN feature USING (feature_id)
      JOIN sequence USING (sequence_id)
      WHERE distance < 10000  -- Choose filter distance from susC or remove entirely
     ) USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE localization IN ('OM', 'IM', 'PP')  -- Choose localization filter
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
cazy_count4 = cazy_count4.reindex(mag.index).dropna()

In [None]:
# Four-panel PCA ordination of GH/CBM family presence/absence, one panel per
# filter.
fig, axs = plt.subplots(2, 2, figsize=(fullwidth, fullwidth))
np.random.seed(10)  # fixes the sequence of label-offset draws below

# Genomes included in the ordination; ones missing from a count table are
# added as all-zero rows by the reindex inside the loop.
mags_list = [
    'GP1', 'GP2', 'GP3', 'GP4',
    'H1', 'H10', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8', 'H9',
    'Homeothermus_arabinoxylanisolvens',
    'K1', 'K10',
    'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
    'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Muribaculum_intestinale_DSM_28989T',
    'B1A', 'B1B', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Amulumruptor_caecigallinarius',
]

# Genome groupings shared by every panel.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
ref_mags = [
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
other_labeled_mags = ['M6', 'Homeothermus_arabinoxylanisolvens']
labeled_mags = mag[is_here].index.tolist() + ref_mags + other_labeled_mags

panel_specs = [
    ('A', 'All', cazy_count1),
    ('B', 'Exported', cazy_count2),
    ('C', 'In PUL', cazy_count3),
    ('D', 'Exported + In PUL', cazy_count4),
]
axis_label = '{} ({:.0%})'

# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for (panel, title, d), ax in zip(panel_specs, axs.flat):
    x, y = 'PC1', 'PC2'

    # Keep families present in at least two genomes, restrict/extend the rows
    # to mags_list, and reduce the counts to presence/absence.
    feats = d.columns[d.gt(0).sum().ge(2)]
    d = d[feats].reindex(mags_list).fillna(0).gt(0)

    # The axes are fit on the Ormerod genomes only; every genome in `d` is
    # then projected onto them.
    fit = PCA().fit(d.loc[is_ormerod])
    scores = fit.transform(d)
    ordin = pd.DataFrame(
        scores,
        index=d.index,
        columns=['PC{}'.format(k + 1) for k in range(scores.shape[1])]
    )

    variance = fit.explained_variance_
    perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)
    ax.set_xlabel(axis_label.format(x, perc_explained[x]))
    ax.set_ylabel(axis_label.format(y, perc_explained[y]))

    # Ormerod genomes, one colour (and legend entry) per guild.
    for ormerod_guild in ('plant', 'host', 'starch'):
        d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1, c=color_map[ormerod_guild], label=ormerod_guild)

    # Genomes of type 'here': black triangles, kept out of the legend.
    ax.scatter(x, y, data=ordin[is_here],
               color='black', marker='^', alpha=1, label='__nolegend__')

    # Reference genomes: grey points, kept out of the legend.
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1, label='__nolegend__')

    # Labels are pushed away from their point by `scale` times the Ormerod
    # range along each axis.
    scale = 0.05
    ormerod_ordin = ordin[is_ormerod]
    scale_x = (ormerod_ordin[x].max() - ormerod_ordin[x].min()) * scale
    scale_y = (ormerod_ordin[y].max() - ormerod_ordin[y].min()) * scale

    for genome_id in labeled_mags:
        fontstyle = 'italic' if genome_id in italic_list else None
        coords = ordin.loc[genome_id]
        # Random direction on the right half of the unit circle (x >= 0).
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        label_xy = (coords[x] + text_offset_x * scale_x,
                    coords[y] + text_offset_y * scale_y)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=label_xy,
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    ax.set_title(title)

# Only the first panel carries the guild legend.
axs[0, 0].legend(loc='upper right')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

# Manuscript Figure

In [None]:
# Manuscript figure: three stacked PCA ordinations (panels A-C) on
# presence/absence data.
fig, axs = plt.subplots(nrows=3, figsize=(halfwidth, 10))
np.random.seed(10)  # fixes the sequence of label-offset draws below

# Genomes included in the ordination; ones missing from a count table are
# added as all-zero rows by the reindex inside the loop.
mags_list = [
    'GP1', 'GP2', 'GP3', 'GP4',
    'H1', 'H10', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8', 'H9',
    'Homeothermus_arabinoxylanisolvens',
    'K1', 'K10',
    'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
    'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Muribaculum_intestinale_DSM_28989T',
    'B1A', 'B1B', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Amulumruptor_caecigallinarius',
]

# Genome groupings shared by every panel.
is_ormerod = mag.genome_type == 'ormerod'
is_here = mag.genome_type == 'here'
ref_mags = [
    'Muribaculum_intestinale_DSM_28989T',
    'Duncaniella_muris_DSM_103720T',
    'Duncaniella_freteri_DSM_108168T',
    'Duncaniella_dubosii_DSM_107170T',
    'Paramuribaculum_intestinale_DSM_100749T',
    'Amulumruptor_caecigallinarius',
]
other_labeled_mags = ['M6', 'Homeothermus_arabinoxylanisolvens']
labeled_mags = mag[is_here].index.tolist() + ref_mags + other_labeled_mags

# `title` is carried along for reference; panels are tagged with their letter
# instead of a title.
panel_specs = [
    ('A', 'GH Domains', gh_count2),
    ('B', 'GH/CBM Domains (Exported, PUL-encoded)', cazy_count4),
    ('C', 'GH/CBM OPFs (Exported, PUL-encoded)', cazy_opf_count4),
]
axis_label = '{} ({:.0%})'

for (panel, title, d), ax in zip(panel_specs, axs):
    x, y = 'PC1', 'PC2'

    # Keep features present in at least three genomes, restrict/extend the
    # rows to mags_list, and reduce the counts to presence/absence.
    feats = d.columns[d.gt(0).sum().ge(3)]
    d = d[feats].reindex(mags_list).fillna(0).gt(0)

    # The axes are fit on the Ormerod genomes only; every genome in `d` is
    # then projected onto them.
    fit = PCA().fit(d.loc[is_ormerod])
    scores = fit.transform(d)
    ordin = pd.DataFrame(
        scores,
        index=d.index,
        columns=['PC{}'.format(k + 1) for k in range(scores.shape[1])]
    )

    # Panel A is drawn with every component's sign flipped.
    if panel == 'A':
        ordin = -ordin

    variance = fit.explained_variance_
    perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)
    ax.set_xlabel(axis_label.format(x, perc_explained[x]))
    ax.set_ylabel(axis_label.format(y, perc_explained[y]))

    # Ormerod genomes (bottom layer), one colour and legend entry per guild.
    for ormerod_guild in ('plant', 'host', 'starch'):
        d1 = ordin[is_ormerod & (mag.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1, alpha=0.5, lw=0, zorder=1,
                   c=color_map[ormerod_guild], label=ormerod_guild)

    # Genomes of type 'here' (top layer): black triangles, not in the legend.
    ax.scatter(x, y, data=ordin[is_here],
               color='black', marker='^', alpha=1.0, lw=0, zorder=3,
               label='__nolegend__')

    # Reference genomes (middle layer): grey points, not in the legend.
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=0.7, lw=0, zorder=2,
               label='__nolegend__')

    # Labels are pushed away from their point by `scale` times the Ormerod
    # range along each axis.
    scale = 0.05
    ormerod_ordin = ordin[is_ormerod]
    scale_x = (ormerod_ordin[x].max() - ormerod_ordin[x].min()) * scale
    scale_y = (ormerod_ordin[y].max() - ormerod_ordin[y].min()) * scale

    for genome_id in labeled_mags:
        fontstyle = 'italic' if genome_id in italic_list else None
        coords = ordin.loc[genome_id]
        # Random direction on the right half of the unit circle (x >= 0).
        text_offset_y = np.random.uniform(-1, 1)
        text_offset_x = np.sqrt(1 - text_offset_y**2)
        label_xy = (coords[x] + text_offset_x * scale_x,
                    coords[y] + text_offset_y * scale_y)
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x], coords[y]),
                    xytext=label_xy,
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    # Panel letter just above the top-left corner of the axes.
    ax.annotate(panel, xy=(0.02, 1.01), xycoords='axes fraction', fontsize=12)

# Only the first panel carries the guild legend.
axs[0].legend(loc='lower left')
fig.tight_layout()

# savefig(fig, 'build/figure_ordination')

In [None]:
# Presence/absence of each OPF in cazy_opf_count4, per genome.
d = cazy_opf_count4.gt(0)

# Display, for the B genomes, only the OPFs present in both B1A and B2.
b_genomes = ['B1A', 'B1B', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8']
shared_by_b1a_b2 = d.loc['B1A'] & d.loc['B2']
d.loc[b_genomes, shared_by_b1a_b2]