# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl

from scripts.lib.plotting import load_style

# Load the 'paper' style and keep the three entries that are used by name.
loaded_style = load_style('paper')
savefig, fullwidth, halfwidth = (
    loaded_style[key] for key in ('savefig', 'fullwidth', 'halfwidth')
)

# Single SQLite connection shared by every pd.read_sql call below.
con = sqlite3.connect('data/core.muri.2.denorm.db')
#sns.set_context('notebook')

## Metadata

In [None]:
# Scatter colour for each Ormerod guild.
color_map = dict(
    starch='blue',
    host='purple',
    plant='green',
)

#mpl.rcParams['text.usetex'] = True

In [None]:
# Genome metadata table, one row per genome, indexed by genome_id.
mag = pd.read_csv('meta/genome.tsv', sep='\t', index_col='genome_id')
# Exclusion list that is currently disabled:
#     .drop(['An287', 'An289', 'Muribaculaceae_bacterium_DSM_100739',
#            'Muribaculum_intestinale_DSM_100746', 'Paramuribaculum_intestinale_DSM_100749T',
#            'Paramuribaculum_intestinale_DSM_100764', 'Duncaniella_muris_DSM_103720T'])

# Display labels for genomes; ids that are not listed are shown unchanged.
rename_map = {
    'B1A': 'B1-A',
    'B1B': 'B1-B',
    'B2': 'B2',
    'B3': 'B3',
    'B4': 'B4',
    'B5': 'B5',
    'B6': 'B6',
    'B7': 'B7',
    'B8': 'B8',
    'Muribaculum_intestinale_yl27': 'Mi',
    'Barnesiella_viscericola_DSM_18177': 'Barnesiella viscericola',
    'Homeothermus_arabinoxylanisolvens': 'Ha',
}

# Genome ids collected for italic rendering (not referenced by rename_mag).
italics_list = [
    'Muribaculum_intestinale_yl27',
    'Barnesiella_viscericola_DSM_18177',
    'Homeothermus_arabinoxylanisolvens',
]


def rename_mag(genome_id):
    """Return the display label for *genome_id*, or *genome_id* itself if it has none."""
    return rename_map.get(genome_id, genome_id)

# COGs

In [None]:
# Count COG-annotated features per (genome, COG) and pivot to a genome x COG matrix.
cog_tally = pd.read_sql("""
SELECT genome_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_cog USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
data = cog_tally['tally'].unstack('func_id', fill_value=0)
# Put rows in metadata order; ids in mag.index that have no tally become NaN rows and are dropped.
data = data.reindex(mag.index).dropna()

# COG annotations, restricted to (and ordered like) the columns of `data`.
cog_info = pd.read_sql("""
SELECT cog_id AS func_id, function_category, description
FROM cog
                       """, con=con, index_col=['func_id'])
function = cog_info.loc[data.columns]

In [None]:
# Count the functions present only in B1A, only in B1B, or in both.
# Vectorized boolean operations replace a row-wise apply that built one Series per function.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_a = b1_present.loc['B1A']
in_b = b1_present.loc['B1B']
pd.DataFrame({ 'just_A': in_a & ~in_b
             , 'just_B': in_b & ~in_a
             , 'both': in_a & in_b
             }).sum()

## Ordination

### Ormerod

In [None]:
# Ordination on a hand-picked set of COGs.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
feats = [
    'COG3507',
#    'COG3866',
    'COG4677',
    'COG2730',
    'COG3693',
    'COG0366',
    'COG3525',
    'COG3119',
]

d = np.sqrt(data[feats])  # square-root transform of the counts
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point.
dx, dy = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='right', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### Carbs

In [None]:
# Ordination on the COGs whose function_category is 'G'.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
is_category_g = function.function_category == 'G'
feats = function[is_category_g].index

d = np.sqrt(data[feats])  # square-root transform of the counts
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point.
dx, dy = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='right', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### All

In [None]:
# Ordination on every COG found in more than two genomes.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
genomes_with_func = (data > 0).sum()
feats = data.columns[genomes_with_func > 2]

d = np.sqrt(data[feats])  # square-root transform of the counts
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point.
dx, dy = -0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='right', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

## Frequencies

In [None]:
# Per-function presence summary: overall genome count, fraction of genomes per Ormerod guild,
# and genome count per genome_type, joined with the function annotations.
present = data > 0
func_summ = {'all': present.sum()}
func_summ.update({
    guild: present.loc[mag.ormerod_guild == guild].mean()
    for guild in mag.ormerod_guild.dropna().unique()
})
func_summ.update({
    kind: present.loc[mag.genome_type == kind].sum()
    for kind in mag.genome_type.dropna().unique()
})

func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in B1A, B1B and B2: show the 10 with the lowest 'here' count,
# ties broken by the highest 'starch' value.
in_b1a = data.loc['B1A'] > 0
in_b1b = data.loc['B1B'] > 0
in_b2 = data.loc['B2'] > 0
shared = func_summary[in_b1a & in_b1b & in_b2]
shared.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# OPFs

In [None]:
# Count OPF-assigned features per (genome, OPF) and pivot to a genome x OPF matrix,
# with rows ordered like the genome metadata (no rows are dropped here).
opf_tally = pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
data = opf_tally['tally'].unstack('func_id', fill_value=0).reindex(mag.index)

# OPF annotations (architecture, KO and COG details), restricted to the columns of `data`.
opf_info = pd.read_sql("""
SELECT
    opf_id AS func_id
  , architecture
  , ko_id
  , ko.description AS ko_description
  , cog_id
  , cog.description AS cog_description
  , function_category AS cog_category
FROM opf_to_architecture
LEFT JOIN opf_to_ko USING (opf_id)
LEFT JOIN ko USING (ko_id)
LEFT JOIN opf_to_cog USING (opf_id)
LEFT JOIN cog USING (cog_id)
                       """, con=con, index_col=['func_id'])
function = opf_info.loc[data.columns]

In [None]:
# Display the annotation table for the functions in `data`.
function

In [None]:
# Count the functions present only in B1A, only in B1B, or in both.
# Vectorized boolean operations replace a row-wise apply that built one Series per function.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_a = b1_present.loc['B1A']
in_b = b1_present.loc['B1B']
pd.DataFrame({ 'just_A': in_a & ~in_b
             , 'just_B': in_b & ~in_a
             , 'both': in_a & in_b
             }).sum()

## Ordination

### Carb COG OPFs

In [None]:
# Ordination on the OPFs whose cog_category is 'G'.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
is_category_g = function.cog_category.isin(['G'])
feats = function[is_category_g].index
d = np.sqrt(data[feats]).dropna()  # square-root transform, then drop rows containing NaN
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed slightly up and to the left of the point.
dx, dy = -0.05, 0.05
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='right', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### SusC/SusD/SusEF OPFs

In [None]:
# OPF ids that have at least one feature listed in the susC, susD and susEF tables, respectively.
susC_OPF_list = list(
    pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susC USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id']
)

susD_OPF_list = list(
    pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susD USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id']
)

susEF_OPF_list = list(
    pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN susEF USING (feature_id)
WHERE opf_id NOT NULL
""", con=con).dropna()['opf_id']
)

In [None]:
# Ordination on the OPFs that contain SusC, SusD or SusEF features.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
# sorted() fixes the column order, which a bare set intersection leaves run-dependent.
feats = sorted(set(function.index) & set(susC_OPF_list + susD_OPF_list + susEF_OPF_list))
# `data` was reindexed to mag.index, so absent genomes are NaN rows; PCA rejects NaN, so drop them.
d = data[feats].dropna().apply(np.sqrt)
# Metadata aligned row-for-row with `d`, so the boolean masks below need no implicit reindexing.
meta = mag.loc[d.index]
fit = PCA().fit(d.loc[meta.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the share of variance per component; tick labels are blanked.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, coloured by guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[(meta.genome_type == 'ormerod') & (meta.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot genomes with genome_type 'here' as black triangles, labelled to the right of the marker
mine = ordin[meta.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot the reference genome in grey
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend (guild series only)
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### GH Containing OPFs

In [None]:
# OPF ids that have at least one feature carrying a CAZy domain whose id starts with 'GH'.
# The domain_id filter in WHERE discards the unmatched rows the LEFT JOIN would otherwise keep.
gh_OPF_list = list(
    pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_to_opf
LEFT JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE opf_id NOT NULL
  AND domain_id LIKE 'GH%'
""", con=con).dropna()['opf_id']
)

In [None]:
# Ordination on the OPFs that contain a GH domain.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
# sorted() fixes the column order, which a bare set intersection leaves run-dependent.
feats = sorted(set(function.index) & set(gh_OPF_list))
# `data` was reindexed to mag.index, so absent genomes are NaN rows; PCA rejects NaN, so drop them.
d = data[feats].dropna().apply(np.sqrt)
# Metadata aligned row-for-row with `d`, so the boolean masks below need no implicit reindexing.
meta = mag.loc[d.index]
fit = PCA().fit(d.loc[meta.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the share of variance per component; tick labels are blanked.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, coloured by guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[(meta.genome_type == 'ormerod') & (meta.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot genomes with genome_type 'here' as black triangles, labelled to the right of the marker
mine = ordin[meta.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot the reference genome in grey
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend (guild series only)
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### All OPFs

In [None]:
# Ordination on every OPF found in more than two genomes.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
feats = data.columns[((data > 0).sum() > 2)]
# `data` was reindexed to mag.index, so absent genomes are NaN rows; PCA rejects NaN, so drop them.
d = data[feats].dropna().apply(np.sqrt)
# Metadata aligned row-for-row with `d`, so the boolean masks below need no implicit reindexing.
meta = mag.loc[d.index]
fit = PCA().fit(d.loc[meta.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the share of variance per component; tick labels are blanked.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, coloured by guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[(meta.genome_type == 'ormerod') & (meta.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot genomes with genome_type 'here' as black triangles, labelled to the right of the marker
mine = ordin[meta.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot the reference genome in grey
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the left of the point
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend (guild series only)
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Per-OPF presence summary: overall genome count, fraction of genomes per Ormerod guild,
# and genome count per genome_type, joined with the OPF annotations.
func_summ = {'all': (data > 0).sum()}
for ormerod_guild in mag.ormerod_guild.dropna().unique():
    func_summ[ormerod_guild] = (data.loc[mag.ormerod_guild == ormerod_guild] > 0).mean()

for genome_type in mag.genome_type.dropna().unique():
    func_summ[genome_type] = (data.loc[mag.genome_type == genome_type] > 0).sum()

func_summary = pd.DataFrame(func_summ).join(function)

# Flag the OPFs that appear in each Sus list. index.isin() gives the same True/False columns as
# assigning through .loc, but does not raise KeyError when a listed OPF is absent from the summary.
for flag, opf_ids in (('susC', susC_OPF_list), ('susD', susD_OPF_list), ('susEF', susEF_OPF_list)):
    func_summary[flag] = func_summary.index.isin(opf_ids)

In [None]:
# OPFs present in B1A and B2 and found in more than one Ormerod genome: show the 10 with the
# lowest 'here' count, ties broken by the highest 'starch' value. (The B1B filter is disabled.)
in_b1a = data.loc['B1A'] > 0
in_b2 = data.loc['B2'] > 0
#in_b1b = data.loc['B1B'] > 0
multi_ormerod = func_summary.ormerod > 1
selected = func_summary[in_b1a & in_b2 & multi_ormerod]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

In [None]:
# OPFs present in B1A and B2 that are flagged susC or susD, ordered by ascending 'here' count
# and then descending 'starch' value. (The B1B filter is disabled.)
in_b1a = data.loc['B1A'] > 0
in_b2 = data.loc['B2'] > 0
#in_b1b = data.loc['B1B'] > 0
sus_c_or_d = func_summary.susC | func_summary.susD
selected = func_summary[in_b1a & in_b2 & sus_c_or_d]
selected.sort_values(['here', 'starch'], ascending=[True, False])

In [None]:
# Same selection as two cells above: OPFs present in B1A and B2 and found in more than one Ormerod
# genome, top 10 by ascending 'here' then descending 'starch'. (The B1B filter is disabled.)
in_b1a = data.loc['B1A'] > 0
in_b2 = data.loc['B2'] > 0
#in_b1b = data.loc['B1B'] > 0
multi_ormerod = func_summary.ormerod > 1
selected = func_summary[in_b1a & in_b2 & multi_ormerod]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

In [None]:
# OPFs present in B1A and B2 but with a zero count in B1B, ordered by descending 'starch' value.
in_b1a = data.loc['B1A'] > 0
absent_b1b = data.loc['B1B'] == 0
in_b2 = data.loc['B2'] > 0
selected = func_summary[in_b1a & absent_b1b & in_b2]
selected.sort_values(['starch'], ascending=[False])

# KOs

In [None]:
# Count KO-annotated features per (genome, KO) and pivot to a genome x KO matrix,
# with rows ordered like the genome metadata (no rows are dropped here).
ko_tally = pd.read_sql("""
SELECT genome_id, ko_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_ko USING (feature_id)
WHERE func_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
data = ko_tally['tally'].unstack('func_id', fill_value=0).reindex(mag.index)

# KO descriptions, restricted to (and ordered like) the columns of `data`.
ko_info = pd.read_sql("""
SELECT ko_id AS func_id, description
FROM ko
                       """, con=con, index_col=['func_id'])
function = ko_info.loc[data.columns]

In [None]:
# Count the functions present only in B1A, only in B1B, or in both.
# Vectorized boolean operations replace a row-wise apply that built one Series per function.
b1_present = data.loc[['B1A', 'B1B']] > 0
in_a = b1_present.loc['B1A']
in_b = b1_present.loc['B1B']
pd.DataFrame({ 'just_A': in_a & ~in_b
             , 'just_B': in_b & ~in_a
             , 'both': in_a & in_b
             }).sum()

## Ordination

### All

In [None]:
# Ordination on every KO found in more than two genomes.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
feats = data.columns[((data > 0).sum() > 2)]
# `data` was reindexed to mag.index, so absent genomes are NaN rows; PCA rejects NaN, so drop them.
d = data[feats].dropna().apply(np.sqrt)
# Metadata aligned row-for-row with `d`, so the boolean masks below need no implicit reindexing.
meta = mag.loc[d.index]
fit = PCA().fit(d.loc[meta.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the share of variance per component; tick labels are blanked.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod genomes, coloured by guild
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[(meta.genome_type == 'ormerod') & (meta.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot genomes with genome_type 'here' as black triangles, labelled to the right of the marker
mine = ordin[meta.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot the reference genome in grey
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the right of the point
text_offset_x = 0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend (guild series only)
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Per-function presence summary: overall genome count, fraction of genomes per Ormerod guild,
# and genome count per genome_type, joined with the function annotations.
present = data > 0
func_summ = {'all': present.sum()}
func_summ.update({
    guild: present.loc[mag.ormerod_guild == guild].mean()
    for guild in mag.ormerod_guild.dropna().unique()
})
func_summ.update({
    kind: present.loc[mag.genome_type == kind].sum()
    for kind in mag.genome_type.dropna().unique()
})

func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in B1A and B2: show the 10 with the lowest 'here' count,
# ties broken by the highest 'starch' value. (The B1B filter is disabled.)
in_b1a = data.loc['B1A'] > 0
in_b2 = data.loc['B2'] > 0
#in_b1b = data.loc['B1B'] > 0
selected = func_summary[in_b1a & in_b2]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# GHs

In [None]:
# Count features per (genome, GH domain) and pivot to a genome x domain matrix,
# with rows ordered like the genome metadata.
data = (pd.read_sql("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE func_id LIKE 'GH%'
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Combine domain subfamilies: columns sharing the text before their last underscore are summed.
# Grouping the transposed frame replaces groupby(..., axis=1), which is deprecated in current pandas.
# NOTE(review): sum() skips NaN, so genomes absent from the query end up as all-zero rows — confirm intended.
family_of = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(family_of).sum().T

# GH domain ids from cazy_domain, restricted to (and ordered like) the columns of `data`.
function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

function = function.loc[data.columns]


## Ordination

### All

In [None]:
# Ordination on every GH family found in more than two genomes.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
genomes_with_func = (data > 0).sum()
feats = data.columns[genomes_with_func > 2]
d = np.sqrt(data[feats])  # square-root transform of the counts
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the right of the point.
dx, dy = 0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

In [None]:
# Clustered heatmap (functions x genomes) of square-root counts, limited to genomes of type
# 'ormerod' or 'here' and to functions present in more than one genome.
keep_genomes = mag.genome_type.isin(['ormerod', 'here'])
keep_funcs = (data > 0).sum() > 1
heat = data.loc[keep_genomes, keep_funcs].apply(np.sqrt).T
sns.clustermap(
    heat,
    robust=True,
    figsize=(10, 25),
    xticklabels=1,
    yticklabels=1,
)

## Frequencies

In [None]:
# Per-function presence summary: overall genome count, fraction of genomes per Ormerod guild,
# and genome count per genome_type, joined with the function annotations.
present = data > 0
func_summ = {'all': present.sum()}
func_summ.update({
    guild: present.loc[mag.ormerod_guild == guild].mean()
    for guild in mag.ormerod_guild.dropna().unique()
})
func_summ.update({
    kind: present.loc[mag.genome_type == kind].sum()
    for kind in mag.genome_type.dropna().unique()
})

func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in B1A and B2: show the 10 with the lowest 'here' count,
# ties broken by the highest 'starch' value. (The B1B filter is disabled.)
in_b1a = data.loc['B1A'] > 0
in_b2 = data.loc['B2'] > 0
#in_b1b = data.loc['B1B'] > 0
selected = func_summary[in_b1a & in_b2]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# Non-cytosolic GHs

In [None]:
# List the distinct localization codes stored in the feature_localization table.
pd.read_sql("SELECT DISTINCT localization FROM feature_localization", con=con)

In [None]:
# Count features per (genome, GH domain), keeping only features whose localization is
# 'PP', 'OM' or 'IM', and pivot to a genome x domain matrix ordered like the genome metadata.
data = (pd.read_sql("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id LIKE 'GH%'
  AND localization IN ('PP', 'OM', 'IM')
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Combine domain subfamilies: columns sharing the text before their last underscore are summed.
# Grouping the transposed frame replaces groupby(..., axis=1), which is deprecated in current pandas.
# NOTE(review): sum() skips NaN, so genomes absent from the query end up as all-zero rows — confirm intended.
family_of = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(family_of).sum().T

# GH domain ids from cazy_domain, restricted to (and ordered like) the columns of `data`.
function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

function = function.loc[data.columns]


## Ordination

### All

In [None]:
# Ordination on every non-cytosolic GH family found in more than one genome.
# PCA is fit on the Ormerod genomes only; every genome is then projected onto those axes.
genomes_with_func = (data > 0).sum()
feats = data.columns[genomes_with_func > 1]
d = np.sqrt(data[feats])  # square-root transform of the counts
fit = PCA().fit(d.loc[mag.genome_type == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index)
ordin.columns = ['PC{}'.format(i + 1) for i in ordin.columns]
variance = fit.explained_variance_
perc_explained = pd.Series(variance / variance.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))
x, y = 'PC1', 'PC2'

# Axis titles show the share of variance per component; tick labels are blanked.
for set_label, pc in ((ax.set_xlabel, x), (ax.set_ylabel, y)):
    set_label('{} ({:.0%})'.format(pc, perc_explained[pc]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Ormerod genomes, coloured by guild.
from_ormerod = mag.genome_type == 'ormerod'
for guild in ('plant', 'host', 'starch'):
    in_guild = ordin[from_ormerod & (mag.ormerod_guild == guild)]
    ax.scatter(x, y, data=in_guild, c=color_map[guild], label=guild)

# Genomes with genome_type 'here': black triangles, each labelled to the right of its marker.
mine = ordin[mag.genome_type == 'here']
ax.scatter(x, y, data=mine,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
dx, dy = 0.08, 0
for genome_id, pt in mine.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Reference genome in grey.
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels, placed up and to the right of the point.
dx, dy = 0.08, 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    pt = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(pt[x] + dx, pt[y] + dy),
                ha='left', va='center', fontsize=12)

# Legend lists only the guild series.
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# Non-cytosolic GH containing OPFs

In [None]:
# Count, per (genome, OPF), the features that carry a GH domain and whose localization is
# 'OM', 'PP' or 'IM'; pivot to a genome x OPF matrix ordered like the genome metadata.
# NOTE(review): DISTINCT looks redundant next to GROUP BY, and a feature matching several GH
# domain rows would be counted once per row — confirm whether COUNT(DISTINCT feature_id) is meant.
gh_opf_tally = pd.read_sql("""
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
data = gh_opf_tally['tally'].unstack('func_id', fill_value=0).reindex(mag.index)

# No annotation columns are loaded for these OPFs; keep an empty table indexed like `data`'s columns.
function = pd.DataFrame([], index=data.columns)

# (No subfamily combining is applied to these OPF columns.)


## Ordination

### All

In [None]:
# PCA ordination of the genomes in `data`.
# Only functions present in more than two genomes are kept and the counts are
# square-root transformed.  The axes are fit on the genomes whose genome_type
# is 'ormerod'; every genome is then projected onto those axes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].dropna().apply(np.sqrt)

# dropna() can remove genomes, so align the metadata to the surviving rows.
# Masks built straight from `mag` would not share the index of `d`/`ordin`,
# forcing pandas to re-align them (with a UserWarning) on every lookup.
mag_aligned = mag.reindex(d.index)
is_ormerod = mag_aligned.genome_type == 'ormerod'
is_here = mag_aligned.genome_type == 'here'

fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels carry the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])


# Plot Ormerod
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag_aligned.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine
text_offset_x = 0.08
text_offset_y = 0
here = ordin[is_here]
ax.scatter(x, y, data=here,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in here.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels use a different vertical offset from the labels drawn above.
text_offset_x = 0.08
text_offset_y = -0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# Non-cytosolic GHs in PULs

In [None]:
# Per-genome tally of 'GH%' CAZy domains on features that the SQL below links
# to a susC seed through feature_distance with distance < 10000, restricted to
# localization OM, PP or IM.
# NOTE(review): a feature linked to more than one susC seed would be counted
# once per seed -- confirm that is intended.
data = (pd.read_sql("""
SELECT DISTINCT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN (SELECT feature_id AS seed_id FROM susC) USING (seed_id)
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Order rows like the genome metadata and drop genomes absent from the query.
data = data.reindex(mag.index).dropna()

# Merge columns that differ only in the text after their last underscore
# (e.g. 'GH13_1' and 'GH13_2' both fold into 'GH13').  Grouping is done on the
# transposed table because DataFrame.groupby(axis=1) is deprecated in pandas.
family_of = data.columns.map(lambda domain_id: domain_id.rsplit('_', 1)[0])
data = data.T.groupby(family_of).sum().T

# Empty per-function annotation table, indexed by function id.
function = pd.DataFrame([], index=data.columns)

# Combine domain subfamilies


## Ordination

### All

In [None]:
# PCA ordination of the genomes in `data` on presence/absence.
# Only functions present in more than two genomes are kept and each count is
# reduced to a boolean "present" flag.  The axes are fit on the genomes whose
# genome_type is 'ormerod'; every genome is then projected onto those axes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].dropna() > 0

# dropna() can remove genomes, so align the metadata to the surviving rows.
# Masks built straight from `mag` would not share the index of `d`/`ordin`,
# forcing pandas to re-align them (with a UserWarning) on every lookup.
mag_aligned = mag.reindex(d.index)
is_ormerod = mag_aligned.genome_type == 'ormerod'
is_here = mag_aligned.genome_type == 'here'

fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels carry the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])


# Plot Ormerod
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag_aligned.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine
text_offset_x = 0.08
text_offset_y = 0
here = ordin[is_here]
ax.scatter(x, y, data=here,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in here.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels use a different vertical offset from the labels drawn above.
text_offset_x = 0.08
text_offset_y = -0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# OPFs in PULs

In [None]:
# Per-genome tally of features for each OPF, restricted by the SQL below to
# features carrying a 'GH%' CAZy domain that feature_distance links to a susC
# seed with distance < 10000.  The localization filter is commented out inside
# the SQL, so features of any localization are counted.
query = """
SELECT DISTINCT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature_distance
JOIN (SELECT feature_id AS seed_id FROM susC) USING (seed_id)
JOIN feature USING (feature_id)
JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
JOIN feature_to_opf USING (feature_id)
WHERE domain_id LIKE 'GH%'
--  AND localization IN ('OM', 'PP', 'IM')
  AND opf_id NOT NULL
  AND distance < 10000
GROUP BY genome_id, func_id
                    """
tally = pd.read_sql(query, con=con, index_col=['genome_id', 'func_id'])['tally']

# Pivot to a genome x function table (absent combinations become 0), order
# the rows like the genome metadata and drop genomes absent from the query.
data = tally.unstack('func_id', fill_value=0).reindex(mag.index).dropna()

# Empty per-function annotation table, indexed by function id.
function = pd.DataFrame([], index=data.columns)

# Combine domain subfamilies


## Ordination

### All

In [None]:
# For each function (column), the number of genomes with a non-zero tally.
data.gt(0).sum()

In [None]:
# PCA ordination of the genomes in `data`.
# Only functions present in more than two genomes are kept and the counts are
# square-root transformed.  The axes are fit on the genomes whose genome_type
# is 'ormerod'; every genome is then projected onto those axes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].dropna().apply(np.sqrt)

# dropna() can remove genomes, so align the metadata to the surviving rows.
# Masks built straight from `mag` would not share the index of `d`/`ordin`,
# forcing pandas to re-align them (with a UserWarning) on every lookup.
mag_aligned = mag.reindex(d.index)
is_ormerod = mag_aligned.genome_type == 'ormerod'
is_here = mag_aligned.genome_type == 'here'

fit = PCA().fit(d.loc[is_ormerod])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'

# Axis labels carry the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])


# Plot Ormerod
for ormerod_guild in ['plant', 'host', 'starch']:
    d1 = ordin[is_ormerod & (mag_aligned.ormerod_guild == ormerod_guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[ormerod_guild], label=ormerod_guild)

# Plot Mine
text_offset_x = 0.08
text_offset_y = 0
here = ordin[is_here]
ax.scatter(x, y, data=here,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for genome_id, coords in here.iterrows():
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# Extra labels use a different vertical offset from the labels drawn above.
text_offset_x = 0.08
text_offset_y = -0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6', 'Homeothermus_arabinoxylanisolvens']
for genome_id in labeled_mags:
    coords = ordin.loc[genome_id]
    ax.annotate(rename_mag(genome_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

# Manuscript Figure

In [None]:
def _tally_matrix(query):
    """Run *query* and pivot its tallies into a genome x function table.

    The query must return ``genome_id``, ``func_id`` and ``tally`` columns.
    Genome/function combinations absent from the result are filled with 0.
    """
    return (pd.read_sql(query, con=con, index_col=['genome_id', 'func_id'])
              .tally.unstack('func_id', fill_value=0))


# Features per OPF, restricted to features carrying a 'GH%' CAZy domain.
carb_opf_count = _tally_matrix("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE domain_id LIKE 'GH%'
  AND opf_id NOT NULL
GROUP BY genome_id, func_id
                    """)
# Order rows like the genome metadata and drop genomes absent from the query.
carb_opf_count = carb_opf_count.reindex(mag.index).dropna()

# Features per 'GH%' CAZy domain.
gh_count = _tally_matrix("""
SELECT genome_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE domain_id LIKE 'GH%'
GROUP BY genome_id, func_id
                    """)
# Merge columns that differ only in the text after their last underscore
# (e.g. 'GH13_1' and 'GH13_2' both fold into 'GH13').  Grouping is done on the
# transposed table because DataFrame.groupby(axis=1) is deprecated in pandas.
gh_family = gh_count.columns.map(lambda domain_id: domain_id.rsplit('_', 1)[0])
gh_count = gh_count.T.groupby(gh_family).sum().T
gh_count = gh_count.reindex(mag.index).dropna()

# Features per COG for a fixed list of eight COG ids.
ormerod_cog_count = _tally_matrix("""
SELECT genome_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_cog USING (feature_id)
WHERE cog_id IN ('COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119')
GROUP BY genome_id, func_id
                    """)
# Unlike the tables above, genomes absent from the query are kept as all-zero
# rows rather than dropped.
ormerod_cog_count = ormerod_cog_count.reindex(mag.index).fillna(0)


In [None]:
# Manuscript figure: two stacked PCA ordinations (panel B on the marker COG
# counts, panel C on the GH-containing OPF counts).  In each panel the axes
# are fit on the genomes whose genome_type is 'ormerod' and every genome is
# projected onto them.
fig, axs = plt.subplots(2, 1, figsize=(halfwidth, 6))
# Seeds NumPy's global RNG; the random label jitter it served is currently
# disabled (see the commented-out expressions next to the offsets below).
np.random.seed(10)

x, y = 'PC1', 'PC2'
ref_mags = ['Muribaculum_intestinale_yl27']
other_labeled_mags = [ 'Muribaculum_intestinale_yl27'
                     , 'M6'
                     , 'Homeothermus_arabinoxylanisolvens'
                     ]
# Label placement relative to each point (data units), shared by both panels.
scale = 0.8
text_offset_y = 0.5  # np.random.uniform(-1, 1)
text_offset_x = 0.5  # np.sqrt(1 - text_offset_y**2)

for panel, title, d, ax in [('B', 'Marker COGs', ormerod_cog_count, axs[0]),
                            ('C', 'De novo GH families', carb_opf_count, axs[1])]:
    # Keep functions present in more than two genomes; counts are used
    # untransformed.
    feats = d.columns[((d > 0).sum() > 2)]
    d = d[feats].dropna()#.apply(np.sqrt)

    # Align the metadata to the genomes that are actually in this table so
    # that the boolean masks share the index of `d`/`ordin`.
    mag_aligned = mag.reindex(d.index)
    is_ormerod = mag_aligned.genome_type.isin(['ormerod'])
    is_here = mag_aligned.genome_type == 'here'

    fit = PCA().fit(d.loc[is_ormerod])
    ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')

    if panel == 'C':
        # Mirror PC1 in panel C; the sign of a principal component is
        # arbitrary.  NOTE(review): presumably done so the panel's
        # orientation matches panel B -- confirm.
        ordin.PC1 = -ordin.PC1

    perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

    ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
    ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
#    ax.set_yticklabels([])
#    ax.set_xticklabels([])

    # Plot Ormerod
    for ormerod_guild in ['plant', 'host', 'starch']:
        d1 = ordin[is_ormerod & (mag_aligned.ormerod_guild == ormerod_guild)]
        ax.scatter(x, y, data=d1,
                   c=color_map[ormerod_guild], label=ormerod_guild)

    # Plot Mine
    here = ordin[is_here]
    ax.scatter(x, y, data=here,
               color='black', marker='^', alpha=1,
               label='__nolegend__')

    # Plot Reference
    ax.scatter(x, y, data=ordin.loc[ref_mags],
               color='grey', alpha=1,
               label='__nolegend__')

    # Label every plotted 'here' genome plus the extra named genomes.  Only
    # 'here' genomes that survived dropna() are labeled (looking up a dropped
    # one would raise KeyError), and dict.fromkeys removes duplicates so a
    # genome appearing in both lists is annotated once.
    labeled = list(dict.fromkeys(list(here.index) + other_labeled_mags))
    for genome_id in labeled:
        fontstyle = 'italic' if genome_id in italics_list else None
        coords = ordin.loc[genome_id]
        ax.annotate(rename_mag(genome_id),
                    xy=(coords[x] + text_offset_x * scale, coords[y] + text_offset_y * scale),
                    va='center', ha='center', fontsize=8, fontstyle=fontstyle)

    # Panel letter just above the top-left corner of the axes.
    ax.annotate(panel, xy=(0.02, 1.01), xycoords='axes fraction', fontsize=12)
    ax.set_title(title)

# The legend is drawn on the last panel only.
axs[-1].legend(loc='upper left')
fig.tight_layout()

savefig(fig, 'build/figure_ordination')