# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl

# Connection to the SQLite database that every SQL query in this notebook runs against.
con = sqlite3.connect('data/core.muri.2.denorm.db')
# Use seaborn's 'notebook' plotting context for the figures below.
sns.set_context('notebook')

## Metadata

In [None]:
# Scatter-point colour used for each guild label in the ordination plots.
color_map = {
    'starch': 'blue',
    'host': 'purple',
    'plant': 'green',
}

#mpl.rcParams['text.usetex'] = True

In [None]:
# Per-genome metadata table indexed by mag_id; later cells read its `mag_class` and `guild` columns.
mag = pd.read_table('meta/mag.tsv', index_col='mag_id')

# Short display labels for selected genome identifiers; identifiers that are
# not listed here are displayed unchanged.
rename_map = {
    'Otu0001_vC': 'B1-A',
    'Otu0001_vB': 'B1-B',
    'Otu0007_vA': 'B2',
    'Otu0004_vA': 'B3',
    'Otu0005_vA': 'B4',
    'Otu0009_vA': 'B5',
    'Otu0017_vA': 'B6',
    'Otu0049_vA': 'B7',
    'Muribaculum_intestinale_yl27': 'Muribaculum intestinale',
    'Barnesiella_viscericola_DSM_18177': 'Barnesiella viscericola',
}


def rename_mag(mag_id):
    """Return the display label for *mag_id*, or *mag_id* itself when no label is defined."""
    return rename_map.get(mag_id, mag_id)

# COGs

In [None]:
# Genome-by-COG count matrix: one row per genome, one column per COG, each
# cell the number of features assigned to that COG (0 where none were found).
cog_counts_long = pd.read_sql("""
SELECT mag_id, cog_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_cog USING (feature_id)
WHERE func_id NOT NULL
GROUP BY mag_id, func_id
                    """, con=con, index_col=['mag_id', 'func_id'])
data = (
    cog_counts_long['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # rows follow the order of the metadata table
)

# Category and description of every COG, then restricted to (and ordered
# like) the COGs that appear as columns of `data`.
cog_annotations = pd.read_sql("""
SELECT cog_id AS func_id, function_category, description
FROM cog
                       """, con=con, index_col=['func_id'])
function = cog_annotations.loc[data.columns]

In [None]:
# Count the functions found only in Otu0001_vC ('just_A'), only in
# Otu0001_vB ('just_B'), or in both genomes.
ab_presence = (data.loc[['Otu0001_vC', 'Otu0001_vB']] > 0).T
in_a = ab_presence['Otu0001_vC']
in_b = ab_presence['Otu0001_vB']
pd.DataFrame({'just_A': in_a > in_b, 'just_B': in_b > in_a, 'both': in_a & in_b}).sum()

## Ordination

### Ormerod

In [None]:
# PCA ordination of the genomes on the square-root-transformed tallies of a
# hand-picked list of COGs.
feats = ['COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119']

d = data[feats].apply(np.sqrt)
# The principal axes are fit on the 'ormerod' genomes only; every genome in
# `d` is then projected onto those axes.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: black triangles, each labelled to the right of its marker.
# The subset is selected once and shared by the scatter and the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### Carbs

In [None]:
# PCA ordination restricted to the COGs whose function_category is 'G'.
feats = function[function.function_category == 'G'].index

d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

### All

In [None]:
# PCA ordination on every COG that is present (count > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]

d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='lower right')

#fig.savefig('/Users/bjsmith/Desktop/ormerod_ordination.pdf')

## Frequencies

In [None]:
# Summarise how widespread each function is across the genomes.
presence = data > 0

# 'all': number of genomes in which the function is present.
func_summ = {'all': presence.sum()}
# One column per guild: fraction of that guild's genomes with the function.
for guild in mag.guild.dropna().unique():
    func_summ[guild] = presence.loc[mag.guild == guild].mean()
# One column per genome class: number of that class's genomes with the function.
for mag_class in mag.mag_class.dropna().unique():
    func_summ[mag_class] = presence.loc[mag.mag_class == mag_class].sum()

# Attach the per-function annotation columns.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in Otu0001_vC, Otu0001_vB and Otu0007_vA alike: show the
# ten with the lowest 'here' value, ties broken by the highest 'starch' value.
in_b1a = data.loc['Otu0001_vC'] > 0
in_b1b = data.loc['Otu0001_vB'] > 0
in_b2 = data.loc['Otu0007_vA'] > 0
shared = func_summary[in_b1a & in_b1b & in_b2]
shared.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# OPFs

In [None]:
# Genome-by-OPF count matrix: one row per genome, one column per OPF, each
# cell the number of features assigned to that OPF (0 where none were found).
opf_counts_long = pd.read_sql("""
SELECT mag_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
WHERE func_id NOT NULL
GROUP BY mag_id, func_id
                    """, con=con, index_col=['mag_id', 'func_id'])
data = (
    opf_counts_long['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # rows follow the order of the metadata table
)

# Architecture plus KO / COG annotation of every OPF, then restricted to
# (and ordered like) the OPFs that appear as columns of `data`.
opf_annotations = pd.read_sql("""
SELECT
    opf_id AS func_id
  , architecture
  , ko_id
  , ko.description AS ko_description
  , cog_id
  , cog.description AS cog_description
  , function_category AS cog_category
FROM opf_to_architecture
LEFT JOIN opf_to_ko USING (opf_id)
LEFT JOIN ko USING (ko_id)
LEFT JOIN opf_to_cog USING (opf_id)
LEFT JOIN cog USING (cog_id)
                       """, con=con, index_col=['func_id'])
function = opf_annotations.loc[data.columns]

In [None]:
def _distinct_opf_ids(table):
    """Return the distinct non-null opf_id values of the features listed in *table*.

    *table* is joined to ``feature_details`` on feature_id.  Its name is
    interpolated into the SQL text, so pass only hard-coded table names.
    """
    hits = pd.read_sql("""
SELECT DISTINCT opf_id
FROM feature_details
JOIN {} USING (feature_id)
WHERE opf_id NOT NULL
""".format(table), con=con).dropna()
    return list(hits.opf_id)


susC_OPF_list = _distinct_opf_ids('putative_susC')
susD_OPF_list = _distinct_opf_ids('putative_susD')
susEF_OPF_list = _distinct_opf_ids('putative_susEF')

# The original cell converted `susEF_OPF_list` a second time here, which
# raised AttributeError (it was already a list) and left `gh_OPF_list`
# unconverted; the query result is now turned into `gh_OPF_list` itself.
# NOTE(review): this query still reads `putative_susEF`, exactly as before,
# which looks like a copy-paste leftover — confirm which table should feed
# the GH list.
gh_OPF_list = _distinct_opf_ids('putative_susEF')

In [None]:
# Count the functions found only in Otu0001_vC ('just_A'), only in
# Otu0001_vB ('just_B'), or in both genomes.
ab_presence = (data.loc[['Otu0001_vC', 'Otu0001_vB']] > 0).T
in_a = ab_presence['Otu0001_vC']
in_b = ab_presence['Otu0001_vB']
pd.DataFrame({'just_A': in_a > in_b, 'just_B': in_b > in_a, 'both': in_a & in_b}).sum()

## Ordination

### Carb COG OPFs

In [None]:
# PCA ordination restricted to the OPFs whose cog_category is 'G'.
# NOTE(review): `function` is built from a chain of LEFT JOINs; if one OPF can
# map to several KOs/COGs its func_id would repeat here and its column would
# be duplicated in `d` — confirm.
feats = function[function.cog_category.isin(['G'])].index
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### SusC/SusD/SusEF OPFs

In [None]:
# PCA ordination restricted to the OPFs flagged as putative SusC, SusD or SusEF.
# Only ids that also occur in `function` are kept; sorting the intersection
# gives the columns a reproducible order (a bare set has none).
feats = sorted(set(function.index) & set(susC_OPF_list + susD_OPF_list + susEF_OPF_list))
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class.isin(['ormerod'])])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

### All OPFs

In [None]:
# PCA ordination on every OPF that is present (count > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the left of their points.
text_offset_x = -0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='right', va='center', fontsize=12)

# Setup legend
ax.legend(loc='lower right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Summarise how widespread each OPF is across the genomes.
# 'all': number of genomes in which the OPF is present.
func_summ = {'all': (data > 0).sum()}
# One column per guild: fraction of that guild's genomes with the OPF.
for guild in mag.guild.dropna().unique():
    func_summ[guild] = (data.loc[mag.guild == guild] > 0).mean()

# One column per genome class: number of that class's genomes with the OPF.
for mag_class in mag.mag_class.dropna().unique():
    func_summ[mag_class] = (data.loc[mag.mag_class == mag_class] > 0).sum()

func_summary = pd.DataFrame(func_summ).join(function)

# Flag the putative Sus OPFs.  A membership test is used instead of
# `.loc[ids, col] = True` because label-based assignment can raise KeyError
# when one of the listed ids is missing from the summary's index.
func_summary['susC'] = func_summary.index.isin(susC_OPF_list)
func_summary['susD'] = func_summary.index.isin(susD_OPF_list)
func_summary['susEF'] = func_summary.index.isin(susEF_OPF_list)

In [None]:
# Functions present in Otu0001_vC and Otu0007_vA, and found in more than one
# 'ormerod' genome: show the ten with the lowest 'here' value, ties broken by
# the highest 'starch' value.
# (A further filter on Otu0001_vB presence is currently switched off.)
in_b1a = data.loc['Otu0001_vC'] > 0
in_b2 = data.loc['Otu0007_vA'] > 0
in_several_ormerod = func_summary.ormerod > 1
selected = func_summary[in_b1a & in_b2 & in_several_ormerod]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

In [None]:
# Functions present in Otu0001_vC and Otu0007_vA that are flagged as putative
# SusC or SusD, ordered by lowest 'here' value then highest 'starch' value.
# `func_summary` has separate boolean `susC` and `susD` columns and no
# `susC_or_susD` column, so the two flags are OR-ed together here.
(func_summary
             [ (data.loc['Otu0001_vC'] > 0)
#             & (data.loc['Otu0001_vB'] > 0)
             & (data.loc['Otu0007_vA'] > 0)
             & (func_summary.susC | func_summary.susD)
             ]
             .sort_values(['here', 'starch'], ascending=[True, False])
)

In [None]:
# Functions present in Otu0001_vC and Otu0007_vA, and found in more than one
# 'ormerod' genome: show the ten with the lowest 'here' value, ties broken by
# the highest 'starch' value.
# (A further filter on Otu0001_vB presence is currently switched off.)
in_b1a = data.loc['Otu0001_vC'] > 0
in_b2 = data.loc['Otu0007_vA'] > 0
in_several_ormerod = func_summary.ormerod > 1
selected = func_summary[in_b1a & in_b2 & in_several_ormerod]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

In [None]:
# Functions present in Otu0001_vC and Otu0007_vA but with a zero tally in
# Otu0001_vB, ordered by descending 'starch' value.
in_b1a = data.loc['Otu0001_vC'] > 0
absent_b1b = data.loc['Otu0001_vB'] == 0
in_b2 = data.loc['Otu0007_vA'] > 0
selected = func_summary[in_b1a & absent_b1b & in_b2]
selected.sort_values(['starch'], ascending=[False])

# KOs

In [None]:
# Genome-by-KO count matrix: one row per genome, one column per KO, each
# cell the number of features assigned to that KO (0 where none were found).
ko_counts_long = pd.read_sql("""
SELECT mag_id, ko_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_ko USING (feature_id)
WHERE func_id NOT NULL
GROUP BY mag_id, func_id
                    """, con=con, index_col=['mag_id', 'func_id'])
data = (
    ko_counts_long['tally']
    .unstack('func_id', fill_value=0)
    .reindex(mag.index)  # rows follow the order of the metadata table
)

# Description of every KO, then restricted to (and ordered like) the KOs
# that appear as columns of `data`.
ko_annotations = pd.read_sql("""
SELECT ko_id AS func_id, description
FROM ko
                       """, con=con, index_col=['func_id'])
function = ko_annotations.loc[data.columns]

In [None]:
# Count the functions found only in Otu0001_vC ('just_A'), only in
# Otu0001_vB ('just_B'), or in both genomes.
ab_presence = (data.loc[['Otu0001_vC', 'Otu0001_vB']] > 0).T
in_a = ab_presence['Otu0001_vC']
in_b = ab_presence['Otu0001_vB']
pd.DataFrame({'just_A': in_a > in_b, 'just_B': in_b > in_a, 'both': in_a & in_b}).sum()

## Ordination

### All

In [None]:
# PCA ordination on every KO that is present (count > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the right of their points.
text_offset_x = 0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

## Frequencies

In [None]:
# Summarise how widespread each function is across the genomes.
presence = data > 0

# 'all': number of genomes in which the function is present.
func_summ = {'all': presence.sum()}
# One column per guild: fraction of that guild's genomes with the function.
for guild in mag.guild.dropna().unique():
    func_summ[guild] = presence.loc[mag.guild == guild].mean()
# One column per genome class: number of that class's genomes with the function.
for mag_class in mag.mag_class.dropna().unique():
    func_summ[mag_class] = presence.loc[mag.mag_class == mag_class].sum()

# Attach the per-function annotation columns.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in both Otu0001_vC and Otu0007_vA: show the ten with the
# lowest 'here' value, ties broken by the highest 'starch' value.
# (A further filter on Otu0001_vB presence is currently switched off.)
in_b1a = data.loc['Otu0001_vC'] > 0
in_b2 = data.loc['Otu0007_vA'] > 0
selected = func_summary[in_b1a & in_b2]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# GHs

In [None]:
# Genome-by-GH-domain count matrix: one row per genome, one column per CAZy
# domain whose id starts with 'GH', each cell the number of matching features.
data = (pd.read_sql("""
SELECT mag_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
WHERE func_id LIKE 'GH%'
GROUP BY mag_id, func_id
                    """, con=con, index_col=['mag_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Combine domain subfamilies: columns sharing the text before their last '_'
# are summed into one column.  The grouping runs on the transposed frame
# because `groupby(..., axis=1)` is deprecated in recent pandas releases.
family = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(family).sum().T

function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

# Keep only (and order like) the combined columns of `data`.
function = function.loc[data.columns]


## Ordination

### All

In [None]:
# PCA ordination on every GH family that is present (count > 0) in more than two genomes.
feats = data.columns[((data > 0).sum() > 2)]
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the right of their points.
text_offset_x = 0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')

In [None]:
# Clustered heat map of sqrt-transformed GH tallies (functions as rows,
# genomes as columns) for the 'ormerod' and 'here' genomes, limited to
# functions present in more than one genome.
keep_rows = mag.mag_class.isin(['ormerod', 'here'])
keep_cols = (data > 0).sum() > 1
heat_values = data.loc[keep_rows, keep_cols].apply(np.sqrt).T
sns.clustermap(heat_values, robust=True, figsize=(10, 25), xticklabels=1, yticklabels=1)

## Frequencies

In [None]:
# Summarise how widespread each function is across the genomes.
presence = data > 0

# 'all': number of genomes in which the function is present.
func_summ = {'all': presence.sum()}
# One column per guild: fraction of that guild's genomes with the function.
for guild in mag.guild.dropna().unique():
    func_summ[guild] = presence.loc[mag.guild == guild].mean()
# One column per genome class: number of that class's genomes with the function.
for mag_class in mag.mag_class.dropna().unique():
    func_summ[mag_class] = presence.loc[mag.mag_class == mag_class].sum()

# Attach the per-function annotation columns.
func_summary = pd.DataFrame(func_summ).join(function)

In [None]:
# Functions present in both Otu0001_vC and Otu0007_vA: show the ten with the
# lowest 'here' value, ties broken by the highest 'starch' value.
# (A further filter on Otu0001_vB presence is currently switched off.)
in_b1a = data.loc['Otu0001_vC'] > 0
in_b2 = data.loc['Otu0007_vA'] > 0
selected = func_summary[in_b1a & in_b2]
selected.sort_values(['here', 'starch'], ascending=[True, False]).head(10)

# Non-cytosolic GHs

In [None]:
# List every distinct localization label stored in feature_localization.
pd.read_sql("SELECT DISTINCT localization FROM feature_localization", con=con)

In [None]:
# Genome-by-GH-domain count matrix as in the GH section, but counting only
# features whose localization is one of the labels listed in the IN clause.
# ('IM?' appears twice in that list; the repeat has no effect on the query.)
data = (pd.read_sql("""
SELECT mag_id, domain_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_x_cazy_minimal_domain USING (feature_id)
JOIN feature_localization USING (feature_id)
WHERE func_id LIKE 'GH%'
  AND localization IN ('PP', 'OM', 'OM?', 'PP?/OM?', 'IM', 'IM?', 'PP?', 'OM??', 'PP??', 'IM?')
GROUP BY mag_id, func_id
                    """, con=con, index_col=['mag_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
data = data.reindex(mag.index)

# Combine domain subfamilies: columns sharing the text before their last '_'
# are summed into one column.  The grouping runs on the transposed frame
# because `groupby(..., axis=1)` is deprecated in recent pandas releases.
family = data.columns.map(lambda x: x.rsplit('_', 1)[0])
data = data.T.groupby(family).sum().T

function = pd.read_sql("""
SELECT domain_id AS func_id
FROM cazy_domain
WHERE func_id LIKE 'GH%'
                       """, con=con, index_col=['func_id'])

# Keep only (and order like) the combined columns of `data`.
function = function.loc[data.columns]


## Ordination

### All

In [None]:
# PCA ordination on every GH family that is present (count > 0) in more than one genome.
feats = data.columns[((data > 0).sum() > 1)]
d = data[feats].apply(np.sqrt)
# Axes are fit on the 'ormerod' genomes only; all genomes are then projected.
fit = PCA().fit(d.loc[mag.mag_class == 'ormerod'])
ordin = pd.DataFrame(fit.transform(d), index=d.index).rename(lambda i: 'PC{}'.format(i + 1), axis='columns')
perc_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

fig, ax = plt.subplots(figsize=(10, 10))

x, y = 'PC1', 'PC2'
text_offset_x = 0.08
text_offset_y = 0

# Axis titles show the fraction of variance explained; tick labels are hidden.
ax.set_xlabel('{} ({:.0%})'.format(x, perc_explained[x]))
ax.set_ylabel('{} ({:.0%})'.format(y, perc_explained[y]))
ax.set_yticklabels([])
ax.set_xticklabels([])

# Plot Ormerod: one colour and one legend entry per guild.
for guild in ['plant', 'host', 'starch']:
    d1 = ordin[(mag.mag_class == 'ormerod') & (mag.guild == guild)]
    ax.scatter(x, y, data=d1,
               c=color_map[guild], label=guild)

# Plot Mine: the 'here' subset is selected once and reused for the labels.
here_ordin = ordin[mag.mag_class == 'here']
ax.scatter(x, y, data=here_ordin,
           color='black', marker='^', alpha=1,
           label='__nolegend__')
for mag_id, coords in here_ordin.iterrows():
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Plot Reference
ref_mags = ['Muribaculum_intestinale_yl27']
ax.scatter(x, y, data=ordin.loc[ref_mags],
           color='grey', alpha=1,
           label='__nolegend__')

# These labels are shifted up and to the right of their points.
text_offset_x = 0.08
text_offset_y = 0.1
labeled_mags = ['Muribaculum_intestinale_yl27', 'M6']
for mag_id in labeled_mags:
    coords = ordin.loc[mag_id]
    ax.annotate(rename_mag(mag_id), xy=(coords[x] + text_offset_x, coords[y] + text_offset_y),
                ha='left', va='center', fontsize=12)

# Setup legend
ax.legend(loc='upper right')

# Save unzoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_unzoomed.pdf')

# Zoom view
#ax.set_xlim(-5, 5)
#ax.set_ylim(-8, 2)

# Save zoomed plot
#fig.savefig('/Users/bjsmith/Desktop/opf_ordination_zoomed.pdf')