# Preamble

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns

In [None]:
def pcoa_t(counts, diss='jaccard', return_prop_explained=False):
    """Ordinate the rows of *counts* by principal coordinates analysis.

    A distance matrix is computed with ``beta_diversity`` using the metric
    named by *diss* (``counts.values`` as data, ``counts.index`` as ids) and
    passed to ``pcoa``.

    Returns the ``samples`` table of the PCoA result.  When
    *return_prop_explained* is true, returns a ``(samples, proportions)``
    tuple instead, where ``proportions`` is a Series built from the result's
    ``proportion_explained`` and indexed by the columns of ``samples``.
    """
    distances = beta_diversity(diss, counts.values, ids=counts.index)
    ordination = pcoa(distances)
    coords = ordination.samples
    if not return_prop_explained:
        return coords
    explained = pd.Series(ordination.proportion_explained, index=coords.columns)
    return coords, explained

def pca_t(counts, return_prop_explained=False):
    """Fit a PCA on *counts* and return the transformed rows.

    The result is a DataFrame indexed like *counts* whose columns are named
    ``PC0``, ``PC1``, ...  When *return_prop_explained* is true, returns a
    ``(scores, proportions)`` tuple instead, where ``proportions`` is each
    component's explained variance divided by the total, indexed by the
    score columns.
    """
    model = PCA().fit(counts)
    scores = pd.DataFrame(model.transform(counts), index=counts.index)
    scores = scores.rename(lambda i: 'PC{}'.format(i), axis='columns')
    if not return_prop_explained:
        return scores
    variances = model.explained_variance_
    proportions = pd.Series(variances / variances.sum(), index=scores.columns)
    return scores, proportions

In [None]:
# Category label ('plant', 'host' or 'starch') for each reference strain id.
# Each strain is listed exactly once ('H3' was previously entered twice).
ormerod_desig = {
    # plant
    'GP4': 'plant',
    'GP3': 'plant',
    'H7': 'plant',
    'K1': 'plant',
    'M13': 'plant',
    'GP1': 'plant',
    'M2': 'plant',
    'M8': 'plant',
    'M1': 'plant',
    'M12': 'plant',
    'H5': 'plant',
    'Homeothermus_arabinoxylanisolvens': 'plant',
    # host
    'GP2': 'host',
    'M9': 'host',
    'M14': 'host',
    'H6': 'host',
    'M5': 'host',
    # starch
    'M6': 'starch',
    'M11': 'starch',
    'H2': 'starch',
    'H4': 'starch',
    'H10': 'starch',
    'M10': 'starch',
    'H3': 'starch',
    'H8': 'starch',
    'H9': 'starch',
    'H1': 'starch',
    'M7': 'starch',
    'K10': 'starch',
    'M3': 'starch',
}

# Taxon label for each MAG id.
mag_desig = {
    'Otu0001_vC': 'Muribaculaceae',
    'Otu0001_vB': 'Muribaculaceae',
    'Otu0007_vA': 'Muribaculaceae',
    'Otu0009_vA': 'Muribaculaceae',
    'Otu0005_vA': 'Muribaculaceae',
    'Otu0004_vA': 'Muribaculaceae',
    'Otu0049_vA': 'Muribaculaceae',
    'Otu0017_vA': 'Muribaculaceae',
    'Otu0041_v0': 'Bacteroides',
}

# All designated reference strains plus one strain that has no designation.
ormerod_strains = list(ormerod_desig) + ['Muribaculum_intestinale_yl27']
# Strain ids per category, in the order they appear in ormerod_desig.
plant_strain = [k for k, v in ormerod_desig.items() if v == 'plant']
host_strain = [k for k, v in ormerod_desig.items() if v == 'host']
starch_strain = [k for k, v in ormerod_desig.items() if v == 'starch']
# MAGs labelled Muribaculaceae (excludes the Bacteroides entry).
muri_strains = [k for k, v in mag_desig.items() if v == 'Muribaculaceae']

In [None]:
# Annotation colour for each strain designation, and the number of
# top-loading features to label on each ordination plot.
palette = dict(starch='blue', host='purple', plant='green')
num_features_to_plot = 30

# ECs

In [None]:
# EC id -> description lookup; column names are supplied here rather than
# read from the file.
function = pd.read_table(
    'ref/expasy.tsv',
    names=['func_id', 'description'],
    index_col='func_id',
)

In [None]:
# Long (mag_id, func_id, tally) table pivoted to one row per mag_id and one
# column per func_id; combinations absent from the file become 0.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.ec-annot.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )['tally']
    .unstack('func_id', fill_value=0)
)

In [None]:
# Give every function in the reference table a column in `data`, filled with
# zero counts where it is missing.  All missing columns are added in a single
# reindex instead of one insert per column, which fragments a wide DataFrame.
missing_func_ids = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_func_ids), fill_value=0)

In [None]:
# Total tally per row of `data`, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each function, the fraction of genomes in each strain group that have a
# non-zero count for it.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()
# One column per group, in display order.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed counts over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0) over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Carb Related

### Abundance

In [None]:
# PCA of square-root-transformed counts, restricted to functions whose id
# starts with '3.2.1'.
feats = list(set(function[function.index.str.startswith('3.2.1')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0), restricted to functions whose id
# starts with '3.2.1'.  This cell sits under the "Carb Related" heading but
# previously selected every function, making it a duplicate of the
# "All Categories" presence/absence plot; the same '3.2.1' filter as the
# abundance cell in this section is now applied.
feats = list(set(function[function.index.str.startswith('3.2.1')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Scatter of muri_freq against ormerod_freq per function, coloured by their
# difference.  Uniform jitter is added on both axes before plotting.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
spread_x = 0.02
spread_y = 0.03
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

In [None]:
# Functions with a non-zero count in both Otu0001_vC and Otu0007_vA, joined
# with the frequency table; first 20 after sorting by muri_freq (ascending),
# then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False])
         .head(20))

### Present in OTU-1-UM but not OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and a zero count in
# Otu0001_vB, joined with the frequency table, highest starch_freq first.
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
         .join(freq)
         .sort_values('starch_freq', ascending=False))

### Present in OTU-1-UT but not OTU-1-UM

In [None]:
# Functions with a zero count in Otu0001_vC and a non-zero count in
# Otu0001_vB, joined with the frequency table, highest starch_freq first.
(function.loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
         .join(freq)
         .sort_values('starch_freq', ascending=False))

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and Otu0007_vA but a zero
# count in Otu0001_vB, joined with the frequency table; top 10 by host_freq.
(function.loc[data.loc['Otu0001_vC'].gt(0)
              & data.loc['Otu0001_vB'].eq(0)
              & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['host_freq'], ascending=False)
         .head(10))

### Missing in OTU-1-UM and OTU-7 but present in OTU-1-UT

In [None]:
# Functions with a zero count in Otu0001_vC and Otu0007_vA but a non-zero
# count in Otu0001_vB, joined with the frequency table; top 10 by ormerod_freq.
(function.loc[data.loc['Otu0001_vC'].eq(0)
              & data.loc['Otu0001_vB'].gt(0)
              & data.loc['Otu0007_vA'].eq(0)]
         .join(freq)
         .sort_values('ormerod_freq', ascending=False)
         .head(10))

# COGs

In [None]:
# COG annotation table indexed by its cog_id column, with the index renamed
# to func_id.
function = pd.read_table('ref/cog_function.tsv', index_col='cog_id').rename_axis('func_id')

In [None]:
# Long (mag_id, func_id, tally) table pivoted to one row per mag_id and one
# column per func_id; combinations absent from the file become 0.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.cog-annot.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )['tally']
    .unstack('func_id', fill_value=0)
)

In [None]:
# Give every function in the reference table a column in `data`, filled with
# zero counts where it is missing.  All missing columns are added in a single
# reindex instead of one insert per column, which fragments a wide DataFrame.
missing_func_ids = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_func_ids), fill_value=0)

In [None]:
# Total tally per row of `data`, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each function, the fraction of genomes in each strain group that have a
# non-zero count for it.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()
# One column per group, in display order.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed counts over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0) over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Carbohydrates

### Abundance


In [None]:
# PCA of square-root-transformed counts, restricted to functions whose
# function_categories string contains 'G'.
feats = list(set(function[lambda x: x.function_categories.str.contains('G')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 5
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

In [None]:
# Clustered heatmap of the columns of `d` with a non-zero sum, transposed and
# passed through np.sqrt.
# NOTE(review): if `d` comes from the Abundance cell above it has already been
# square-root transformed, so this applies a second square root — confirm
# that is intended.
sns.clustermap(d.loc[:,d.sum() != 0].T.apply(np.sqrt), robust=True,
                   figsize=(10, 10), vmin=0)

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0), restricted to functions whose
# function_categories string contains 'G'.
feats = list(set(function[lambda x: x.function_categories.str.contains('G')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Ormerod COGs (mostly carbs)

### Abundance

In [None]:
# PCA of square-root-transformed counts over a fixed list of eight COG ids
# (those of them that are columns of `data`).
feats = list({'COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119'} &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 5
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Amino-acid COGs

### Abundance

In [None]:
# PCA of square-root-transformed counts, restricted to functions whose
# function_categories string contains 'E'.
feats = list(set(function[lambda x: x.function_categories.str.contains('E')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 5
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

In [None]:
# Coordinates of three MAGs on the first two components of `ordin`.
ordin.loc[
    ['Otu0001_vC', 'Otu0001_vB', 'Otu0007_vA'],
    ['PC0', 'PC1'],
]

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0), restricted to functions whose
# function_categories string contains 'E'.
feats = list(set(function[lambda x: x.function_categories.str.contains('E')].index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Scatter of muri_freq against ormerod_freq per function, coloured by their
# difference.  Uniform jitter is added on both axes before plotting.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
spread_x = 0.02
spread_y = 0.03
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

In [None]:
# Functions with a non-zero count in both Otu0001_vC and Otu0007_vA, joined
# with the frequency table; first 20 after sorting by muri_freq (ascending),
# then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False])
         .head(20))

### Present in OTU-1-UM but not OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and a zero count in
# Otu0001_vB, joined with the frequency table and sorted by muri_freq
# (ascending), then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False]))

### Present in OTU-1-UT but not OTU-1-UM

In [None]:
# Functions with a zero count in Otu0001_vC and a non-zero count in
# Otu0001_vB, joined with the frequency table and sorted by muri_freq
# (ascending), then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False]))

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and Otu0007_vA but a zero
# count in Otu0001_vB, joined with the frequency table; top 10 by host_freq.
(function.loc[data.loc['Otu0001_vC'].gt(0)
              & data.loc['Otu0001_vB'].eq(0)
              & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['host_freq'], ascending=False)
         .head(10))

### Missing in OTU-1-UM and OTU-7 but present in OTU-1-UT

In [None]:
# Functions with a zero count in Otu0001_vC and Otu0007_vA but a non-zero
# count in Otu0001_vB, joined with the frequency table; first 10 after
# sorting by muri_freq (ascending), then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].eq(0)
              & data.loc['Otu0001_vB'].gt(0)
              & data.loc['Otu0007_vA'].eq(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False])
         .head(10))

# KOs

In [None]:
# KO annotation table: the koid / koname columns are renamed to
# func_id / description and func_id becomes the index.
function = pd.read_table('ref/kegg.tsv')
function = function.rename(columns={'koid': 'func_id', 'koname': 'description'})
function = function.set_index('func_id')

function

In [None]:
# Long (mag_id, func_id, tally) table pivoted to one row per mag_id and one
# column per func_id; combinations absent from the file become 0.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.ko-annot.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )['tally']
    .unstack('func_id', fill_value=0)
)

In [None]:
# Give every function in the reference table a column in `data`, filled with
# zero counts where it is missing.  All missing columns are added in a single
# reindex instead of one insert per column, which fragments a wide DataFrame.
missing_func_ids = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_func_ids), fill_value=0)

In [None]:
# Total tally per row of `data`, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each function, the fraction of genomes in each strain group that have a
# non-zero count for it.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()
# One column per group, in display order.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed counts over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0) over every function in the table.
feats = list(set(function.index) &
             set(data.columns))
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit on the rows left after dropping muri_strains, then project all rows.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Label colour follows the strain's designation; undesignated rows are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every row, so set them once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector on the two plotted
# components and label the top ones at their scaled loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Scatter of muri_freq against ormerod_freq per function, coloured by their
# difference.  Uniform jitter is added on both axes before plotting.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
spread_x = 0.02
spread_y = 0.03
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

In [None]:
# Functions with a non-zero count in both Otu0001_vC and Otu0007_vA, joined
# with the frequency table; first 20 after sorting by muri_freq (ascending),
# then starch_freq (descending).
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['muri_freq', 'starch_freq'], ascending=[True, False])
         .head(20))

### Present in OTU-1-UM but not OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and a zero count in
# Otu0001_vB, joined with the frequency table, highest starch_freq first.
(function.loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
         .join(freq)
         .sort_values('starch_freq', ascending=False))

### Present in OTU-1-UT but not OTU-1-UM

In [None]:
# Functions with a zero count in Otu0001_vC and a non-zero count in
# Otu0001_vB, joined with the frequency table, highest starch_freq first.
(function.loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
         .join(freq)
         .sort_values('starch_freq', ascending=False))

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions with a non-zero count in Otu0001_vC and Otu0007_vA but a zero
# count in Otu0001_vB, joined with the frequency table; top 10 by host_freq.
(function.loc[data.loc['Otu0001_vC'].gt(0)
              & data.loc['Otu0001_vB'].eq(0)
              & data.loc['Otu0007_vA'].gt(0)]
         .join(freq)
         .sort_values(['host_freq'], ascending=False)
         .head(10))

### Missing in OTU-1-UM and OTU-7 but present in OTU-1-UT

In [None]:
# Functions with a zero count in Otu0001_vC and Otu0007_vA but a non-zero
# count in Otu0001_vB, joined with the frequency table; top 10 by ormerod_freq.
(function.loc[data.loc['Otu0001_vC'].eq(0)
              & data.loc['Otu0001_vB'].gt(0)
              & data.loc['Otu0007_vA'].eq(0)]
         .join(freq)
         .sort_values('ormerod_freq', ascending=False)
         .head(10))

# OPFs

In [None]:
# Long (mag_id, func_id, tally) table pivoted to one row per mag_id and one
# column per func_id; combinations absent from the file become 0.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.denovo50-clust.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )['tally']
    .unstack('func_id', fill_value=0)
)

In [None]:
# Column-less table with one row per feature (column of `data`); later cells
# join per-feature results such as `freq` and `magnitude` onto it.
function = pd.DataFrame({}, index=data.columns)

In [None]:
# Total tally per row of `data` (summing across feature columns), largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each feature, the fraction of rows in each genome group whose tally is > 0.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()

# Collect the per-group frequencies into one table, one column per group,
# with the column order fixed explicitly.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed tallies over every feature column of `data`.
feats = list(data.columns)
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 200  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0) over the features found in both
# `function` and `data`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(function.index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 300  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Common Clusters

In [None]:
# A feature's all_freq must be strictly greater than this value for it to be
# included in the "Common Clusters" ordinations.
freq_thresh = 0.10

### Abundance

In [None]:
# PCA of square-root-transformed tallies, restricted to features whose
# all_freq exceeds `freq_thresh`.  Features are taken in `data` column order:
# a set intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 100  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0), restricted to features whose all_freq
# exceeds `freq_thresh`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Half-widths of the uniform jitter added to each axis so that features with
# identical frequencies do not plot exactly on top of one another.
spread_x, spread_y = 0.02, 0.03

# Per-feature frequency table plus the muri-minus-ormerod difference, which
# colours the points.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# Strip plot of ormerod_freq against muri_freq for every feature in `freq`.
# The variables are passed by keyword because seaborn >= 0.12 no longer
# accepts x and y positionally; the keyword form works on older releases too.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

Traits shared by these two responding OTUs,
rare in other strains reconstructed in this study,
and common in starch-specializing Muribaculaceae (according to Ormerod),
may be evidence of a shared niche for these two strains.

In [None]:
# Features with a tally > 0 in both Otu0001_vC and Otu0007_vA, joined with
# the frequency table.  Lowest muri_freq first, ties broken by highest
# all_freq; first 40 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['muri_freq', 'all_freq'], ascending=[True, False])
    .head(40)
)

In [None]:
# Features with a tally > 0 in Bacteroides_ovatus_ATCC_8483, Otu0001_vC and
# Otu0007_vA but exactly 0 in Bacteroides_thetaiotaomicron_VPI5482, joined
# with the frequency table.  Lowest muri_freq first, ties broken by highest
# starch_freq; first 20 rows shown.
(
    function
    .loc[
        data.loc['Bacteroides_thetaiotaomicron_VPI5482'].eq(0)
        & data.loc['Bacteroides_ovatus_ATCC_8483'].gt(0)
        & data.loc['Otu0001_vC'].gt(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['muri_freq', 'starch_freq'], ascending=[True, False])
    .head(20)
)

### Present in either OTU-1-UM or OTU-1-UT but not both

Mechanism of non-response to ACA at UT?

In [None]:
# Features with a tally > 0 in Otu0001_vC and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
    .join(freq)
    .sort_values(by='starch_freq', ascending=False)
    .head(10)
)

In [None]:
# Features with exactly 0 in Otu0001_vC and a tally > 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
    .join(freq)
    .sort_values(by='starch_freq', ascending=False)
    .head(10)
)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Features with a tally > 0 in Otu0001_vC and Otu0007_vA but exactly 0 in
# Otu0001_vB, joined with the frequency table.  Ordered by descending
# starch_freq; first 10 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].gt(0)
        & data.loc['Otu0001_vB'].eq(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

Evidence of HGT?  Further suggests same niche?

In [None]:
# Features with a tally > 0 in Otu0001_vB and Otu0007_vA but exactly 0 in
# Otu0001_vC, joined with the frequency table.  Ordered by ascending
# all_freq (rarest first); first 10 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].eq(0)
        & data.loc['Otu0001_vB'].gt(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['all_freq'])
    .head(10)
)

### Present in OTU-7 but not in OTU-1-UT

Potential mechanism of competitive advantage?

In [None]:
# Features with a tally > 0 in Otu0007_vA and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vB'].eq(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1 (either variant) and OTU-7, but not in any Ormerod Muri

In [None]:
# Features with a tally > 0 in Otu0007_vA and in at least one of Otu0001_vB /
# Otu0001_vC, joined with the frequency table and then limited to rows whose
# ormerod_freq is exactly 0.
(
    function
    .loc[
        (data.loc['Otu0001_vB'].gt(0) | data.loc['Otu0001_vC'].gt(0))
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .loc[lambda joined: joined.ormerod_freq.eq(0)]
)

# OPFs w/ CAZy domains

In [None]:
# Load the long-format (mag_id, func_id, tally) table and pivot it into a
# mag_id x func_id matrix of tallies, with 0 for combinations not listed.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.denovo50-clust.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )
    ['tally']
    .unstack(level='func_id', fill_value=0)
)

# Keep only the OPF columns listed in the dbCAN table, in order of first
# appearance in that table.
dbcan_list = pd.read_table('opfs_with_dbcan_domains.tsv')
data = data.loc[:, list(dbcan_list['opf_id'].unique())]

In [None]:
# One row per OPF (`func_id`) whose `description` is the comma-separated list
# of that OPF's `domain_id` values from `dbcan_list`.
# The column is selected before aggregating so the join runs on the
# description values only; applying a function to whole group frames
# (grouping column included) raises a deprecation warning in recent pandas.
function = (dbcan_list.rename(columns={'opf_id': 'func_id',
                                       'domain_id': 'description'})
                      .groupby('func_id')['description']
                      .agg(', '.join)
                      .to_frame(name='description')
           )


In [None]:
# Total tally per row of `data` (summing across feature columns), largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each feature, the fraction of rows in each genome group whose tally is > 0.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()

# Collect the per-group frequencies into one table, one column per group,
# with the column order fixed explicitly.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed tallies over every feature column of `data`.
feats = list(data.columns)
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 30  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0) over the features found in both
# `function` and `data`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(function.index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 300  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Common Clusters

In [None]:
# A feature's all_freq must be strictly greater than this value for it to be
# included in the "Common Clusters" ordinations.
freq_thresh = 0.10

### Abundance

In [None]:
# PCA of square-root-transformed tallies, restricted to features whose
# all_freq exceeds `freq_thresh`.  Features are taken in `data` column order:
# a set intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 100  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0), restricted to features whose all_freq
# exceeds `freq_thresh`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 20  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Half-widths of the uniform jitter added to each axis so that features with
# identical frequencies do not plot exactly on top of one another.
spread_x, spread_y = 0.02, 0.03

# Per-feature frequency table plus the muri-minus-ormerod difference, which
# colours the points.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# Strip plot of ormerod_freq against muri_freq for every feature in `freq`.
# The variables are passed by keyword because seaborn >= 0.12 no longer
# accepts x and y positionally; the keyword form works on older releases too.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

Traits shared by these two responding OTUs,
rare in other strains reconstructed in this study,
and common in starch-specializing Muribaculaceae (according to Ormerod),
may be evidence of a shared niche for these two strains.

In [None]:
# Features with a tally > 0 in both Otu0001_vC and Otu0007_vA, joined with
# the frequency table.  Lowest muri_freq first, ties broken by highest
# all_freq; first 20 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['muri_freq', 'all_freq'], ascending=[True, False])
    .head(20)
)

In [None]:
# Features with a tally > 0 in Bacteroides_ovatus_ATCC_8483, Otu0001_vC and
# Otu0007_vA but exactly 0 in Bacteroides_thetaiotaomicron_VPI5482, joined
# with the frequency table.  Lowest muri_freq first, ties broken by highest
# starch_freq; first 20 rows shown.
(
    function
    .loc[
        data.loc['Bacteroides_thetaiotaomicron_VPI5482'].eq(0)
        & data.loc['Bacteroides_ovatus_ATCC_8483'].gt(0)
        & data.loc['Otu0001_vC'].gt(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['muri_freq', 'starch_freq'], ascending=[True, False])
    .head(20)
)

### Present in either OTU-1-UM or OTU-1-UT but not both

Mechanism of non-response to ACA at UT?

In [None]:
# Features with a tally > 0 in Otu0001_vC and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending all_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
    .join(freq)
    .sort_values(by='all_freq', ascending=False)
    .head(10)
)

In [None]:
# Features with exactly 0 in Otu0001_vC and a tally > 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
    .join(freq)
    .sort_values(by='starch_freq', ascending=False)
    .head(10)
)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Features with a tally > 0 in Otu0001_vC and Otu0007_vA but exactly 0 in
# Otu0001_vB, joined with the frequency table.  Ordered by descending
# starch_freq; first 10 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].gt(0)
        & data.loc['Otu0001_vB'].eq(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

Evidence of HGT?  Further suggests same niche?

In [None]:
# Features with a tally > 0 in Otu0001_vB and Otu0007_vA but exactly 0 in
# Otu0001_vC, joined with the frequency table.  Ordered by ascending
# all_freq (rarest first); first 10 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].eq(0)
        & data.loc['Otu0001_vB'].gt(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['all_freq'])
    .head(10)
)

### Present in OTU-7 but not in OTU-1-UT

Potential mechanism of competitive advantage?

In [None]:
# Features with a tally > 0 in Otu0007_vA and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vB'].eq(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1 (either variant) and OTU-7, but not in any Ormerod Muri

In [None]:
# Features with a tally > 0 in Otu0007_vA and in at least one of Otu0001_vB /
# Otu0001_vC, joined with the frequency table and then limited to rows whose
# ormerod_freq is exactly 0.
(
    function
    .loc[
        (data.loc['Otu0001_vB'].gt(0) | data.loc['Otu0001_vC'].gt(0))
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .loc[lambda joined: joined.ormerod_freq.eq(0)]
)

# Protein architectures

In [None]:
# Load the long-format (mag_id, func_id, tally) table and pivot it into a
# mag_id x func_id matrix of tallies, with 0 for combinations not listed.
data = (
    pd.read_table(
        'data/core.a.mags.muri.g.rfn.architecture-annot.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )
    ['tally']
    .unstack(level='func_id', fill_value=0)
)

In [None]:
# Column-less table with one row per feature (column of `data`); later cells
# join per-feature results such as `freq` and `magnitude` onto it.
function = pd.DataFrame({}, index=data.columns)

In [None]:
# Total tally per row of `data` (summing across feature columns), largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each feature, the fraction of rows in each genome group whose tally is > 0.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()

# Collect the per-group frequencies into one table, one column per group,
# with the column order fixed explicitly.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed tallies over every feature column of `data`.
feats = list(data.columns)
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 50  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0) over the features found in both
# `function` and `data`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(function.index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 70  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Common Clusters

In [None]:
# A feature's all_freq must be strictly greater than this value for it to be
# included in the "Common Clusters" ordinations.
freq_thresh = 0.10

### Abundance

In [None]:
# PCA of square-root-transformed tallies, restricted to features whose
# all_freq exceeds `freq_thresh`.  Features are taken in `data` column order:
# a set intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]).apply(np.sqrt)

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 100  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0), restricted to features whose all_freq
# exceeds `freq_thresh`.  Features are taken in `data` column order: a set
# intersection would yield an order that changes between interpreter
# sessions because of string hash randomisation.
feats = list(data.columns[data.columns.isin(freq[freq.all_freq > freq_thresh].index)])
d = (data.loc[ormerod_strains + muri_strains, feats]) > 0

# Fit the PCA with the `muri_strains` rows dropped, then project every row
# of `d` (including the dropped ones) onto the fitted axes.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Scatter the first two components; marker shade encodes the third.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Label colour comes from the genome's `ormerod_desig` category when it
    # has one; every other genome is labelled in black.
    if mag_id in ormerod_desig:
        color = palette[ormerod_desig[mag_id]]
    else:
        color = 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are identical for every genome, so they are set once here
# rather than on every pass through the loop above.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Rank features by the length of their loading vector in the plotted plane
# and write the top `num_features_to_plot` names at `scale` times their loadings.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(num_features_to_plot).index]
scale = 80  # stretch factor for the loading labels; presumably tuned by eye for this plot
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Cell output: the highest-magnitude features with their annotations, transposed.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(num_features_to_plot).T

## Frequencies

In [None]:
# Half-widths of the uniform jitter added to each axis so that features with
# identical frequencies do not plot exactly on top of one another.
spread_x, spread_y = 0.02, 0.03

# Per-feature frequency table plus the muri-minus-ormerod difference, which
# colours the points.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# Strip plot of ormerod_freq against muri_freq for every feature in `freq`.
# The variables are passed by keyword because seaborn >= 0.12 no longer
# accepts x and y positionally; the keyword form works on older releases too.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in OTU-1-UM and OTU-7

Traits shared by these two responding OTUs,
rare in other strains reconstructed in this study,
and common in starch-specializing Muribaculaceae (according to Ormerod),
may be evidence of a shared niche for these two strains.

In [None]:
# Features with a tally > 0 in both Otu0001_vC and Otu0007_vA, joined with
# the frequency table.  Lowest muri_freq first, ties broken by highest
# starch_freq; first 20 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['muri_freq', 'starch_freq'], ascending=[True, False])
    .head(20)
)

### Present in either OTU-1-UM or OTU-1-UT but not both

Mechanism of non-response to ACA at UT?

In [None]:
# Features with a tally > 0 in Otu0001_vC and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].gt(0) & data.loc['Otu0001_vB'].eq(0)]
    .join(freq)
    .sort_values(by='starch_freq', ascending=False)
    .head(10)
)

In [None]:
# Features with exactly 0 in Otu0001_vC and a tally > 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vC'].eq(0) & data.loc['Otu0001_vB'].gt(0)]
    .join(freq)
    .sort_values(by='starch_freq', ascending=False)
    .head(10)
)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Features with a tally > 0 in Otu0001_vC and Otu0007_vA but exactly 0 in
# Otu0001_vB, joined with the frequency table.  Ordered by descending
# starch_freq; first 10 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].gt(0)
        & data.loc['Otu0001_vB'].eq(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

Evidence of HGT?  Further suggests same niche?

In [None]:
# Features with a tally > 0 in Otu0001_vB and Otu0007_vA but exactly 0 in
# Otu0001_vC, joined with the frequency table.  Ordered by descending
# starch_freq; first 20 rows shown.
(
    function
    .loc[
        data.loc['Otu0001_vC'].eq(0)
        & data.loc['Otu0001_vB'].gt(0)
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(20)
)

### Present in OTU-7 but not in OTU-1-UT

Potential mechanism of competitive advantage?

In [None]:
# Features with a tally > 0 in Otu0007_vA and exactly 0 in Otu0001_vB,
# joined with the frequency table.  Ordered by descending starch_freq;
# first 10 rows shown.
(
    function
    .loc[data.loc['Otu0001_vB'].eq(0) & data.loc['Otu0007_vA'].gt(0)]
    .join(freq)
    .sort_values(by=['starch_freq'], ascending=False)
    .head(10)
)

### Present in OTU-1 (either variant) and OTU-7, but not in any Ormerod Muri

In [None]:
# Features with a tally > 0 in Otu0007_vA and in at least one of Otu0001_vB /
# Otu0001_vC, joined with the frequency table and then limited to rows whose
# ormerod_freq is exactly 0.
(
    function
    .loc[
        (data.loc['Otu0001_vB'].gt(0) | data.loc['Otu0001_vC'].gt(0))
        & data.loc['Otu0007_vA'].gt(0)
    ]
    .join(freq)
    .loc[lambda joined: joined.ormerod_freq.eq(0)]
)