# Preamble

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns

In [None]:
def pcoa_t(counts, diss='jaccard', return_prop_explained=False):
    """Ordinate the rows of `counts` with PCoA on a beta-diversity matrix.

    `diss` is the metric name handed to `beta_diversity`; the row index of
    `counts` supplies the sample ids.  Returns the sample coordinates and,
    when `return_prop_explained` is true, also a Series holding the
    proportion explained, indexed by the coordinate column names.
    """
    distances = beta_diversity(diss, counts.values, ids=counts.index)
    ordination = pcoa(distances)
    coords = ordination.samples
    if not return_prop_explained:
        return coords
    explained = pd.Series(ordination.proportion_explained, index=coords.columns)
    return coords, explained

def pca_t(counts, return_prop_explained=False):
    """Fit a PCA to `counts` and return its rows projected onto the components.

    The result keeps the row index of `counts`; columns are named 'PC0',
    'PC1', ...  When `return_prop_explained` is true, a Series with each
    component's share of the summed explained variance is returned as well.
    """
    model = PCA().fit(counts)
    scores = pd.DataFrame(model.transform(counts), index=counts.index)
    scores = scores.rename(columns=lambda i: 'PC{}'.format(i))
    if not return_prop_explained:
        return scores
    variances = model.explained_variance_
    return scores, pd.Series(variances / variances.sum(), index=scores.columns)

In [None]:
# Genome name -> designation ('plant', 'host' or 'starch'); the designation is
# used to group genomes and to pick label colors from `palette`.
# 'H3' is listed once: the literal previously repeated the key 'H3' (both times
# as 'starch'), which Python silently collapses into a single entry.
# NOTE(review): if the second 'H3' was a typo for another genome, add that
# genome here under its real name.
ormerod_desig = {
    # plant
    'GP4': 'plant',
    'GP3': 'plant',
    'H7': 'plant',
    'K1': 'plant',
    'M13': 'plant',
    'GP1': 'plant',
    'M2': 'plant',
    'M8': 'plant',
    'M1': 'plant',
    'M12': 'plant',
    'H5': 'plant',
    'Homeothermus_arabinoxylanisolvens': 'plant',
    # host
    'GP2': 'host',
    'M9': 'host',
    'M14': 'host',
    'H6': 'host',
    'M5': 'host',
    # starch
    'M6': 'starch',
    'M11': 'starch',
    'H2': 'starch',
    'H4': 'starch',
    'H10': 'starch',
    'M10': 'starch',
    'H3': 'starch',
    'H8': 'starch',
    'H9': 'starch',
    'H1': 'starch',
    'M7': 'starch',
    'K10': 'starch',
    'M3': 'starch',
}

# MAG id -> taxon label.  Every MAG is labelled 'Muribaculaceae' except
# 'OTU-41.v0', which is labelled 'Bacteroides'.
mag_desig = dict.fromkeys(
    [
        'OTU-1.vA',
        'OTU-1.vB',
        'OTU-7.vA',
        'OTU-9.vA',
        'OTU-5.vA',
        'OTU-4.vA',
        'OTU-49.vA',
        'OTU-17.vA',
    ],
    'Muribaculaceae',
)
mag_desig['OTU-41.v0'] = 'Bacteroides'

In [None]:
# Label color for each designation used in `ormerod_desig`.
palette = dict(starch='blue', host='purple', plant='green')

# ECs

In [None]:
# Reference table for this section: a header-less, tab-separated file read as
# (func_id, description) and indexed by func_id.
function = pd.read_csv(
    'ref/expasy.tsv',
    sep='\t',
    names=['func_id', 'description'],
    index_col='func_id',
)

In [None]:
from glob import glob


# Build a genome x function count table from the per-genome '*.ec.tsv' files.
mag_paths = glob('res/muri.mags.annot.d/*.ec.tsv')

all_frames = []
for filepath in mag_paths:
    frame = pd.read_table(filepath, names=['orf_id', 'func_id'])
    # The genome id comes from the file name: names starting with 'OTU-' keep
    # their first two dot-separated fields (e.g. 'OTU-1.vA'); all other names
    # keep only the first field.
    name_parts = filepath.split('/')[-1].split('.')
    if name_parts[0].startswith('OTU-'):
        genome_name = '.'.join(name_parts[:2])
    else:
        genome_name = name_parts[0]
    frame['mag_id'] = genome_name
    all_frames.append(frame)

# size() counts the rows for each (genome, function) pair without running a
# Python-level apply over the groups; pairs that never occur are filled with 0
# while unstacking, so the table stays integer-typed throughout.
data = (pd.concat(all_frames)
          .reset_index(drop=True)
          .groupby(['mag_id', 'func_id'])
          .size()
          .unstack('func_id', fill_value=0)
       )

In [None]:
# Add a zero-filled column for every function in the reference table that no
# genome carries, so rows of `data` cover all of `function.index`.  A single
# reindex appends them in one step and in a deterministic order, instead of
# inserting columns one by one in set-iteration order.
missing_funcs = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_funcs), fill_value=0)

In [None]:
# Total annotation count per genome, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# Genome groups used throughout: every genome in `ormerod_desig` plus
# 'Muribaculum_intestinale_yl27', the per-designation subsets, and the MAGs
# labelled 'Muribaculaceae' in `mag_desig`.
ormerod_strains = [*ormerod_desig, 'Muribaculum_intestinale_yl27']
plant_strain = [strain for strain, desig in ormerod_desig.items() if desig == 'plant']
host_strain = [strain for strain, desig in ormerod_desig.items() if desig == 'host']
starch_strain = [strain for strain, desig in ormerod_desig.items() if desig == 'starch']
muri_strains = [mag for mag, desig in mag_desig.items() if desig == 'Muribaculaceae']

In [None]:
# For each function, the fraction of genomes in each group with a count > 0.
presence = data > 0
freq_in_ormerod_strains = presence.loc[ormerod_strains].mean()
freq_in_plant_strains = presence.loc[plant_strain].mean()
freq_in_host_strains = presence.loc[host_strain].mean()
freq_in_starch_strains = presence.loc[starch_strain].mean()
freq_in_muri_strains = presence.loc[muri_strains].mean()
freq_in_all = presence.loc[muri_strains + ormerod_strains].mean()
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    # Explicit column order, independent of how the dict is iterated.
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed counts.  Components are fit on the
# `ormerod_strains` rows only (the `muri_strains` rows are dropped from the
# training set); every genome, MAGs included, is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function.index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0).  Components are fit on the
# `ormerod_strains` rows only (the `muri_strains` rows are dropped from the
# training set); every genome, MAGs included, is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function.index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## Frequencies

In [None]:
# Frequency among the MAGs (y) against frequency among the Ormerod genomes (x),
# one point per function, colored by the difference.  Uniform random jitter
# spreads out points that would otherwise overlap exactly.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
spread_x, spread_y = 0.02, 0.03
n_points = len(a)
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=n_points)
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=n_points)
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in both OTU-1 and OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-7.vA, sorted by ascending `muri_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & in_otu7_va].join(freq)
selected.sort_values('muri_freq').head(10)

### Present in OTU-1.vA but not OTU-1.vB

In [None]:
# Functions found in OTU-1.vA but with a zero count in OTU-1.vB, sorted by
# ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
selected = function.loc[in_otu1_va & not_in_otu1_vb].join(freq)
selected.sort_values('ormerod_freq')

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions found in OTU-1.vA and OTU-7.vA but with a zero count in OTU-1.vB,
# sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1-UM and OTU-7, but present in OTU-1-UT

In [None]:
# Functions found in OTU-1.vB but with zero counts in OTU-1.vA and OTU-7.vA,
# sorted by descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(10)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Functions found in OTU-1.vB and OTU-7.vA but with a zero count in OTU-1.vA,
# sorted by ascending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq')


### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-1.vB but with a zero count in
# OTU-7.vA, sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Functions found in OTU-7.vA but with zero counts in both OTU-1.vA and
# OTU-1.vB, sorted by ascending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1 and OTU-7

In [None]:
# Functions with zero counts in OTU-1.vA, OTU-1.vB and OTU-7.vA, sorted by
# descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(20)

# COGs

In [None]:
# Reference table for this section, indexed by the file's 'cog_id' column; the
# index is renamed to 'func_id' to match the column name used for `data`.
function = pd.read_table('ref/cog_function.tsv', index_col='cog_id').rename_axis('func_id')

In [None]:
from glob import glob


# Build a genome x function count table from the per-genome '*.cog.tsv' files.
mag_paths = glob('res/muri.mags.annot.d/*.cog.tsv')

all_frames = []
for filepath in mag_paths:
    frame = pd.read_table(filepath, names=['orf_id', 'func_id'])
    # The genome id comes from the file name: names starting with 'OTU-' keep
    # their first two dot-separated fields (e.g. 'OTU-1.vA'); all other names
    # keep only the first field.
    name_parts = filepath.split('/')[-1].split('.')
    if name_parts[0].startswith('OTU-'):
        genome_name = '.'.join(name_parts[:2])
    else:
        genome_name = name_parts[0]
    frame['mag_id'] = genome_name
    all_frames.append(frame)

# size() counts the rows for each (genome, function) pair without running a
# Python-level apply over the groups; pairs that never occur are filled with 0
# while unstacking, so the table stays integer-typed throughout.
data = (pd.concat(all_frames)
          .reset_index(drop=True)
          .groupby(['mag_id', 'func_id'])
          .size()
          .unstack('func_id', fill_value=0)
       )

In [None]:
# Add a zero-filled column for every function in the reference table that no
# genome carries, so rows of `data` cover all of `function.index`.  A single
# reindex appends them in one step and in a deterministic order, instead of
# inserting columns one by one in set-iteration order.
missing_funcs = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_funcs), fill_value=0)

In [None]:
# Total annotation count per genome, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each function, the fraction of genomes in each group with a count > 0.
presence = data > 0
freq_in_ormerod_strains = presence.loc[ormerod_strains].mean()
freq_in_plant_strains = presence.loc[plant_strain].mean()
freq_in_host_strains = presence.loc[host_strain].mean()
freq_in_starch_strains = presence.loc[starch_strain].mean()
freq_in_muri_strains = presence.loc[muri_strains].mean()
freq_in_all = presence.loc[muri_strains + ormerod_strains].mean()
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    # Explicit column order, independent of how the dict is iterated.
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed counts.  Components are fit on the
# `ormerod_strains` rows only (the `muri_strains` rows are dropped from the
# training set); every genome, MAGs included, is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function.index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0).  Components are fit on the
# `ormerod_strains` rows only (the `muri_strains` rows are dropped from the
# training set); every genome, MAGs included, is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function.index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## Carbohydrates

### Abundance


In [None]:
# PCA of square-root-transformed counts, restricted to functions whose
# `function_categories` contains 'G'.  Components are fit on the
# `ormerod_strains` rows only; every genome is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function[lambda x: x.function_categories.str.contains('G')].index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 5  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Clustered heatmap (features x genomes) of the columns of `d` with a non-zero sum.
# NOTE(review): if `d` comes from the preceding cell it is already
# square-root-transformed, so the sqrt here is applied a second time — confirm
# that this is intended.
nonzero = d.loc[:, d.sum() != 0]
sns.clustermap(nonzero.T.apply(np.sqrt), robust=True, figsize=(10, 10), vmin=0)

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0), restricted to functions whose
# `function_categories` contains 'G'.  Components are fit on the
# `ormerod_strains` rows only; every genome is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function[lambda x: x.function_categories.str.contains('G')].index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## Ormerod COGs (mostly carbs)

### Abundance

In [None]:
# PCA of square-root-transformed counts for a fixed list of eight COGs.
# Components are fit on the `ormerod_strains` rows only; every genome is then
# projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = {'COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119'}
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 5  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Features ranked by loading magnitude (at most ten), with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## Amino-acid COGs

### Abundance

In [None]:
# PCA of square-root-transformed counts, restricted to functions whose
# `function_categories` contains 'E'.  Components are fit on the
# `ormerod_strains` rows only; every genome is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function[lambda x: x.function_categories.str.contains('E')].index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 5  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Coordinates of three MAGs on the two plotted components.
focal_mags = ['OTU-1.vA', 'OTU-1.vB', 'OTU-7.vA']
ordin.loc[focal_mags, ['PC0', 'PC1']]

### Presence/Absence

In [None]:
# PCA of presence/absence (count > 0), restricted to functions whose
# `function_categories` contains 'E'.  Components are fit on the
# `ormerod_strains` rows only; every genome is then projected onto them.
# Features follow `data`'s column order rather than set-iteration order.
known_funcs = set(function[lambda x: x.function_categories.str.contains('E')].index)
feats = [col for col in data.columns if col in known_funcs]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Each component's share of the summed explained variance of the fit.
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

comps = 'PC0', 'PC1'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per feature, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Length of each feature's loading vector in the plotted plane.
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # stretch applied to the loadings when placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

# Ten features with the largest loading magnitude, with their descriptions.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## Frequencies

In [None]:
# Frequency among the MAGs (y) against frequency among the Ormerod genomes (x),
# one point per function, colored by the difference.  Uniform random jitter
# spreads out points that would otherwise overlap exactly.
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
spread_x, spread_y = 0.02, 0.03
n_points = len(a)
a['x'] = a['ormerod_freq'] + np.random.uniform(-spread_x, spread_x, size=n_points)
a['y'] = a['muri_freq'] + np.random.uniform(-spread_y, spread_y, size=n_points)
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in both OTU-1 and OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-7.vA, sorted by ascending `muri_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & in_otu7_va].join(freq)
selected.sort_values('muri_freq').head(10)

### Present in OTU-1.vA but not OTU-1.vB

In [None]:
# Functions found in OTU-1.vA but with a zero count in OTU-1.vB, sorted by
# ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
selected = function.loc[in_otu1_va & not_in_otu1_vb].join(freq)
selected.sort_values('ormerod_freq')

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions found in OTU-1.vA and OTU-7.vA but with a zero count in OTU-1.vB,
# sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1-UM and OTU-7, but present in OTU-1-UT

In [None]:
# Functions found in OTU-1.vB but with zero counts in OTU-1.vA and OTU-7.vA,
# sorted by descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(10)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Functions found in OTU-1.vB and OTU-7.vA but with a zero count in OTU-1.vA,
# sorted by ascending `ormerod_freq`.
# To narrow to one category, filter on `function_categories`, e.g.
# selected[lambda x: x.function_categories.str.contains('G')].
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq')


### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-1.vB but with a zero count in
# OTU-7.vA, sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Functions found in OTU-7.vA but with zero counts in both OTU-1.vA and
# OTU-1.vB, sorted by ascending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1 and OTU-7

In [None]:
# Functions with zero counts in OTU-1.vA, OTU-1.vB and OTU-7.vA, sorted by
# descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(20)

# MinPath to MetaCyc pathways

In [None]:
# Reference table for this section: a header-less, tab-separated file read as
# (func_id, description) and indexed by func_id.
function = pd.read_csv(
    'ref/metacyc_pathway_descriptions.tsv',
    sep='\t',
    names=['func_id', 'description'],
    index_col='func_id',
)

In [None]:
from glob import glob

# Build a genome x function count table from the per-genome
# '*.ec.minpath.list' files (one func_id per line).
mag_paths = glob('res/muri.mags.annot.d/*.ec.minpath.list')

all_frames = []
for filepath in mag_paths:
    frame = pd.read_table(filepath, names=['func_id'])
    # The genome id comes from the file name: names starting with 'OTU-' keep
    # their first two dot-separated fields (e.g. 'OTU-1.vA'); all other names
    # keep only the first field.
    name_parts = filepath.split('/')[-1].split('.')
    if name_parts[0].startswith('OTU-'):
        genome_name = '.'.join(name_parts[:2])
    else:
        genome_name = name_parts[0]
    frame['mag_id'] = genome_name
    all_frames.append(frame)

# size() counts the rows for each (genome, function) pair without running a
# Python-level apply over the groups (here every column is a grouping key);
# pairs that never occur are filled with 0 while unstacking, so the table
# stays integer-typed throughout.
data = (pd.concat(all_frames)
          .reset_index(drop=True)
          .groupby(['mag_id', 'func_id'])
          .size()
          .unstack('func_id', fill_value=0)
       )

In [None]:
# Add a zero-filled column for every function in the reference table that no
# genome carries, so rows of `data` cover all of `function.index`.  A single
# reindex appends them in one step and in a deterministic order, instead of
# inserting columns one by one in set-iteration order.
missing_funcs = function.index.difference(data.columns)
data = data.reindex(columns=data.columns.append(missing_funcs), fill_value=0)

In [None]:
# Total annotation count per genome, largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each function, the fraction of genomes in each group with a count > 0.
presence = data > 0
freq_in_ormerod_strains = presence.loc[ormerod_strains].mean()
freq_in_plant_strains = presence.loc[plant_strain].mean()
freq_in_host_strains = presence.loc[host_strain].mean()
freq_in_starch_strains = presence.loc[starch_strain].mean()
freq_in_muri_strains = presence.loc[muri_strains].mean()
freq_in_all = presence.loc[muri_strains + ormerod_strains].mean()
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    # Explicit column order, independent of how the dict is iterated.
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## Presence/Absence

In [None]:
# PCoA of the pathway table.  Rows are selected with isin(), so genomes that
# are absent from `data` are skipped rather than raising a KeyError.
feats = list(data.columns)
d = (data.loc[lambda x: x.index.isin(ormerod_strains + muri_strains), feats])

# I use Sorensen-Dice dissimilarity here because it punishes dissimilarities less harshly than
# Jaccard.
ordin, prop_explained = pcoa_t(d, diss='dice', return_prop_explained=True)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Genomes listed in `ormerod_desig` get their designation's palette color; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# The axis labels are the same for every genome, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

## Frequencies

In [None]:
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in both OTU-1 and OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-7.vA, sorted by ascending `muri_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & in_otu7_va].join(freq)
selected.sort_values('muri_freq').head(10)

### Absent in both OTU-1 and OTU-7

In [None]:
# Functions with zero counts in both OTU-1.vA and OTU-7.vA, sorted by
# descending `all_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & not_in_otu7_va].join(freq)
selected.sort_values('all_freq', ascending=False).head(10)

### Present in OTU-1.vA but not OTU-1.vB

In [None]:
# Functions found in OTU-1.vA but with a zero count in OTU-1.vB, sorted by
# ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
selected = function.loc[in_otu1_va & not_in_otu1_vb].join(freq)
selected.sort_values('ormerod_freq')

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Functions found in OTU-1.vA and OTU-7.vA but with a zero count in OTU-1.vB,
# sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1-UM and OTU-7, but present in OTU-1-UT

In [None]:
# Functions found in OTU-1.vB but with zero counts in OTU-1.vA and OTU-7.vA,
# sorted by descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(10)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Functions found in OTU-1.vB and OTU-7.vA but with a zero count in OTU-1.vA,
# sorted by ascending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq')


### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Functions found in both OTU-1.vA and OTU-1.vB but with a zero count in
# OTU-7.vA, sorted by ascending `ormerod_freq`.
in_otu1_va = data.loc['OTU-1.vA'] > 0
in_otu1_vb = data.loc['OTU-1.vB'] > 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[in_otu1_va & in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Functions found in OTU-7.vA but with zero counts in both OTU-1.vA and
# OTU-1.vB, sorted by ascending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
in_otu7_va = data.loc['OTU-7.vA'] > 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & in_otu7_va].join(freq)
selected.sort_values('ormerod_freq').head(10)

### Missing in OTU-1 and OTU-7

In [None]:
# Functions with zero counts in OTU-1.vA, OTU-1.vB and OTU-7.vA, sorted by
# descending `ormerod_freq`.
not_in_otu1_va = data.loc['OTU-1.vA'] == 0
not_in_otu1_vb = data.loc['OTU-1.vB'] == 0
not_in_otu7_va = data.loc['OTU-7.vA'] == 0
selected = function.loc[not_in_otu1_va & not_in_otu1_vb & not_in_otu7_va].join(freq)
selected.sort_values('ormerod_freq', ascending=False).head(20)

# CAZy De Novo Clusters

In [None]:
# The count file spells the MAG ids as 'OtuNNNN.vX'; map them to the 'OTU-N.vX'
# form used as row labels elsewhere in this notebook.
otu_renames = {
    'Otu0001.vA': 'OTU-1.vA',
    'Otu0001.vB': 'OTU-1.vB',
    'Otu0004.vA': 'OTU-4.vA',
    'Otu0005.vA': 'OTU-5.vA',
    'Otu0007.vA': 'OTU-7.vA',
    'Otu0009.vA': 'OTU-9.vA',
    'Otu0017.vA': 'OTU-17.vA',
    'Otu0049.vA': 'OTU-49.vA',
}

# Long-format (mag_id, func_id, tally) rows pivoted into a genome x function
# table; combinations that are not listed get 0.
tallies = pd.read_table(
    'res/core.a.mags.muri.dbCAN-hits.denovo-clust.count.tsv',
    names=['mag_id', 'func_id', 'tally'],
    index_col=['mag_id', 'func_id'],
)
data = tallies['tally'].unstack(fill_value=0).rename(index=otu_renames)


In [None]:
# Column-less frame indexed by func_id; later cells join per-feature tables onto it.
function = pd.DataFrame(data={}, index=data.columns)

In [None]:
# Total tally per row (mag_id), largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each strain group, the fraction of its rows in which a feature has a
# non-zero tally.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()

# One column per group, in the display order used throughout the notebook.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed tallies over every column of `data`.
# The model is fit on the rows left after dropping `muri_strains`; all rows
# (including the dropped ones) are then projected onto it.
feats = list(data.columns)
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0) over the features shared by
# `function.index` and `data.columns`.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
feats = [func_id for func_id in data.columns if func_id in function.index]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# 3-D view of the presence/absence (tally > 0) PCA over the features shared
# by `function.index` and `data.columns`.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
feats = [func_id for func_id in data.columns if func_id in function.index]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Import kept although the name is unused: older matplotlib releases need it
# for projection='3d' to be available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')

comps = 'PC0', 'PC1', 'PC2'
ax.scatter(xs=ordin[comps[0]], ys=ordin[comps[1]], zs=ordin[comps[2]], color='k', s=5)
# Draw a vertical line from z=baselevel up to each scatter point.
baselevel = -10
for _, drow in ordin.iterrows():
    ax.plot([drow[comps[0]], drow[comps[0]]],
            [drow[comps[1]], drow[comps[1]]],
            [baselevel, drow[comps[2]]],
            c='k', lw=0.5)

for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.text(row[comps[0]], row[comps[1]], row[comps[2]], mag_id, color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))
ax.set_zlabel('{} ({})'.format(comps[2], prop_explained[comps[2]]))

compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Rank loadings by their norm over all three plotted components; the earlier
# PC0/PC1-only norm ignored the axis this plot adds.
magnitude = np.sqrt((compon[list(comps)] ** 2).sum(axis=1))
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.text(row[comps[0]] * scale, row[comps[1]] * scale, row[comps[2]] * scale, func_id, weight='bold', alpha=0.5)
ax.scatter([0], [0], [0], marker='x', color='k')  # mark the origin

## Common Clusters

In [None]:
# Features are kept by the cells below only when their all_freq exceeds this value.
freq_thresh = 0.1

### Abundance

In [None]:
# PCA of square-root-transformed tallies, restricted to features whose
# all_freq exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0), restricted to features whose all_freq
# exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# All loadings ordered by their norm in the comps[0]/comps[1] plane; the
# first twenty are labelled on the plot.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons[:20].iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# 3-D view of the presence/absence (tally > 0) PCA, restricted to features
# whose all_freq exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Import kept although the name is unused: older matplotlib releases need it
# for projection='3d' to be available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')

comps = 'PC0', 'PC1', 'PC2'
ax.scatter(xs=ordin[comps[0]], ys=ordin[comps[1]], zs=ordin[comps[2]], color='k', s=5)
# Draw a vertical line from z=baselevel up to each scatter point.
baselevel = -10
for _, drow in ordin.iterrows():
    ax.plot([drow[comps[0]], drow[comps[0]]],
            [drow[comps[1]], drow[comps[1]]],
            [baselevel, drow[comps[2]]],
            c='k', lw=0.5)

for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.text(row[comps[0]], row[comps[1]], row[comps[2]], mag_id, color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))
ax.set_zlabel('{} ({})'.format(comps[2], prop_explained[comps[2]]))

compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Rank loadings by their norm over all three plotted components; the earlier
# PC0/PC1-only norm ignored the axis this plot adds.
magnitude = np.sqrt((compon[list(comps)] ** 2).sum(axis=1))
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.text(row[comps[0]] * scale, row[comps[1]] * scale, row[comps[2]] * scale, func_id, weight='bold', alpha=0.5)
ax.scatter([0], [0], [0], marker='x', color='k')  # mark the origin

## Frequencies

In [None]:
# Jittered scatter of per-feature frequencies: ormerod_freq on x, muri_freq
# on y, coloured by muri_freq - ormerod_freq.
spread_x = 0.02
spread_y = 0.03
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
# Uniform noise within +/- spread is added to each axis (x is drawn first).
a['x'] = a['ormerod_freq'] + np.random.uniform(low=-spread_x, high=spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(low=-spread_y, high=spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# Strip plot of ormerod_freq against muri_freq. x and y are passed by keyword:
# positional x/y is deprecated in seaborn 0.11 and rejected by later releases,
# where the first positional parameter is `data`.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in both OTU-1 (both strains) and OTU-7

In [None]:
# Features with a positive tally in OTU-1.vA, OTU-1.vB and OTU-7.vA;
# first twenty rows after sorting by ascending muri_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='muri_freq')
    .head(20)
)

### Present in OTU-1.vA but not OTU-1.vB

In [None]:
# Features with a positive tally in OTU-1.vA and a zero tally in OTU-1.vB,
# sorted by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
)

### Present in OTU-1.vB but not OTU-1.vA

In [None]:
# Features with a zero tally in OTU-1.vA and a positive tally in OTU-1.vB,
# sorted by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Features with a positive tally in OTU-1.vA and OTU-7.vA but a zero tally in
# OTU-1.vB; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Present in OTU-1-UM and OTU-7

In [None]:
# Features with a positive tally in both OTU-1.vA and OTU-7.vA (OTU-1.vB is
# not tested); first ten rows after sorting by ascending muri_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='muri_freq')
    .head(10)
)

### Missing in OTU-1-UM and OTU-7 but present in OTU-1-UT

In [None]:
# Features with a positive tally in OTU-1.vB but a zero tally in both
# OTU-1.vA and OTU-7.vA; top ten by descending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq', ascending=False)
    .head(10)
)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Features with a zero tally in OTU-1.vA but a positive tally in both
# OTU-1.vB and OTU-7.vA, sorted by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
)


### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Features with a positive tally in both OTU-1.vA and OTU-1.vB but a zero
# tally in OTU-7.vA; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Features with a zero tally in both OTU-1.vA and OTU-1.vB but a positive
# tally in OTU-7.vA; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Missing in OTU-1 and OTU-7

In [None]:
# Features with a zero tally in OTU-1.vA, OTU-1.vB and OTU-7.vA alike;
# top twenty by descending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq', ascending=False)
    .head(20)
)

# CAZy Domain Structures

In [None]:
# Read the long-format (mag_id, func_id, tally) table and pivot it into a
# mag_id x func_id matrix, filling missing combinations with 0. The row
# labels of the listed OTUs are then renamed to their short form.
data = (
    pd.read_table(
        'res/core.a.mags.muri.dbCAN-hits.domain-clust.count.tsv',
        names=['mag_id', 'func_id', 'tally'],
        index_col=['mag_id', 'func_id'],
    )
    ['tally']
    .unstack(fill_value=0)
    .rename(index={
        'Otu0001.vA': 'OTU-1.vA',
        'Otu0001.vB': 'OTU-1.vB',
        'Otu0004.vA': 'OTU-4.vA',
        'Otu0005.vA': 'OTU-5.vA',
        'Otu0007.vA': 'OTU-7.vA',
        'Otu0009.vA': 'OTU-9.vA',
        'Otu0017.vA': 'OTU-17.vA',
        'Otu0049.vA': 'OTU-49.vA',
    })
)


In [None]:
# Column-less frame indexed by func_id; later cells join per-feature tables onto it.
function = pd.DataFrame(data={}, index=data.columns)

In [None]:
# Total tally per row (mag_id), largest first.
data.sum(axis=1).sort_values(ascending=False)

In [None]:
# For each strain group, the fraction of its rows in which a feature has a
# non-zero tally.
freq_in_ormerod_strains = data.loc[ormerod_strains].gt(0).mean()
freq_in_plant_strains = data.loc[plant_strain].gt(0).mean()
freq_in_host_strains = data.loc[host_strain].gt(0).mean()
freq_in_starch_strains = data.loc[starch_strain].gt(0).mean()
freq_in_muri_strains = data.loc[muri_strains].gt(0).mean()
freq_in_all = data.loc[muri_strains + ormerod_strains].gt(0).mean()

# One column per group, in the display order used throughout the notebook.
freq = pd.DataFrame(
    {
        'all_freq': freq_in_all,
        'muri_freq': freq_in_muri_strains,
        'ormerod_freq': freq_in_ormerod_strains,
        'plant_freq': freq_in_plant_strains,
        'host_freq': freq_in_host_strains,
        'starch_freq': freq_in_starch_strains,
    },
    columns=['all_freq', 'muri_freq', 'ormerod_freq', 'plant_freq', 'host_freq', 'starch_freq'],
)

## All Categories

### Abundance

In [None]:
# PCA of square-root-transformed tallies over the features shared by
# `function.index` and `data.columns`.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
feats = [func_id for func_id in data.columns if func_id in function.index]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 10  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0) over the features shared by
# `function.index` and `data.columns`.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
feats = [func_id for func_id in data.columns if func_id in function.index]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# 3-D view of the presence/absence (tally > 0) PCA over the features shared
# by `function.index` and `data.columns`.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
feats = [func_id for func_id in data.columns if func_id in function.index]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Import kept although the name is unused: older matplotlib releases need it
# for projection='3d' to be available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')

comps = 'PC0', 'PC1', 'PC2'
ax.scatter(xs=ordin[comps[0]], ys=ordin[comps[1]], zs=ordin[comps[2]], color='k', s=5)
# Draw a vertical line from z=baselevel up to each scatter point.
baselevel = -10
for _, drow in ordin.iterrows():
    ax.plot([drow[comps[0]], drow[comps[0]]],
            [drow[comps[1]], drow[comps[1]]],
            [baselevel, drow[comps[2]]],
            c='k', lw=0.5)

for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.text(row[comps[0]], row[comps[1]], row[comps[2]], mag_id, color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))
ax.set_zlabel('{} ({})'.format(comps[2], prop_explained[comps[2]]))

compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Rank loadings by their norm over all three plotted components; the earlier
# PC0/PC1-only norm ignored the axis this plot adds.
magnitude = np.sqrt((compon[list(comps)] ** 2).sum(axis=1))
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.text(row[comps[0]] * scale, row[comps[1]] * scale, row[comps[2]] * scale, func_id, weight='bold', alpha=0.5)
ax.scatter([0], [0], [0], marker='x', color='k')  # mark the origin

## Common Clusters

In [None]:
# Features are kept by the cells below only when their all_freq exceeds this value.
freq_thresh = 0.1

### Abundance

In [None]:
# PCA of square-root-transformed tallies, restricted to features whose
# all_freq exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats].apply(np.sqrt)

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

### Presence/Absence

In [None]:
# PCA of presence/absence (tally > 0), restricted to features whose all_freq
# exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# comps[0]/comps[1] give the x/y position; comps[2] sets the marker grey level.
comps = 'PC0', 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, c=ordin[comps[2]], s=30, lw=1, edgecolors='k', cmap='Greys')
for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Label the ten features with the largest loading norm in the comps[0]/comps[1] plane.
compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.annotate(func_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')  # mark the origin

# Ten highest-magnitude features as a table.
function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# 3-D view of the presence/absence (tally > 0) PCA, restricted to features
# whose all_freq exceeds freq_thresh.
# Features are taken in `data.columns` order: the previous set intersection
# had no stable ordering, so the column order could change between sessions.
common_ids = freq.index[freq.all_freq > freq_thresh]
feats = [func_id for func_id in data.columns if func_id in common_ids]
d = data.loc[ormerod_strains + muri_strains, feats] > 0

# Fit on the rows left after dropping `muri_strains`, then project every row.
d_train = d.drop(muri_strains)
fit = PCA().fit(d_train)
values = fit.transform(d)
ordin = pd.DataFrame(values, index=d.index).rename(lambda i: 'PC{}'.format(i), axis='columns')
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=ordin.columns)

# Import kept although the name is unused: older matplotlib releases need it
# for projection='3d' to be available.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')

comps = 'PC0', 'PC1', 'PC2'
ax.scatter(xs=ordin[comps[0]], ys=ordin[comps[1]], zs=ordin[comps[2]], color='k', s=5)
# Draw a vertical line from z=baselevel up to each scatter point.
baselevel = -10
for _, drow in ordin.iterrows():
    ax.plot([drow[comps[0]], drow[comps[0]]],
            [drow[comps[1]], drow[comps[1]]],
            [baselevel, drow[comps[2]]],
            c='k', lw=0.5)

for mag_id, row in ordin.iterrows():
    # Rows with an entry in ormerod_desig are coloured via `palette`; the rest are black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.text(row[comps[0]], row[comps[1]], row[comps[2]], mag_id, color=color)
# Axis labels do not depend on the sample, so set them once rather than per row.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))
ax.set_zlabel('{} ({})'.format(comps[2], prop_explained[comps[2]]))

compon = pd.DataFrame(fit.components_.T, index=feats).rename(lambda i: 'PC{}'.format(i), axis='columns')
# Rank loadings by their norm over all three plotted components; the earlier
# PC0/PC1-only norm ignored the axis this plot adds.
magnitude = np.sqrt((compon[list(comps)] ** 2).sum(axis=1))
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 20  # multiplier applied to the loadings before placing their labels
for func_id, row in important_compons.iterrows():
    ax.text(row[comps[0]] * scale, row[comps[1]] * scale, row[comps[2]] * scale, func_id, weight='bold', alpha=0.5)
ax.scatter([0], [0], [0], marker='x', color='k')  # mark the origin

## Frequencies

In [None]:
# Jittered scatter of per-feature frequencies: ormerod_freq on x, muri_freq
# on y, coloured by muri_freq - ormerod_freq.
spread_x = 0.02
spread_y = 0.03
a = function.join(freq)
a['diff'] = a['muri_freq'] - a['ormerod_freq']
# Uniform noise within +/- spread is added to each axis (x is drawn first).
a['x'] = a['ormerod_freq'] + np.random.uniform(low=-spread_x, high=spread_x, size=len(a))
a['y'] = a['muri_freq'] + np.random.uniform(low=-spread_y, high=spread_y, size=len(a))
plt.scatter('x', 'y', data=a, c='diff', alpha=0.5)

In [None]:
# Strip plot of ormerod_freq against muri_freq. x and y are passed by keyword:
# positional x/y is deprecated in seaborn 0.11 and rejected by later releases,
# where the first positional parameter is `data`.
sns.stripplot(x='muri_freq', y='ormerod_freq', data=freq, jitter=True, alpha=0.2)

### Present in both OTU-1 and OTU-7

In [None]:
# Features with a positive tally in both OTU-1.vA and OTU-7.vA;
# first twenty rows after sorting by ascending muri_freq.
# NOTE(review): only OTU-1.vA is tested here, whereas the matching cell in the
# de novo cluster section also requires OTU-1.vB > 0 -- confirm which is intended.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='muri_freq')
    .head(20)
)

### Present in OTU-1.vA but not OTU-1.vB

In [None]:
# Features with a positive tally in OTU-1.vA and a zero tally in OTU-1.vB,
# sorted by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Features with a positive tally in OTU-1.vA and OTU-7.vA but a zero tally in
# OTU-1.vB; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Present in OTU-1-UM and OTU-7

In [None]:
# Features with a positive tally in both OTU-1.vA and OTU-7.vA (OTU-1.vB is
# not tested); first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Missing in OTU-1-UM and OTU-7 but present in OTU-1-UT

In [None]:
# Features with a positive tally in OTU-1.vB but a zero tally in both
# OTU-1.vA and OTU-7.vA; top ten by descending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq', ascending=False)
    .head(10)
)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Features with a zero tally in OTU-1.vA but a positive tally in both
# OTU-1.vB and OTU-7.vA, sorted by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
)


### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Features with a positive tally in both OTU-1.vA and OTU-1.vB but a zero
# tally in OTU-7.vA; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].gt(0)
         & data.loc['OTU-1.vB'].gt(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Features with a zero tally in both OTU-1.vA and OTU-1.vB but a positive
# tally in OTU-7.vA; first ten rows after sorting by ascending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].gt(0)]
    .join(freq)
    .sort_values(by='ormerod_freq')
    .head(10)
)

### Missing in OTU-1 and OTU-7

In [None]:
# Features with a zero tally in OTU-1.vA, OTU-1.vB and OTU-7.vA alike;
# top twenty by descending ormerod_freq.
(
    function
    .loc[data.loc['OTU-1.vA'].eq(0)
         & data.loc['OTU-1.vB'].eq(0)
         & data.loc['OTU-7.vA'].eq(0)]
    .join(freq)
    .sort_values(by='ormerod_freq', ascending=False)
    .head(20)
)