# Preamble

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns

In [None]:
def pcoa_t(counts, diss='jaccard', return_prop_explained=False):
    """Ordinate the rows of `counts` by principal coordinates analysis.

    Parameters
    ----------
    counts : pandas.DataFrame
        Table whose rows are ordinated; `counts.values` is handed to
        `beta_diversity` and `counts.index` supplies the ids.
    diss : str
        Name of the dissimilarity metric passed to `beta_diversity`.
    return_prop_explained : bool
        When true, also return the proportion explained by each axis.

    Returns
    -------
    The `samples` table of the PCoA result, or a `(samples, proportions)`
    tuple where `proportions` is a Series indexed by the axis names.
    """
    distances = beta_diversity(diss, counts.values, ids=counts.index)
    ordination = pcoa(distances)
    coords = ordination.samples
    if not return_prop_explained:
        return coords
    explained = pd.Series(ordination.proportion_explained, index=coords.columns)
    return coords, explained

def pca_t(counts, return_prop_explained=False):
    """Ordinate the rows of `counts` by principal component analysis.

    Components are named 'PC1', 'PC2', ... in order of decreasing explained
    variance, so 'PC1' is the first component. This matches the axis naming
    returned by `pcoa_t`, letting callers select axes by the same names for
    either ordination.

    Parameters
    ----------
    counts : pandas.DataFrame
        Table to decompose; its index labels the rows of the result.
    return_prop_explained : bool
        When true, also return each component's share of the total
        explained variance.

    Returns
    -------
    A DataFrame of component scores (rows follow `counts.index`), or a
    `(scores, proportions)` tuple where `proportions` is a Series indexed
    by the component names.
    """
    fit = PCA().fit(counts)
    values = fit.transform(counts)
    # Number from 1: the previous 0-based names made 'PC1' the *second* component.
    names = ['PC{}'.format(i + 1) for i in range(values.shape[1])]
    out = pd.DataFrame(values, index=counts.index, columns=names)
    if return_prop_explained:
        prop = fit.explained_variance_ / fit.explained_variance_.sum()
        return out, pd.Series(prop, index=out.columns)
    return out

In [None]:
# Category label ('plant', 'host' or 'starch') for each reference genome id.
# The labels are looked up in `palette` to colour the ordination annotations.
# Each key appears exactly once ('H3' used to be listed twice).
ormerod_desig = {
    'GP4': 'plant',
    'GP3': 'plant',
    'H7': 'plant',
    'K1': 'plant',
    'M13': 'plant',
    'GP1': 'plant',
    'M2': 'plant',
    'M8': 'plant',
    'M1': 'plant',
    'M12': 'plant',
    'H5': 'plant',
    'Homeothermus_arabinoxylanisolvens': 'plant',
    'GP2': 'host',
    'M9': 'host',
    'M14': 'host',
    'H6': 'host',
    'M5': 'host',
    'M6': 'starch',
    'M11': 'starch',
    'H2': 'starch',
    'H4': 'starch',
    'H10': 'starch',
    'M10': 'starch',
    'H3': 'starch',
    'H8': 'starch',
    'H9': 'starch',
    'H1': 'starch',
    'M7': 'starch',
    'K10': 'starch',
    'M3': 'starch',
}

# Taxon label for each MAG id, written taxon-by-taxon. The insertion order of
# the resulting dict is the order in which the ids are listed here.
mag_desig = {
    mag_id: taxon
    for taxon, mag_ids in [
        ('Sphingopyxis', ['OTU-3']),
        ('Lactobacillus', ['OTU-2']),
        ('Muribaculaceae', ['OTU-1-UM', 'OTU-1-UT', 'OTU-7', 'OTU-9',
                            'OTU-5', 'OTU-4', 'OTU-49', 'OTU-17']),
        ('Lachnospiraceae', ['OTU-15', 'OTU-25', 'OTU-32']),
        ('Ruminococcaceae', ['OTU-12', 'OTU-20-UM', 'OTU-20-UT', 'OTU-35']),
        ('Erysipelotrichaceae', ['OTU-6']),
        ('Mollicutes', ['OTU-58']),
        ('Bacteroides', ['OTU-41']),
    ]
    for mag_id in mag_ids
}

In [None]:
# Plot colour for each category label used in `ormerod_desig`.
palette = dict(starch='blue', host='purple', plant='green')

# COGs

In [None]:
# Tab-separated COG reference table, indexed by its `cog_id` column.
cog_function = pd.read_csv('ref/cog_function.tsv', sep='\t', index_col='cog_id')

In [None]:
import os

# Read every per-MAG COG annotation table. Each file is parsed as two
# tab-separated columns (orf_id, cog) and tagged with the MAG id, which is the
# file name up to its first '.'.
all_frames = []
for filepath in glob('res/core.a.mags.annot.d/*.cog.tsv') + glob('res/ref.mags.annot.d/*.cog.tsv'):
    frame = pd.read_table(filepath, names=['orf_id', 'cog'])
    # basename() instead of split('/') so the id is also right when glob
    # returns paths with the platform's own separator.
    frame['mag_id'] = os.path.basename(filepath).split('.')[0]
    all_frames.append(frame)

# MAG x COG table: number of annotated rows per (mag_id, cog_id) pair, with 0
# for pairs that never occur. size() counts group rows directly, and
# fill_value=0 keeps the table integer without a float round-trip.
cogdata = (pd.concat(all_frames, ignore_index=True)
             .rename(columns={'cog': 'cog_id'})
             .groupby(['mag_id', 'cog_id'])
             .size()
             .unstack('cog_id', fill_value=0)
             .astype(int)
          )

In [None]:
# Give every COG of the reference table a column, filling the ones never seen
# in any MAG with 0. A single reindex keeps the existing columns first and
# appends the missing ids in sorted order, so the column order is reproducible
# and the frame is not fragmented by thousands of single-column inserts.
missing_cogs = cog_function.index.difference(cogdata.columns)
cogdata = cogdata.reindex(columns=cogdata.columns.append(missing_cogs), fill_value=0)

In [None]:
# Reference genomes are recognised by id: an M, GP, H or K prefix directly
# followed by a digit. The group is non-capturing because str.contains warns
# about patterns with capture groups.
is_ormerod = cogdata.index.str.contains(r'^(?:M|GP|H|K)[0-9]')
ormerod_strains = list(cogdata.index[is_ormerod])
# MAGs labelled 'Muribaculaceae', in the order they appear in mag_desig.
muri_strains = [mag_id for mag_id, taxon in mag_desig.items() if taxon == 'Muribaculaceae']

In [None]:
# Per-COG mean of the counts over the reference strains, as a Series named 'freq'.
cog_freq_in_ormerod_strains = cogdata.loc[ormerod_strains].mean().rename('freq')

## Abundance of Carbohydrate COGs


In [None]:
# Features: COGs whose function categories include 'G' and that have a column
# in cogdata. Sorted so the column order is reproducible; na=False treats COGs
# without a category as non-matching instead of failing on the mask.
is_carb = cog_function.function_categories.str.contains('G', na=False)
feats = sorted(set(cog_function.index[is_carb]) & set(cogdata.columns))
d = cogdata.loc[ormerod_strains + muri_strains, feats]

# One PCA on the square-root-transformed counts of all selected strains
# provides BOTH the plotted scores and the annotated loadings, so the arrows
# describe the same axes as the points. Components are numbered from 1, making
# 'PC1'/'PC2' the two leading components.
# NOTE(review): the previous version also fitted a PCA on the untransformed
# reference strains only (d.drop(muri_strains)) and took the loadings from it,
# while plotting the scores of this sqrt/all-strains fit. If projecting the
# MAGs onto reference-only axes is the intended analysis, fit on that subset
# here instead - confirm.
d_sqrt = np.sqrt(d)
fit = PCA().fit(d_sqrt)
values = fit.transform(d_sqrt)
pc_names = ['PC{}'.format(i + 1) for i in range(values.shape[1])]
ordin = pd.DataFrame(values, index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Reference genomes are coloured by category; everything else is black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels are set once, after the loop, rather than once per sample.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings of every feature on each component, and their length in the
# plotted plane; the ten longest are drawn (stretched by `scale`) on the plot.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 3
for cog_id, row in important_compons.iterrows():
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Clustered heatmap of the square-root counts (features as rows), leaving out
# features whose column total is zero.
nonzero_feats = d.loc[:, d.sum() != 0]
sns.clustermap(np.sqrt(nonzero_feats.T), robust=True, figsize=(10, 10), vmin=0)

## Abundance of Ormerod COGs (mostly carbs)

In [None]:
# Features: the hand-picked COGs below that have a column in cogdata, kept in
# the listed order so the column order is reproducible.
ormerod_cogs = ['COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119']
feats = [cog_id for cog_id in ormerod_cogs if cog_id in cogdata.columns]
d = cogdata.loc[ormerod_strains + muri_strains, feats]

# One PCA on the square-root-transformed counts of all selected strains
# provides BOTH the plotted scores and the annotated loadings, so the arrows
# describe the same axes as the points. Components are numbered from 1, making
# 'PC1'/'PC2' the two leading components.
# NOTE(review): the previous version also fitted a PCA on the untransformed
# reference strains only (d.drop(muri_strains)) and took the loadings from it,
# while plotting the scores of this sqrt/all-strains fit. If projecting the
# MAGs onto reference-only axes is the intended analysis, fit on that subset
# here instead - confirm.
d_sqrt = np.sqrt(d)
fit = PCA().fit(d_sqrt)
values = fit.transform(d_sqrt)
pc_names = ['PC{}'.format(i + 1) for i in range(values.shape[1])]
ordin = pd.DataFrame(values, index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Reference genomes are coloured by category; everything else is black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels are set once, after the loop, rather than once per sample.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings of every feature on each component, and their length in the
# plotted plane; the ten longest are drawn (stretched by `scale`) on the plot.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 3
for cog_id, row in important_compons.iterrows():
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

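## Abundance of XXX COGs (placeholder title: the cell below still selects function category 'G', the same carbohydrate COGs as above)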

In [None]:
# Features: COGs whose function categories include 'G' and that have a column
# in cogdata. Sorted so the column order is reproducible; na=False treats COGs
# without a category as non-matching instead of failing on the mask.
# NOTE(review): this is the same category filter as the carbohydrate cell
# earlier in the notebook - change the letter once the intended category for
# this section is decided.
in_category = cog_function.function_categories.str.contains('G', na=False)
feats = sorted(set(cog_function.index[in_category]) & set(cogdata.columns))
d = cogdata.loc[ormerod_strains + muri_strains, feats]

# One PCA on the square-root-transformed counts of all selected strains
# provides BOTH the plotted scores and the annotated loadings, so the arrows
# describe the same axes as the points. Components are numbered from 1, making
# 'PC1'/'PC2' the two leading components.
# NOTE(review): the previous version also fitted a PCA on the untransformed
# reference strains only (d.drop(muri_strains)) and took the loadings from it,
# while plotting the scores of this sqrt/all-strains fit. If projecting the
# MAGs onto reference-only axes is the intended analysis, fit on that subset
# here instead - confirm.
d_sqrt = np.sqrt(d)
fit = PCA().fit(d_sqrt)
values = fit.transform(d_sqrt)
pc_names = ['PC{}'.format(i + 1) for i in range(values.shape[1])]
ordin = pd.DataFrame(values, index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Reference genomes are coloured by category; everything else is black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels are set once, after the loop, rather than once per sample.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings of every feature on each component, and their length in the
# plotted plane; the ten longest are drawn (stretched by `scale`) on the plot.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 3
for cog_id, row in important_compons.iterrows():
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

## COG Frequencies in Ormerod Strains

### Present in both OTU-1-UM and OTU-7


In [None]:
# COGs with a non-zero count in both OTU-1-UM and OTU-7, annotated with their
# mean count over the reference strains; the ten lowest 'freq' values are shown.
in_um_and_7 = (cogdata.loc['OTU-1-UM'] > 0) & (cogdata.loc['OTU-7'] > 0)
shared_cogs = cog_function.loc[in_um_and_7].join(cog_freq_in_ormerod_strains)
shared_cogs.sort_values('freq').head(10)

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# COGs with a non-zero count in OTU-1-UM and OTU-7 but a zero count in
# OTU-1-UT, annotated with their mean count over the reference strains; the
# ten lowest 'freq' values are shown.
# The OTU-1-UT condition must be `== 0`: testing `> 0` selected COGs present in
# all three MAGs, the opposite of what this section reports.
(cog_function.loc[(cogdata.loc['OTU-1-UM'] > 0) &
                  (cogdata.loc['OTU-7'] > 0) &
                  (cogdata.loc['OTU-1-UT'] == 0)]
             .join(cog_freq_in_ormerod_strains)
             .sort_values('freq')
).head(10)

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# COGs with a non-zero count in OTU-1-UT and OTU-7 but a zero count in
# OTU-1-UM, sorted by their mean count over the reference strains.
# (To narrow this to one functional category, additionally filter on
# function_categories, e.g. those containing 'G'.)
in_ut_and_7_not_um = (
    (cogdata.loc['OTU-1-UT'] > 0)
    & (cogdata.loc['OTU-7'] > 0)
    & (cogdata.loc['OTU-1-UM'] == 0)
)
cog_function.loc[in_ut_and_7_not_um].join(cog_freq_in_ormerod_strains).sort_values('freq')

### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# COGs whose summed count over the two OTU-1 MAGs is non-zero (i.e. present in
# at least one of them) while the OTU-7 count is zero, sorted by their mean
# count over the reference strains.
# NOTE(review): the pathway section with the same title requires presence in
# BOTH OTU-1 MAGs; confirm which definition is intended.
otu1_total = cogdata.loc['OTU-1-UT'] + cogdata.loc['OTU-1-UM']
in_otu1_not_7 = (otu1_total > 0) & (cogdata.loc['OTU-7'] == 0)
cog_function.loc[in_otu1_not_7].join(cog_freq_in_ormerod_strains).sort_values('freq')

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# COGs with a non-zero count in OTU-7 and zero counts in both OTU-1 MAGs.
# Rows of the annotated table containing any missing value are dropped before
# the mask is applied; the five lowest 'freq' values are shown.
annotated_cogs = cog_function.join(cog_freq_in_ormerod_strains).dropna()
only_in_otu7 = (
    (cogdata.loc['OTU-1-UM'] == 0)
    & (cogdata.loc['OTU-1-UT'] == 0)
    & (cogdata.loc['OTU-7'] > 0)
)
annotated_cogs[only_in_otu7].sort_values('freq').head()

# MinPath to MetaCyc pathways

In [None]:
# Header-less, tab-separated table of pathway descriptions, indexed by pathway id.
path_function = pd.read_csv(
    'ref/metacyc_pathway_descriptions.tsv',
    sep='\t',
    names=['metacyc_id', 'description'],
    index_col='metacyc_id',
)

In [None]:
import os
from itertools import chain


def load_minpath_presence(filepaths):
    """Read pathway list files into a {genome: {pathway_id: 1}} mapping.

    Each file contributes one genome, named by the file name up to its first
    '.'. Every non-blank line, stripped of surrounding whitespace, is taken as
    one pathway id. Blank lines are skipped so they cannot create an
    empty-string pathway. A file without any pathway still yields an (empty)
    entry, so its genome is kept in the final table.
    """
    presence = {}
    for filepath in filepaths:
        # basename() rather than split('/') so the name is also right when
        # glob returns paths with the platform's own separator.
        genome_name = os.path.basename(filepath).split('.')[0]
        with open(filepath) as handle:
            stripped = (line.strip() for line in handle)
            presence[genome_name] = {pathname: 1 for pathname in stripped if pathname}
    return presence


rawpathdata = load_minpath_presence(chain(glob('res/core.a.mags.annot.d/*.minpath.list'),
                                          glob('res/ref.mags.annot.d/*.minpath.list')))

# Genome x pathway presence/absence table (1 = listed for that genome, else 0).
pathdata = pd.DataFrame(rawpathdata).fillna(0).astype(int).T
pathdata.index.name = 'mag_id'
pathdata.columns.name = 'metacyc_id'

In [None]:
# Number of pathways recorded for each genome, in ascending order.
pathdata.sum(axis='columns').sort_values()

In [None]:
# Per-pathway mean of the presence values over the reference strains, as a
# Series named 'freq'.
path_freq_in_ormerod_strains = pathdata.loc[ormerod_strains].mean().rename('freq')

## Abundance of All Pathways

In [None]:
# All pathways for the reference strains plus the Muribaculaceae MAGs.
d = pathdata.loc[ormerod_strains + muri_strains]
feats = d.columns

# One PCA on the square-root-transformed table of all selected strains
# provides BOTH the plotted scores and the annotated loadings, so the arrows
# describe the same axes as the points. Components are numbered from 1, making
# 'PC1'/'PC2' the two leading components.
# NOTE(review): the previous version also fitted a PCA on the untransformed
# reference strains only (d.drop(muri_strains)) and took the loadings from it,
# while plotting the scores of this sqrt/all-strains fit. If projecting the
# MAGs onto reference-only axes is the intended analysis, fit on that subset
# here instead - confirm.
d_sqrt = np.sqrt(d)
fit = PCA().fit(d_sqrt)
values = fit.transform(d_sqrt)
pc_names = ['PC{}'.format(i + 1) for i in range(values.shape[1])]
ordin = pd.DataFrame(values, index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin, color='k', s=5)
for mag_id, row in ordin.iterrows():
    # Reference genomes are coloured by category; everything else is black.
    color = palette[ormerod_desig[mag_id]] if mag_id in ormerod_desig else 'k'
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels are set once, after the loop, rather than once per sample.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings of every pathway on each component, and their length in the
# plotted plane; the ten longest are drawn (stretched by `scale`) on the plot.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2)
magnitude.name = 'magnitude'
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]

scale = 10
for path_id, row in important_compons.iterrows():
    ax.annotate(path_id, (row[comps[0]] * scale, row[comps[1]] * scale), weight='bold', alpha=0.5)
ax.scatter([0], [0], marker='x', color='k')

path_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Clustered heatmap of the square-root values (features as rows), leaving out
# features whose column total is zero.
nonzero_feats = d.loc[:, d.sum() != 0]
sns.clustermap(np.sqrt(nonzero_feats.T), robust=True, figsize=(10, 10), vmin=0)

In [None]:
# Pathways flagged for OTU-1-UM and OTU-7 but not for OTU-1-UT, joined with
# their descriptions. The flags are combined bitwise on the integer table and
# the result is cast to bool to serve as the row mask.
otu_flags = pathdata.loc[['OTU-1-UT', 'OTU-1-UM', 'OTU-7']].T
um_and_7_not_ut = (otu_flags['OTU-1-UM'] &
                   otu_flags['OTU-7'] &
                   ~otu_flags['OTU-1-UT']).astype(bool)
otu_flags[um_and_7_not_ut].join(path_function)

## Path Frequencies in Ormerod Strains

### Present in both OTU-1-UM and OTU-7


In [None]:
# Pathways present in both OTU-1-UM and OTU-7, annotated with their mean
# presence over the reference strains (rows with missing values dropped); the
# five lowest 'freq' values are shown.
annotated_paths = path_function.join(path_freq_in_ormerod_strains).dropna()
in_um_and_7 = (pathdata.loc['OTU-1-UM'] > 0) & (pathdata.loc['OTU-7'] > 0)
annotated_paths[in_um_and_7].sort_values('freq').head()

### Present in OTU-1-UM and OTU-7, but not in OTU-1-UT

In [None]:
# Pathways present in OTU-1-UM and OTU-7 but absent from OTU-1-UT, annotated
# with their mean presence over the reference strains (rows with missing
# values dropped); the five lowest 'freq' values are shown.
annotated_paths = path_function.join(path_freq_in_ormerod_strains).dropna()
in_um_and_7_not_ut = (
    (pathdata.loc['OTU-1-UM'] > 0)
    & (pathdata.loc['OTU-7'] > 0)
    & (pathdata.loc['OTU-1-UT'] == 0)
)
annotated_paths[in_um_and_7_not_ut].sort_values('freq').head()

### Present in OTU-1-UT and OTU-7, but not in OTU-1-UM

In [None]:
# Pathways present in OTU-1-UT and OTU-7 but absent from OTU-1-UM, annotated
# with their mean presence over the reference strains (rows with missing
# values dropped); the five lowest 'freq' values are shown.
annotated_paths = path_function.join(path_freq_in_ormerod_strains).dropna()
in_ut_and_7_not_um = (
    (pathdata.loc['OTU-1-UT'] > 0)
    & (pathdata.loc['OTU-7'] > 0)
    & (pathdata.loc['OTU-1-UM'] == 0)
)
annotated_paths[in_ut_and_7_not_um].sort_values('freq').head()

### Present in OTU-1 (both sites) but not OTU-7

In [None]:
# Pathways present in BOTH OTU-1 MAGs but absent from OTU-7, annotated with
# their mean presence over the reference strains (rows with missing values
# dropped); the five lowest 'freq' values are shown.
# NOTE(review): the COG section with the same title accepts presence in
# EITHER OTU-1 MAG; confirm which definition is intended.
annotated_paths = path_function.join(path_freq_in_ormerod_strains).dropna()
in_otu1_not_7 = (
    (pathdata.loc['OTU-1-UM'] > 0)
    & (pathdata.loc['OTU-1-UT'] > 0)
    & (pathdata.loc['OTU-7'] == 0)
)
annotated_paths[in_otu1_not_7].sort_values('freq').head()

### Present in OTU-7 but not OTU-1 (both sites)

In [None]:
# Pathways present in OTU-7 but absent from both OTU-1 MAGs, annotated with
# their mean presence over the reference strains (rows with missing values
# dropped); the five lowest 'freq' values are shown.
annotated_paths = path_function.join(path_freq_in_ormerod_strains).dropna()
only_in_otu7 = (
    (pathdata.loc['OTU-1-UM'] == 0)
    & (pathdata.loc['OTU-1-UT'] == 0)
    & (pathdata.loc['OTU-7'] > 0)
)
annotated_paths[only_in_otu7].sort_values('freq').head()