In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# COG reference table, tab-separated, indexed by its 'cog_id' column.
cog_function = pd.read_csv('ref/cog_function.tsv', sep='\t', index_col='cog_id')

In [None]:
# Build the MAG x COG count matrix `data` (rows: mag_id, columns: cog_id).
# Each '*.cog.tsv' file is a header-less two-column table (ORF id, COG id);
# the MAG id is the file name up to its first '.'.
annotation_paths = (glob('res/core.a.mags.annot.d/*.cog.tsv')
                    + glob('res/ref.mags.annot.d/*.cog.tsv'))

all_frames = [
    pd.read_table(path, names=['orf_id', 'cog_id'])
      .assign(mag_id=path.split('/')[-1].split('.')[0])
    for path in annotation_paths
]

# `.size()` counts rows per (mag_id, cog_id) group without a Python-level
# apply; `fill_value=0` marks absent COGs directly, so the counts never
# round-trip through NaN/float.
data = (pd.concat(all_frames, ignore_index=True)
          .groupby(['mag_id', 'cog_id'])
          .size()
          .unstack('cog_id', fill_value=0)
          .astype(int)
       )

In [None]:
# Give `data` an all-zero column for every COG that is listed in
# `cog_function` but was not observed in any MAG. Existing columns keep their
# order; the new ones are appended in sorted order so the layout is the same
# on every run (iterating a set of strings is not). A single reindex also
# avoids fragmenting the frame with one insert per missing COG.
missing_cogs = sorted(set(cog_function.index) - set(data.columns))
all_cogs = pd.Index([*data.columns, *missing_cogs], name=data.columns.name)
data = data.reindex(columns=all_cogs, fill_value=0)

In [None]:
def pcoa_t(counts, diss='jaccard', return_prop_explained=False):
    """Ordinate the rows of `counts` by PCoA on a beta-diversity matrix.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per sample; the row index supplies the sample ids.
    diss : str
        Dissimilarity metric name handed to `beta_diversity`
        (default 'jaccard').
    return_prop_explained : bool
        When true, also return the proportion explained by each axis.

    Returns
    -------
    The `samples` table of the PCoA result, or the tuple
    `(samples, proportion_explained)` when `return_prop_explained` is true,
    with the proportions as a Series indexed by the coordinate column names.
    """
    distances = beta_diversity(diss, counts.values, ids=counts.index)
    ordination = pcoa(distances)
    coords = ordination.samples
    if not return_prop_explained:
        return coords
    explained = pd.Series(ordination.proportion_explained, index=coords.columns)
    return coords, explained

def pca_t(counts, return_prop_explained=False):
    """Project the rows of `counts` onto their principal components.

    Components are labelled 'PC1', 'PC2', ... (one-based), so that
    'PC1' is the leading component, matching the labels callers in this
    notebook select for both PCoA and PCA ordinations.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per sample, one column per feature; the row index is kept.
    return_prop_explained : bool
        When true, also return each component's share of the explained
        variance.

    Returns
    -------
    A DataFrame of component scores indexed like `counts`, or the tuple
    `(scores, prop_explained)` when `return_prop_explained` is true, with
    `prop_explained` a Series indexed by the component labels.
    """
    fit = PCA().fit(counts)
    values = fit.transform(counts)
    # One-based labels: numbering from zero made 'PC1' refer to the *second*
    # component.
    labels = ['PC{}'.format(i) for i in range(1, values.shape[1] + 1)]
    out = pd.DataFrame(values, index=counts.index, columns=labels)
    if return_prop_explained:
        prop = fit.explained_variance_ / fit.explained_variance_.sum()
        return out, pd.Series(prop, index=out.columns)
    return out

In [None]:
# Strain -> designation ('plant', 'host' or 'starch'), written grouped by
# designation and flattened into the lookup used when colouring plots.
# NOTE(review): 'H3' was entered twice in the starch group of the original
# listing; it is kept once here. Confirm the second entry was not meant to be
# a different strain.
_strains_by_designation = {
    'plant': ['GP4', 'GP3', 'H7', 'K1', 'M13', 'GP1', 'M2', 'M8', 'M1', 'M12',
              'H5', 'Homeothermus_arabinoxylanisolvens'],
    'host': ['GP2', 'M9', 'M14', 'H6', 'M5'],
    'starch': ['M6', 'M11', 'H2', 'H4', 'H10', 'M10', 'H3', 'H8', 'H9', 'H1',
               'M7', 'K10', 'M3'],
}

ormerod_desig = {
    strain: designation
    for designation, strains in _strains_by_designation.items()
    for strain in strains
}

In [None]:
# Euclidean PCoA of every MAG's full COG count profile, with each point
# labelled by MAG id and coloured by its Ormerod designation (black when the
# MAG has none).

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

feats = data.columns
d = data[feats].drop(excluded_mags)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

ordin, prop_explained = pcoa_t(d, return_prop_explained=True, diss='euclidean')

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    # A palette lookup with a black fallback always assigns a colour, so a
    # point can never inherit the colour of the previous MAG.
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)

# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]));

In [None]:
# PCA of square-root-transformed counts for a fixed list of eight COGs, with
# each point labelled by MAG id and coloured by its Ormerod designation
# (black when the MAG has none).

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

feats = ['COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119']
d = data[feats].drop(excluded_mags)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

# Only the square-root ordination was ever plotted; the raw-count PCA that
# used to precede this call was overwritten immediately and has been removed.
ordin, prop_explained = pca_t(np.sqrt(d), return_prop_explained=True)

# Take the two leading components by position rather than by a hard-coded
# label, so the plot shows the first two components whatever numbering
# `pca_t` uses for its column names.
comps = tuple(ordin.columns[:2])
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    # Palette lookup with a black fallback: a colour is always assigned.
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)

# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]));

In [None]:
# Presence Absence of Carbohydrate COGs
#
# PCA biplot: MAG scores on the two leading components, overlaid with the ten
# COGs whose loadings on those components are largest, followed by a table of
# those COGs.

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

# COGs whose function categories include 'G'; sorted so the feature order is
# the same on every run. Rows with a missing category are treated as no match.
is_carb = cog_function.function_categories.str.contains('G', na=False)
feats = sorted(set(cog_function.loc[is_carb].index) & set(data.columns))
d = (data[feats].drop(excluded_mags) > 0).astype(int)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

# One fit provides both the scores and the loadings, so the two layers of the
# biplot describe the same model. The square root previously applied before
# scoring is a no-op on 0/1 data and is omitted.
fit = PCA().fit(d)
# One-based labels, so 'PC1' is the leading component.
pc_names = ['PC{}'.format(i + 1) for i in range(len(fit.components_))]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per COG, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2).rename('magnitude')
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 8  # stretches the loading vectors so their labels are legible among the scores
for cog_id, row in important_compons.iterrows():
    # Fixed colour: these labels must not reuse whatever colour the last MAG
    # in the loop above happened to get.
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), color='grey')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Abundance of Carbohydrate COGs
#
# PCA biplot on square-root-transformed counts: MAG scores on the two leading
# components, overlaid with the ten COGs whose loadings on those components
# are largest, followed by a table of those COGs.

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

# COGs whose function categories include 'G'; sorted so the feature order is
# the same on every run. Rows with a missing category are treated as no match.
is_carb = cog_function.function_categories.str.contains('G', na=False)
feats = sorted(set(cog_function.loc[is_carb].index) & set(data.columns))
d = data[feats].drop(excluded_mags)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

# One fit on the square-root counts provides both the scores and the
# loadings. Previously the scores came from the square-root data while the
# loadings came from a separate fit on raw counts, so the two layers of the
# biplot described different models.
pca_input = np.sqrt(d)
fit = PCA().fit(pca_input)
# One-based labels, so 'PC1' is the leading component.
pc_names = ['PC{}'.format(i + 1) for i in range(len(fit.components_))]
ordin = pd.DataFrame(fit.transform(pca_input), index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per COG, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2).rename('magnitude')
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 8  # stretches the loading vectors so their labels are legible among the scores
for cog_id, row in important_compons.iterrows():
    # Fixed colour: these labels must not reuse whatever colour the last MAG
    # in the loop above happened to get.
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), color='grey')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Abundance of Ormerod COGs (mostly carbs)
#
# PCA biplot on square-root-transformed counts: MAG scores on the two leading
# components, overlaid with the COGs ranked by loading size on those
# components, followed by a table of those COGs.

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

ormerod_cogs = ['COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119']
# Keep only the COGs that are columns of `data`, in the order listed above,
# so the feature order is the same on every run.
available_cogs = set(data.columns)
feats = [cog for cog in ormerod_cogs if cog in available_cogs]
d = data[feats].drop(excluded_mags)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

# One fit on the square-root counts provides both the scores and the
# loadings. Previously the scores came from the square-root data while the
# loadings came from a separate fit on raw counts, so the two layers of the
# biplot described different models.
pca_input = np.sqrt(d)
fit = PCA().fit(pca_input)
# One-based labels, so 'PC1' is the leading component.
pc_names = ['PC{}'.format(i + 1) for i in range(len(fit.components_))]
ordin = pd.DataFrame(fit.transform(pca_input), index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per COG, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2).rename('magnitude')
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 8  # stretches the loading vectors so their labels are legible among the scores
for cog_id, row in important_compons.iterrows():
    # Fixed colour: these labels must not reuse whatever colour the last MAG
    # in the loop above happened to get.
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), color='grey')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Presence/Absence All COGs
#
# PCA biplot: MAG scores on the two leading components, overlaid with the ten
# COGs whose loadings on those components are largest, followed by a table of
# those COGs.

# MAGs left out of the ordination (reason not recorded in this notebook).
excluded_mags = [ 'Otu0001_3'
                , 'Otu0003_1'
                , 'Otu0002_1'
                , 'Otu0012_1'
                , 'Otu0015_1'
                , 'Otu0006_1'
                ]

# Every COG known to both tables; sorted so the feature order is the same on
# every run.
feats = sorted(set(cog_function.index) & set(data.columns))
d = (data[feats].drop(excluded_mags) > 0).astype(int)

palette = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

# One fit provides both the scores and the loadings, so the two layers of the
# biplot describe the same model. The square root previously applied before
# scoring is a no-op on 0/1 data and is omitted.
fit = PCA().fit(d)
# One-based labels, so 'PC1' is the leading component.
pc_names = ['PC{}'.format(i + 1) for i in range(len(fit.components_))]
ordin = pd.DataFrame(fit.transform(d), index=d.index, columns=pc_names)
prop_explained = pd.Series(fit.explained_variance_ / fit.explained_variance_.sum(), index=pc_names)

comps = 'PC1', 'PC2'
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    color = palette.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)
# Axis labels do not depend on the MAG, so they are set once, after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# Loadings: one row per COG, one column per component.
compon = pd.DataFrame(fit.components_.T, index=feats, columns=pc_names)
magnitude = np.sqrt(compon[comps[0]]**2 + compon[comps[1]]**2).rename('magnitude')
important_compons = compon.loc[magnitude.sort_values(ascending=False).head(10).index]
scale = 8  # stretches the loading vectors so their labels are legible among the scores
for cog_id, row in important_compons.iterrows():
    # Fixed colour: these labels must not reuse whatever colour the last MAG
    # in the loop above happened to get.
    ax.annotate(cog_id, (row[comps[0]] * scale, row[comps[1]] * scale), color='grey')

cog_function.join(magnitude).sort_values('magnitude', ascending=False).dropna(subset=['magnitude']).head(10)

In [None]:
# Frequency, among the Ormerod strains, of COGs present in both Otu0001_1 and
# Otu0007_1, restricted to COGs whose function categories include 'E' and
# sorted by that frequency.
# NOTE(review): this heading originally read "OTU1 and OTU4", but the rows
# selected below are 'Otu0001_1' and 'Otu0007_1' -- confirm which is intended.

# Ormerod strains are the rows whose id starts with M, GP, H or K followed by
# a digit (so 'Homeothermus_arabinoxylanisolvens' is not counted). The group
# is non-capturing to avoid pandas' "match groups" warning; it matches the
# same ids.
is_ormerod_strain = data.index.str.contains('^(?:M|GP|H|K)[0-9]')
freq_in_mouse_strains = (data.loc[is_ormerod_strain] > 0).mean().rename('freq')

# Align the presence mask to `cog_function` explicitly; a COG that has no
# column in `data` counts as absent.
in_both_otus = (data.loc['Otu0001_1'] > 0) & (data.loc['Otu0007_1'] > 0)
in_both_otus = in_both_otus.reindex(cog_function.index, fill_value=False)

(cog_function.loc[in_both_otus]
             .join(freq_in_mouse_strains)
             .sort_values('freq')
             [lambda x: x.function_categories.str.contains('E', na=False)])

In [None]:
# Long-format view of the count matrix with the same MAGs removed as in the
# ordinations above, annotated with the COG reference columns.
dropped_mags = [ 'Otu0001_3'
               , 'Otu0003_1'
               , 'Otu0002_1'
               , 'Otu0012_1'
               , 'Otu0015_1'
               , 'Otu0006_1'
               ]

# Stack the COG columns into rows, then look each cog_id up in `cog_function`.
long_counts = data.drop(dropped_mags).stack().reset_index()
long_counts.join(cog_function, on='cog_id')