In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# Gather the per-MAG COG annotation tables from both annotation directories.
cog_paths = glob('res/core.a.mags.annot.d/*.cog.tsv') + glob('res/ref.mags.annot.d/*.cog.tsv')
if not cog_paths:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" message.
    raise ValueError(
        'no *.cog.tsv files found under res/core.a.mags.annot.d/ or res/ref.mags.annot.d/'
    )

all_frames = []
for path in cog_paths:
    # Files are read as header-less, tab-separated tables with two columns.
    frame = pd.read_table(path, names=['orf_id', 'cog'])
    # The MAG id is the file name up to its first dot.
    frame['mag_id'] = path.split('/')[-1].split('.')[0]
    all_frames.append(frame)

# Count matrix: one row per mag_id, one column per cog, each cell holding the
# number of annotation rows for that pair (0 where the pair never occurs).
data = (
    pd.concat(all_frames, ignore_index=True)
    .groupby(['mag_id', 'cog'])
    .size()
    .unstack('cog')
    .fillna(0)
    .astype(int)
)   #.drop('Otu0001_3')

In [None]:
def pcoa_t(counts, diss='jaccard', return_prop_explained=False):
    """Ordinate the rows of ``counts`` with ``pcoa`` on a ``beta_diversity`` matrix.

    Parameters
    ----------
    counts : pandas.DataFrame
        Table whose ``.values`` are passed to ``beta_diversity`` and whose
        index supplies the sample ids.
    diss : str, default 'jaccard'
        Dissimilarity metric name handed to ``beta_diversity``.
    return_prop_explained : bool, default False
        When true, also return the proportion explained by each axis.

    Returns
    -------
    The ``samples`` table of the ``pcoa`` result, or a tuple of that table and
    a ``pandas.Series`` of ``proportion_explained`` indexed by the table's
    columns.
    """
    distances = beta_diversity(diss, counts.values, ids=counts.index)
    ordination = pcoa(distances)
    coords = ordination.samples
    if not return_prop_explained:
        return coords
    explained = pd.Series(ordination.proportion_explained, index=coords.columns)
    return coords, explained

def pca_t(counts, return_prop_explained=False):
    """Project the rows of ``counts`` onto their principal components.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per sample; the row index is carried over to the result.
    return_prop_explained : bool, default False
        When true, also return the share of variance explained per component.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, pandas.Series)
        Component scores with columns ``'PC1'``, ``'PC2'``, ...; optionally
        paired with a Series (same labels) of each component's explained
        variance divided by the total explained variance.
    """
    model = PCA()
    scores = model.fit_transform(counts)
    # Components are labelled from 1. Formatting the default 0-based column
    # numbers made the first component 'PC0', so selecting 'PC1'/'PC2' (as the
    # plotting cells do) silently skipped the leading component.
    labels = ['PC{}'.format(i) for i in range(1, scores.shape[1] + 1)]
    out = pd.DataFrame(scores, index=counts.index, columns=labels)
    if return_prop_explained:
        variance = model.explained_variance_
        return out, pd.Series(variance / variance.sum(), index=out.columns)
    return out

In [None]:
# Designation ('plant', 'host' or 'starch') for each reference genome id.
# The plotting cells use it to pick the colour of each annotation label.
# NOTE(review): presumably taken from a published classification (the name
# suggests Ormerod et al.) -- confirm and cite the source.
# NOTE(review): 'H3' was listed twice (both times as 'starch'); the duplicate
# key is removed here. If the second entry was meant to be a different
# genome, add it.
ormerod_desig = {
    'GP4': 'plant',
    'GP3': 'plant',
    'H7': 'plant',
    'K1': 'plant',
    'M13': 'plant',
    'GP1': 'plant',
    'M2': 'plant',
    'M8': 'plant',
    'M1': 'plant',
    'M12': 'plant',
    'H5': 'plant',
    'Homeothermus_arabinoxylanisolvens': 'plant',
    'GP2': 'host',
    'M9': 'host',
    'M14': 'host',
    'H6': 'host',
    'M5': 'host',
    'M6': 'starch',
    'M11': 'starch',
    'H2': 'starch',
    'H4': 'starch',
    'H10': 'starch',
    'M10': 'starch',
    'H3': 'starch',
    'H8': 'starch',
    'H9': 'starch',
    'H1': 'starch',
    'M7': 'starch',
    'K10': 'starch',
    'M3': 'starch',
}

In [None]:
# PCoA (Euclidean distances) of the MAGs over every COG column, drawn as a
# labelled scatter of the first two axes.
feats = data.columns
# NOTE(review): the reason for excluding these six MAGs is not recorded here.
d = data[feats].drop([
    'Otu0001_3',
    'Otu0003_1',
    'Otu0002_1',
    'Otu0012_1',
    'Otu0015_1',
    'Otu0006_1',
])

ordin, prop_explained = pcoa_t(d, return_prop_explained=True, diss='euclidean')

comps = 'PC1', 'PC2'
# Label colour per designation; anything without a known designation is black.
desig_colors = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    # A lookup with a default always yields a colour; the previous if/elif
    # chain could reuse the prior iteration's colour for an unrecognised
    # designation.
    color = desig_colors.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)

# Axis labels do not depend on the row, so they are set once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

In [None]:
# PCA of the square-root-transformed counts for a hand-picked set of COGs,
# drawn as a labelled scatter of two components.
feats = ['COG3507', 'COG3866', 'COG4677', 'COG2730', 'COG3693', 'COG0366', 'COG3525', 'COG3119']
# NOTE(review): the reason for excluding these six MAGs is not recorded here.
d = data[feats].drop([
    'Otu0001_3',
    'Otu0003_1',
    'Otu0002_1',
    'Otu0012_1',
    'Otu0015_1',
    'Otu0006_1',
])

# The ordination is computed once, by pca_t. An inline PCA on the
# untransformed counts used to precede this call, but its `ordin` and
# `prop_explained` were overwritten here before being used.
ordin, prop_explained = pca_t(np.sqrt(d), return_prop_explained=True)

comps = 'PC1', 'PC2'
# Label colour per designation; anything without a known designation is black.
desig_colors = {'starch': 'blue', 'host': 'purple', 'plant': 'green'}

fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(comps[0], comps[1], data=ordin)
for mag_id, row in ordin.iterrows():
    # A lookup with a default always yields a colour; the previous if/elif
    # chain could reuse the prior iteration's colour for an unrecognised
    # designation.
    color = desig_colors.get(ormerod_desig.get(mag_id), 'k')
    ax.annotate(mag_id, (row[comps[0]], row[comps[1]]), color=color)

# Axis labels do not depend on the row, so they are set once after the loop.
ax.set_xlabel('{} ({})'.format(comps[0], prop_explained[comps[0]]))
ax.set_ylabel('{} ({})'.format(comps[1], prop_explained[comps[1]]))

# NOTE(review): disabled sketch for overlaying per-COG unit vectors. It needs a
# fitted PCA object named `fit`, which this cell does not create; to revive it,
# fit the model on the same sqrt-transformed data that is plotted.
#for cog in feats:
#    a = pd.Series({cog: 1}, index=feats).fillna(0).to_frame()
#    ax.scatter(comps[0], comps[1], data=fit.transform(a))