In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Execute ridges.py and load the names it defines into this namespace.
# NOTE(review): presumably this is where the ridges() helper used by the
# plotting cell comes from -- confirm against ridges.py.
%run ridges.py

In [None]:
import pandas as pd
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from random import sample

In [None]:
# Make seaborn's 'deep' palette the default colour cycle for the plots below.
sns.set_palette('deep')

## Load data

In [None]:
# Community membership for the bin_rnd_01 run: every TSV in the directory is
# read without a header row as (community, org_id) and the pieces are stacked.
orgs01 = pd.concat([
    pd.read_csv(path, sep='\t', header=None, names=['community', 'org_id'])
    for path in glob('../communities/bin_rnd_01/*.tsv')
])
# The second "_"-separated token of the community label is parsed as an
# integer and stored as the community size.
orgs01["size"] = orgs01["community"].map(lambda label: int(label.split("_")[1]))

In [None]:
# Community membership for the bin_rnd_001 run: every TSV in the directory is
# read without a header row as (community, org_id) and the pieces are stacked.
orgs001 = pd.concat([
    pd.read_csv(path, sep='\t', header=None, names=['community', 'org_id'])
    for path in glob('../communities/bin_rnd_001/*.tsv')
])
# The second "_"-separated token of the community label is parsed as an
# integer and stored as the community size.
orgs001["size"] = orgs001["community"].map(lambda label: int(label.split("_")[1]))

In [None]:
# Community sizes kept for the analysis; rows of any other size are dropped
# from both membership tables.
sizes = [2, 4, 6, 8, 10, 15, 20, 25, 30, 40]
orgs01 = orgs01.loc[lambda frame: frame["size"].isin(sizes)]
orgs001 = orgs001.loc[lambda frame: frame["size"].isin(sizes)]

In [None]:
# Long-format abundance table; this notebook uses its 'sample', 'org_id' and
# 'value' columns.
samples = pd.read_csv('../data/emp_150bp_filtered.tsv', sep='\t')
samples['log_value'] = np.log10(samples['value'])

# Presence/absence matrix (org_id x sample): 1 where the table has a row for
# that organism in that sample, 0 otherwise.  The indicator column lives only
# on a temporary copy, so `samples` ends up without a 'bin_value' column.
samples_wide = pd.pivot_table(samples.assign(bin_value=1), index='org_id',
                              columns='sample', values='bin_value',
                              fill_value=0)

## Merge co-occurrence and composition

In [None]:
def merge_data(cooc, samples_long=None, samples_bin=None):
    """Find the samples in which whole communities occur and attach abundances.

    A community is reported for a sample only when the number of its members
    present in that sample equals the community's total member count in
    ``cooc`` (i.e. every member listed in ``cooc`` is present).

    Parameters
    ----------
    cooc : pandas.DataFrame
        Long-format membership table with at least ``community`` and
        ``org_id`` columns.  It is left untouched: the helper indicator
        column is added to a temporary copy only.
    samples_long : pandas.DataFrame, optional
        Long-format table with ``sample`` and ``org_id`` columns; its other
        columns are carried into the result.  Defaults to the notebook-level
        ``samples`` frame.
    samples_bin : pandas.DataFrame, optional
        0/1 presence matrix indexed by ``org_id`` with one column per sample.
        Defaults to the notebook-level ``samples_wide`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, community, org_id) for every community that is
        completely present in the sample, carrying the remaining columns of
        ``cooc`` and ``samples_long``.
    """
    if samples_long is None:
        samples_long = samples
    if samples_bin is None:
        samples_bin = samples_wide

    # org_id x community membership matrix.  Built from a copy so the caller's
    # frame never gains (or loses) a 'value' column and no chained-assignment
    # warning is raised when `cooc` is a filtered slice.
    cooc_wide = pd.pivot_table(cooc.assign(value=1), index='org_id',
                               columns='community', values='value',
                               fill_value=0)

    # Restrict both matrices to the organisms they share so the rows line up
    # for the matrix product.
    common = sorted(set(samples_bin.index) & set(cooc_wide.index))
    samples_common = samples_bin.loc[common, :]
    cooc_common = cooc_wide.loc[common, :]

    # (community x sample) count of community members present in each sample.
    cooc_comms_wide = cooc_common.T.dot(samples_common)
    # Keep a pair only if that count equals the full community size; the size
    # is taken from the unrestricted matrix, so a community with a member
    # missing from `samples_bin` can never qualify.
    cooc_comms_bin = cooc_comms_wide.eq(cooc_wide.sum(axis=0), axis=0).astype(int)

    # Long format: one row per (sample, community); the flag column is named 0.
    cooc_comms = cooc_comms_bin.unstack().reset_index()
    cooc_comms = cooc_comms[cooc_comms[0] > 0].drop(columns=[0])

    # Expand each community to its members, then attach per-sample values.
    cooc_comms = pd.merge(cooc_comms, cooc, on="community")
    cooc_comms = pd.merge(cooc_comms, samples_long, on=["sample", "org_id"])

    return cooc_comms

In [None]:
# Samples in which whole bin_rnd_01 communities are present, one row per
# community member with the columns of `samples` attached.
data01 = merge_data(orgs01)

In [None]:
# Samples in which whole bin_rnd_001 communities are present, one row per
# community member with the columns of `samples` attached.
data001 = merge_data(orgs001)

### Estimate cumulative abundance of co-occurring communities per sample

In [None]:
# Summed 'value' of each community's members within each sample.
# The aggregation is given by name: passing the builtin `sum` to .agg() is
# deprecated in recent pandas releases and emits a FutureWarning.
sum_01 = data01.groupby(["sample", "community", "size"], as_index=False).agg({"value": "sum"})

In [None]:
# Summed 'value' of each community's members within each sample.
# The aggregation is given by name: passing the builtin `sum` to .agg() is
# deprecated in recent pandas releases and emits a FutureWarning.
sum_001 = data001.groupby(["sample", "community", "size"], as_index=False).agg({"value": "sum"})

### Estimate abundance of random sub-communities per sample

In [None]:
def random_subsamples(df, reps=100, seed=None, samples_long=None):
    """Sum the values of randomly drawn, equally sized organism sets per sample.

    For every distinct (size, sample) pair in ``df``, ``size`` values are
    drawn without replacement from that sample's rows in ``samples_long`` and
    summed; this is repeated ``reps`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sample`` and ``size`` columns.
    reps : int, optional
        Number of random draws per (size, sample) pair.  Defaults to 100.
    seed : int or None, optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible.  ``None`` (the default) draws from the
        module-level generator of ``random``, as before.
    samples_long : pandas.DataFrame, optional
        Long-format table with ``sample`` and ``value`` columns.  Defaults to
        the notebook-level ``samples`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample``, ``size``, ``rep`` and ``value`` (the summed draw).

    Raises
    ------
    KeyError
        If a sample in ``df`` has no rows in ``samples_long``.
    ValueError
        If ``size`` exceeds the number of rows available for a sample.
    """
    import random

    if samples_long is None:
        samples_long = samples

    draw = random.sample if seed is None else random.Random(seed).sample

    # Collect each needed sample's values once, instead of a label lookup on a
    # non-unique index for every (size, sample) pair.
    wanted = set(df["sample"])
    pools = {
        s_id: values.tolist()
        for s_id, values in samples_long.groupby("sample")["value"]
        if s_id in wanted
    }

    records = []
    for size, group in df.groupby("size"):
        k = int(size)  # groupby keys are numpy integers; random.sample wants a plain int
        for s_id in group["sample"].drop_duplicates():
            values = pools[s_id]
            for rep in range(reps):
                records.append((s_id, size, rep, sum(draw(values, k))))

    return pd.DataFrame(records, columns=["sample", "size", "rep", "value"])

In [None]:
# Random same-size draws for every (size, sample) pair that appears in data01.
rnd_01 = random_subsamples(data01)

In [None]:
# Random same-size draws for every (size, sample) pair that appears in data001.
rnd_001 = random_subsamples(data001)

In [None]:
# Label each table with its origin; both random baselines share one label.
sum_01["type"], sum_001["type"] = 'bin_rnd_01', 'bin_rnd_001'
rnd_01["type"] = rnd_001["type"] = 'random'

# Stack the four tables on their shared columns, in the order observed
# communities first, random baselines second.
merged = pd.concat([
    part[["sample", "size", "type", "value"]]
    for part in (sum_01, sum_001, rnd_01, rnd_001)
])

# log10 of the summed value, used as the plotted variable.
merged = merged.assign(log_value=np.log10(merged["value"]))

In [None]:
# Draw the figure with ridges(), which is not defined in this notebook.
# NOTE(review): presumably it comes from `%run ridges.py`; the tuple looks like
# the x-range in log10 units and the string like an axis label -- confirm
# against ridges.py.
axs = ridges(merged, "log_value", (-4.4,0.1), "subpopulation abundance")
# Ticks on the last axis sit at log10 positions and are labelled as the
# matching percentages (10**-4 = 0.01% ... 10**0 = 100%).
axs[-1].set_xticks([-4, -3, -2, -1, 0])
axs[-1].set_xticklabels(['0.01%', '0.1%','1%', '10%', '100%'])
# Write the current figure to disk at 300 dpi.
plt.savefig("../figures/fig_2d.png", dpi=300)