In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from glob import glob
from sklearn.decomposition import PCA
from matplotlib_venn import venn2

In [None]:
# Make seaborn's "deep" palette the default colour cycle for the plots below.
sns.set_palette('deep')

In [None]:
# Replicate identifiers; interpolated into the community directory names and
# iterated over when counting samples per replicate.
replicates = [1, 2, 3] 
# Threshold values; interpolated into the community directory names and later
# rendered as percentage labels (e.g. 0.001 -> "0.100%").
thresholds = [0.01, 0.001, 0.0001, 0.00001]

In [None]:
# One directory fragment per (replicate, threshold) pair, replicate-major.
# NOTE: floats are rendered with their default text form, so 0.00001 becomes
# "1e-05" -- presumably the directories on disk use that spelling (confirm).
types = [
    f"replicates/repl_10000_{threshold}_{replicate}"
    for replicate in replicates
    for threshold in thresholds
]

In [None]:
# Load the community tables.
# `orgs` maps each community-type path fragment to a single DataFrame with the
# columns "community" and "org_id", built from every header-less TSV file in
# ../communities/<commtype>/.
orgs = {}
for commtype in types:
    # glob() returns paths in arbitrary order; sorting makes the row order of
    # the concatenated frame reproducible across runs and machines.
    filenames = sorted(glob(f'../communities/{commtype}/*.tsv'))
    if not filenames:
        # Without this guard pd.concat fails with the much less helpful
        # "No objects to concatenate".
        raise FileNotFoundError(f"No .tsv files found in ../communities/{commtype}/")
    orgs[commtype] = pd.concat(
        [pd.read_csv(filename, sep='\t', header=None, names=["community", "org_id"])
         for filename in filenames]
    )

In [None]:
# Annotate every community table in place with metadata parsed from names.
for commtype, orgs_i in orgs.items():
    # The second underscore-separated field of the community name is taken as
    # the community size.
    orgs_i["size"] = orgs_i["community"].str.split("_").str[1].astype("int64")
    # commtype ends in "..._<threshold>_<replicate>": the threshold is shown as
    # a percentage label with three decimals (0.001 -> "0.100%").
    orgs_i["threshold"] = f"{float(commtype.split('_')[-2]):.3%}"
    # Parse the whole trailing field rather than only the last character, so
    # replicate numbers with more than one digit are read correctly.
    orgs_i["replicate"] = int(commtype.split('_')[-1])

In [None]:
# Stack all annotated community tables into one long frame. ignore_index=True
# gives a fresh 0..n-1 index directly, instead of resetting the index in place
# and deleting the leftover "index" column afterwards.
df = pd.concat(list(orgs.values()), ignore_index=True)
# Constant indicator column; used as the cell value when the long table is
# pivoted into wide matrices.
df["value"] = 1

## PCA

In [None]:
# Restrict the ordination to communities with at most 30 members.
dfb = df[df["size"] <= 30]

# For each (size, threshold, replicate, organism): number of rows divided by
# 1000, stored in the "community" column.
# NOTE(review): 1000 is presumably the number of communities generated per
# size -- confirm against the community generation step.
df3 = (
    dfb.groupby(["size", "threshold", "replicate", "org_id"], as_index=False)
       .agg({"community": lambda members: len(members) / 1000})
)

# Wide matrix: one row per (threshold, replicate, size), one column per
# organism; organisms absent from a group get 0.
df4 = df3.pivot_table(index=["threshold", "replicate", "size"], columns="org_id",
                      values="community", fill_value=0)

# PCA with default settings; the scores keep the (threshold, replicate, size)
# labels as ordinary columns next to the integer-named component columns.
pca = PCA()
Y = pca.fit_transform(df4.to_numpy())
dfY = pd.DataFrame(Y, index=df4.index).reset_index()

# Fraction of variance explained by the first two components (axis labels).
pc1, pc2 = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]

In [None]:
# Distinct threshold labels in descending string order; passed as `hue_order`
# to the PCA scatter plot so the legend follows that order.
hue_order = dfY["threshold"].drop_duplicates().sort_values(ascending=False)

In [None]:
# Long sample table (needs at least the columns "org_id" and "sample"), with a
# constant indicator column added for pivoting.
samples = pd.read_csv('../data/emp_150bp_filtered.tsv', sep='\t').assign(value=1)
# Organism-by-sample matrix: the indicator where the pair occurs, 0 elsewhere.
samples_wide = samples.pivot_table(index='org_id', columns='sample', values='value',
                                   fill_value=0)

In [None]:
def count_samples(dfi, sample_matrix=None):
    """List the samples in which every member of a community is present.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Long-format community membership with at least the columns
        "org_id", "community", "size" and "value" (the indicator that is
        pivoted into the membership matrix; expected to be 1).
    sample_matrix : pandas.DataFrame, optional
        Organism-by-sample 0/1 matrix indexed by org_id. When omitted, the
        notebook-level ``samples_wide`` is used, which keeps existing
        ``count_samples(dfi)`` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per (community, sample) pair in which all members of the
        community occur. Columns are "community", "size", the name of the
        sample axis of the matrix ("sample" for ``samples_wide``) and
        "value" (always 1).
    """
    if sample_matrix is None:
        sample_matrix = samples_wide

    # Organism-by-community membership matrix (1 = member, 0 = not a member).
    df_wide = pd.pivot_table(dfi, index='org_id', columns=["community", "size"],
                             values='value', fill_value=0)

    # Only organisms known to both tables can be matched.
    common = sorted(set(sample_matrix.index) & set(df_wide.index))
    samples_common = sample_matrix.loc[common, :]
    cooc_common = df_wide.loc[common, :]

    # Entry (community, sample) = number of community members found in the sample.
    cooc_comms_wide = cooc_common.T.dot(samples_common)

    # A community counts as present only if that number equals its full member
    # count. The count comes from df_wide (not the restricted matrix), so a
    # community with a member missing from sample_matrix never matches.
    cooc_comms_bin = cooc_comms_wide.eq(df_wide.sum(axis=0), axis=0).astype(int)

    # Back to long format, keeping only the matching pairs.
    cooc_comms = pd.melt(cooc_comms_bin.reset_index(), id_vars=["community", "size"],
                         value_vars=cooc_comms_bin.columns).query("value > 0")
    return cooc_comms

In [None]:
# Count, for every community, the number of samples that contain all of its
# members, separately for each threshold/replicate combination.
# NOTE: `samples` is rebound here from the raw sample table to a dict of
# per-run results keyed by (threshold, replicate); count_samples reads
# `samples_wide`, which is not affected by the rebinding.
samples = {}
totals = []
# Indicator column needed by count_samples (idempotent if already present).
df['value'] = 1

for x in [0.001, 0.0001]:
    for y in replicates:
        # Thresholds are stored as percentage labels, hence the '{x:.3%}' match.
        dfi = df.query(f"size <= 40 and threshold == '{x:.3%}' and replicate == {y}")
        # assign() returns a new frame, so no columns are written onto the
        # filtered result of count_samples (which may emit SettingWithCopyWarning).
        si = count_samples(dfi).assign(replicate=y, threshold=f"{x:.3%}")
        # "sum" (string) rather than the builtin: pandas deprecates passing
        # builtin callables to agg and recommends the string alias.
        ti = si.groupby(["community", "size", "replicate", "threshold"],
                        as_index=False).agg({"value": "sum"})
        samples[(x,y)] = si
        totals.append(ti)

# One row per community with its number of matching samples in "value".
total = pd.concat(totals)

In [None]:
# Two stacked panels: (a) PCA of the community composition, (b) number of
# samples containing a community as a function of community size.
f, axs = plt.subplots(2,1, figsize=(10,7))

# Fixed colour per threshold label so both panels share the same coding.
palette = {"1.000%": '#ed4517', "0.100%": '#ed7e17', "0.010%": '#1ba055', "0.001%": '#1ba09e'}

# Panel a: first two principal components (integer-named columns 0 and 1).
sns.scatterplot(
    data=dfY, x=0, y=1,
    hue="threshold", hue_order=hue_order, palette=palette,
    size="size", sizes=(10,100), style="replicate",
    legend='brief', ax=axs[0],
)
axs[0].set(xlabel=f"PC1: {pc1:.1%}", ylabel=f"PC2: {pc2:.1%}",
           xlim=(-2.4, 3.8), title="a")
# Legend outside the axes, to the right.
axs[0].legend(bbox_to_anchor=(1.03, 1), loc=2)

# Panel b: mean with a standard-deviation band per threshold.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of errorbar="sd";
# switch once the minimum supported seaborn version is confirmed.
sns.lineplot(data=total, x="size", y="value", hue="threshold", palette=palette,
             ci="sd", ax=axs[1])
axs[1].legend(bbox_to_anchor=(1.03, 1), loc=2)
axs[1].set(xlim=(2,40), xlabel="Number of co-occurring species",
           ylabel="Number of samples", title="b")

f.tight_layout()

f.savefig("../figures/supp_fig_2.png", dpi=300)