# Analyze the results of tree trimming



## Setup

In [542]:
import os 
import gc
import re
import csv
import glob
import math
import umap
import json
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from time import time
from tqdm import tqdm
from scipy import stats
from collections import * 
from sklearn import cluster
from sklearn import decomposition
from ete4 import NCBITaxa, Tree
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches


In [2]:
import sys
sys.path.append('../repo-armbrust-metat-search')

In [3]:
import functions.fn_metat_files as fnf

In [96]:
ncbi = NCBITaxa()

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
os.getcwd()

In [6]:
workdir = '/scratch/bgrodner/iron_ko_contigs'
os.chdir(workdir)


In [7]:
os.getcwd()

In [8]:
os.listdir()

Plotting

In [9]:
def general_plot(
    xlabel="", ylabel="", ft=12, dims=(5, 3), col="k", lw=1, pad=0, tr_spines=True
):
    """Create a single-axes figure with the notebook's standard styling.

    Parameters
    ----------
    xlabel, ylabel : str
        Axis labels.
    ft : int
        Font size for tick labels and axis labels.
    dims : sequence of two numbers
        Figure (width, height) passed to ``figsize``.
    col : color
        Color applied to spines, ticks, tick labels and axis labels.
    lw : float
        Line width of all spines.
    pad : float
        Padding passed to ``tight_layout``.
    tr_spines : bool
        If True the top/right spines are kept (and colored); if False they
        are hidden.

    Returns
    -------
    (fig, ax)
    """
    width, height = dims[0], dims[1]
    fig, ax = plt.subplots(figsize=(width, height), tight_layout={"pad": pad})

    for spine in ax.spines.values():
        spine.set_linewidth(lw)

    # Top/right spines are either colored like the rest or hidden entirely.
    for side in ("top", "right"):
        if tr_spines:
            ax.spines[side].set_color(col)
        else:
            ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_color(col)

    ax.tick_params(direction="in", labelsize=ft, color=col, labelcolor=col)
    ax.set_xlabel(xlabel, fontsize=ft, color=col)
    ax.set_ylabel(ylabel, fontsize=ft, color=col)
    ax.patch.set_alpha(0)  # transparent axes background
    return (fig, ax)

def plot_umap(
    embedding,
    figsize=(10, 10),
    markersize=10,
    alpha=0.5,
    colors="k",
    xticks=[],
    yticks=[],
    markerstyle='o',
    cmap_name='tab20',
    cl_lab=False
):
    """Scatter-plot the first two columns of a 2-D embedding.

    Parameters
    ----------
    embedding : array indexed as ``embedding[:, 0]`` / ``embedding[:, 1]``
    figsize : (width, height) forwarded to ``general_plot``
    markersize, alpha : forwarded to ``ax.scatter``
    colors : a single color string, or a per-point sequence
    xticks, yticks : tick positions; left at matplotlib's defaults when empty
    markerstyle : a single marker string (one vectorized scatter call) or a
        per-point sequence of markers (one scatter call per point)
    cmap_name : colormap name, used only in the single-marker branch
    cl_lab : currently unused; kept for interface compatibility

    Returns
    -------
    (fig, ax)
    """
    from itertools import repeat

    fig, ax = general_plot(dims=figsize)
    if isinstance(markerstyle, str):
        ax.scatter(
            embedding[:, 0],
            embedding[:, 1],
            s=markersize,
            alpha=alpha,
            c=colors,
            edgecolors="none",
            marker=markerstyle,
            cmap=cmap_name
        )
    else:
        # A single color string (e.g. the default "k") must be broadcast to
        # every point; zipping the string itself would iterate its characters
        # and silently truncate the plot to len(colors) points.
        point_colors = repeat(colors) if isinstance(colors, str) else colors
        for e0, e1, c, m in zip(
            embedding[:, 0],
            embedding[:, 1],
            point_colors,
            markerstyle
        ):
            ax.scatter(
                e0,
                e1,
                s=markersize,
                alpha=alpha,
                c=c,
                edgecolors="none",
                marker=m
            )
    ax.set_aspect("equal")
    if len(xticks) > 0:
        ax.set_xticks(xticks)
    if len(yticks) > 0:
        ax.set_yticks(yticks)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    return fig, ax


#### Get KO dict

Get dataframe

In [91]:
ko_fn = "ko00001.json"
# Flatten the nested KEGG BRITE hierarchy into one row per level-D entry.
# Level-C nodes without a "children" key contribute no rows.
database = [
    [level_a["name"], level_b["name"], level_c["name"], level_d["name"]]
    for level_a in (record["children"] for _, record in pd.read_json(ko_fn).iterrows())
    for level_b in level_a["children"]
    for level_c in level_b["children"]
    if "children" in level_c
    for level_d in level_c["children"]
]
df_kegg = pd.DataFrame(database, columns=["Level_A", "Level_B", "Level_C", "Level_D"])
df_kegg.shape


In [92]:
ld = df_kegg['Level_D'].values
ld[:5]

In [484]:
# Map the leading word of each Level_D string to the full string.
dict_ko_name = {re.search(r"^\w+", name)[0]: name for name in ld}

## Load example data

G1PA filenames

In [131]:
# All G1PA inputs live under one results directory and share one file stem.
dir_ko_dict = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig'
g1pa_stem = 'NPac.G1PA.bf100.id99.aa.best.Kofam.incT30.csv.gz-iron_KOs.txt'
fn_dict_ko_contigs = f'{dir_ko_dict}/{g1pa_stem}-dict.json'

fn_dict_taxtrim_contigs = f'{dir_ko_dict}/tidy_tables/{g1pa_stem}-tidys/{g1pa_stem}-dict_taxtrim_contigs.json'

dir_counts = f'{dir_ko_dict}/dicts_contig_count'
glob_fn = f'{dir_counts}/*G1PA*.json'
# glob returns files in arbitrary order; sort so the per-sample column order
# (derived from `fns`) is reproducible between runs.
fns = sorted(glob.glob(glob_fn))

fn_dict_contig_taxon = f'{dir_ko_dict}/dicts_contig_tax/{g1pa_stem}-dict_contig_taxid.json'

Load estcounts ordered by sample

In [None]:
# One count per file, stored at the file's position in `fns`. Pre-allocating
# keeps every contig's list the same length and aligned with `fns` even when
# a contig is absent from some count files (absent -> 0.0); appending would
# silently shift later samples into earlier slots.
n_count_files = len(fns)
dict_contig_estcounts = defaultdict(lambda n=n_count_files: [0.0] * n)
for i_fn, fn in enumerate(fns):
    # Load counts
    with open(fn, 'r') as f:
        dict_ctg_cnt = json.load(f)
    for ctg, cnt in dict_ctg_cnt.items():
        # map contig to counts (first element of the stored value)
        dict_contig_estcounts[ctg][i_fn] = float(cnt[0])

Load ko contigs and get inverted dict and rework contig from 6tr format

In [50]:
with open(fn_dict_ko_contigs, 'r') as fh:
    dict_ko_contigs = json.load(fh)

# Strip the trailing "_<digits>" suffix from each contig id, and build the
# contig -> KO inverse map alongside the cleaned KO -> contigs map.
_suffix = re.compile(r'_\d+$')
dict_contig_ko = {}
dict_ko_contigs_new = defaultdict(list)
for ko, raw_ids in dict_ko_contigs.items():
    for raw_id in raw_ids:
        contig_id = _suffix.sub('', raw_id)
        dict_contig_ko[contig_id] = ko
        dict_ko_contigs_new[ko].append(contig_id)
dict_ko_contigs = dict_ko_contigs_new

Load taxtrim contigs and undo 6tr and get inverted dict

In [53]:
with open(fn_dict_taxtrim_contigs, 'r') as fh:
    dict_taxtrim_contigs = json.load(fh)

# Same suffix stripping as for the KO map, plus the contig -> trimmed-taxon
# inverse map.
_suffix = re.compile(r'_\d+$')
dict_taxtrim_contigs_new = defaultdict(list)
dict_contig_taxtrim = {}
for tid, raw_ids in dict_taxtrim_contigs.items():
    for raw_id in raw_ids:
        contig_id = _suffix.sub('', raw_id)
        dict_contig_taxtrim[contig_id] = tid
        dict_taxtrim_contigs_new[tid].append(contig_id)

dict_taxtrim_contigs = dict_taxtrim_contigs_new

Load untrimmed taxa and undo 6tr

In [136]:
with open(fn_dict_contig_taxon, 'r') as fh:
    dict_contig_taxon = json.load(fh)

# Re-key by contig id with the trailing "_<digits>" suffix removed.
dict_contig_taxon_new = {
    re.sub(r'_\d+$', '', raw_id): tid
    for raw_id, tid in dict_contig_taxon.items()
}

dict_contig_taxon = dict_contig_taxon_new

List of samples:

In [31]:
# Sample name = the text between ".tar.gz." and ".tsv-" in each count filename.
_sample_pat = re.compile(r'(?<=\.tar\.gz\.).+(?=\.tsv-)')
samples = [_sample_pat.search(fn)[0] for fn in fns]
samples[:3]

Get metadata for samples

In [30]:
meta_fn = '/scratch/bgrodner/repo-armbrust-metat/gradients1/g1_station_pa_metat/sample_metadata.csv'
meta = pd.read_csv(meta_fn)
# Collect one metadata row per sample, in the same order as `samples`.
df_meta = []
for s in samples:
    sid = re.search(r"(?<=G1PA\.).+(?=\.abundance)", s)[0]
    # Sample names use "." where the metadata SampleID uses "_". A plain
    # str.replace avoids the invalid '\.' escape the non-raw regex string had.
    sid = sid.replace('.', '_')
    row = meta[meta['SampleID'] == sid].squeeze()
    df_meta.append(row)
df_meta = pd.DataFrame(df_meta, index=samples)
df_meta[:3]

## Raw numbers

How many KOs, contigs, taxa, samples are there?

In [68]:
print(f"""
There are 
    {len(dict_ko_contigs)} KOs 
    {len(dict_taxtrim_contigs)} taxa
    {len(dict_contig_estcounts):,} contigs
    {len(next(iter(dict_contig_estcounts.values())))} samples
      """)

How many taxon-ko pairs are there?

In [101]:
# Unique "KO-taxon" labels over all contigs that have a trimmed taxon.
ko_taxon_pairs = {
    f'{ko}-{dict_contig_taxtrim[c]}'
    for ko, contigs in dict_ko_contigs.items()
    for c in contigs
    if dict_contig_taxtrim.get(c)
}
print(f"""
There are 
    {len(ko_taxon_pairs)} KO-taxon pairs 
      """)

Metadata info

In [43]:
print(f"""
There are
    Depths:\t{df_meta['Depth'].unique()}
    Filter:\t{df_meta['Filter'].unique()}
    Latitude:\t{[float(round(l,2)) for l in df_meta['Latitude'].sort_values().unique()]}
    Longitude:\t{[float(round(l,2)) for l in df_meta['Longitude'].sort_values().unique()]}
""")

## Distributions

How many samples is each KO in?

In [56]:
# For each KO, sum counts over its contigs and count samples with a nonzero total.
dict_ko_nsams = {}
for ko, contigs in dict_ko_contigs.items():
    ko_total = np.zeros(len(samples))
    for contig in contigs:
        ko_total += dict_contig_estcounts[contig]
    dict_ko_nsams[ko] = (ko_total != 0).sum()



In [57]:
fig, ax = general_plot()
_ = ax.hist(list(dict_ko_nsams.values()))
_ = ax.set_xlabel('Number of samples a KO is in')
_ = ax.set_ylabel('Number KOs')

How many taxa is each KO in?

In [61]:
# Number of distinct trimmed taxa among each KO's contigs.
dict_ko_ntax = {
    ko: len({dict_contig_taxtrim[c] for c in contigs if dict_contig_taxtrim.get(c)})
    for ko, contigs in dict_ko_contigs.items()
}

In [62]:
fig, ax = general_plot()
_ = ax.hist(list(dict_ko_ntax.values()))
_ = ax.set_xlabel('Number of taxa a KO is in')
_ = ax.set_ylabel('Number KOs')

How many samples is each taxon in?

In [74]:
# For each taxon, sum counts over its contigs and count samples with a nonzero total.
dict_tax_nsams = {}
for tax, contigs in dict_taxtrim_contigs.items():
    tax_total = np.zeros(len(samples))
    for contig in contigs:
        tax_total += dict_contig_estcounts[contig]
    dict_tax_nsams[tax] = (tax_total != 0).sum()



In [75]:
fig, ax = general_plot()
_ = ax.hist(list(dict_tax_nsams.values()))
_ = ax.set_xlabel('Number of samples a taxon is in')
_ = ax.set_ylabel('Number taxa')

How many KOs does each taxon have?

In [69]:
# Number of distinct KOs among each taxon's contigs.
dict_tax_nkos = {
    tax: len({dict_contig_ko[c] for c in contigs if dict_contig_ko.get(c)})
    for tax, contigs in dict_taxtrim_contigs.items()
}

In [71]:
fig, ax = general_plot()
_ = ax.hist(list(dict_tax_nkos.values()), bins=20)
_ = ax.set_xlabel('Number of KOs in a taxon')
_ = ax.set_ylabel('Number taxa')

How many samples is each KO-taxon pair present in?

In [84]:
# KO -> taxon -> per-sample count vector, accumulated over contigs.
dict_ko_tax_estcounts = defaultdict(lambda: defaultdict(lambda: np.zeros(len(samples))))
for ko, contigs in dict_ko_contigs.items():
    for contig in contigs:
        tax = dict_contig_taxtrim.get(contig)
        if not tax:
            continue  # contig has no trimmed taxon
        dict_ko_tax_estcounts[ko][tax] += dict_contig_estcounts[contig]
# Number of samples with a nonzero count, one entry per KO-taxon pair.
kotax_nsams = [
    ec.astype(bool).sum()
    for per_taxon in dict_ko_tax_estcounts.values()
    for ec in per_taxon.values()
]



In [88]:
# Histogram over KO-taxon pairs (one entry of kotax_nsams per pair).
fig, ax = general_plot()
_ = ax.hist(kotax_nsams, bins=20)
_ = ax.set_xlabel('Number of samples a KO in a given taxon is in')
_ = ax.set_ylabel('Number KO-taxon pairs')

### Inspect interesting subsets

Which KOs are not in all the samples?

In [94]:
for ko, nsams in dict_ko_nsams.items():
    if nsams < 20:
        print(dict_ko_name[ko])

Which KOs are in many taxa?

In [95]:
for ko, ntax in dict_ko_ntax.items():
    if ntax > 60:
        print(dict_ko_name[ko])

Which taxa are not in all the samples?

In [98]:
for tax, nsams in dict_tax_nsams.items():
    if nsams < 47:
        print(ncbi.get_taxid_translator([tax]))

Which taxa have lots of KOs?

In [102]:
for tax, nkos in dict_tax_nkos.items():
    if nkos > 90:
        print(ncbi.get_taxid_translator([tax]))

## Compare bacteria and eukaryotes

Get dict defining each taxon as bacteria or eukaryote

In [110]:
# Show an example lineage. The bare `lineage` that used to be here is only
# defined by the following cell, so it raised NameError on a fresh kernel.
lineage = ncbi.get_lineage(next(iter(dict_taxtrim_contigs)))
lineage

In [111]:
# Lineage taxids checked in priority order: 2759 -> Eukaryota, 2 -> Bacteria.
_domain_by_taxid = ((2759, 'Eukaryota'), (2, 'Bacteria'))
dict_taxtrim_bacteuk = {}
for tax in dict_taxtrim_contigs:
    lineage = ncbi.get_lineage(tax)
    dict_taxtrim_bacteuk[tax] = next(
        (label for taxid, label in _domain_by_taxid if taxid in lineage),
        'other',
    )

How many bacterial vs eukaryota KOs?

In [123]:
# For each KO, the set of domain labels of the taxa its contigs belong to,
# joined into one string ('none' when no contig has a trimmed taxon).
dict_ko_ebs = {}
for ko, contigs in dict_ko_contigs.items():
    trimmed = (dict_contig_taxtrim.get(c) for c in contigs)
    ebs = {dict_taxtrim_bacteuk[tax] for tax in trimmed if tax}
    dict_ko_ebs[ko] = ', '.join(str(x) for x in ebs) if ebs else 'none'
    

In [124]:
names, counts = np.unique(list(dict_ko_ebs.values()), return_counts=True)

fig, ax = general_plot()
_ = ax.bar(names, counts)
_ = ax.set_ylabel('Number KOs')
# Rotate the labels the categorical axis already produced. Calling
# set_xticklabels here would install a FixedFormatter without a FixedLocator
# (matplotlib warns about that combination).
_ = plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

Which KOs are only in bacteria?

In [126]:
# The label string is built by joining a set, so its order is not stable
# between kernels ('other, Bacteria' vs 'Bacteria, other'). Compare as sets.
for ko, ebs in dict_ko_ebs.items():
    if set(ebs.split(', ')) in ({'Bacteria'}, {'Bacteria', 'other'}):
        print(dict_ko_name[ko])

Which KOs are in both Eukaryota and Bacteria?

In [127]:
# Order-insensitive match: the joined label may read 'Eukaryota, Bacteria'
# or 'Bacteria, Eukaryota' depending on set iteration order.
for ko, ebs in dict_ko_ebs.items():
    if set(ebs.split(', ')) == {'Eukaryota', 'Bacteria'}:
        print(dict_ko_name[ko])

Which taxa are the siderophore KOs in?

In [130]:
kos = ['K04792','K15721','K22736']
# Trimmed taxa of the contigs carrying each target KO; KOs with no hit get no entry.
dict_kotarget_taxa = defaultdict(set)
for ko in kos:
    contigs = dict_ko_contigs[ko]
    for tax in map(dict_contig_taxtrim.get, contigs):
        if tax:
            dict_kotarget_taxa[ko].add(tax)

for ko in dict_kotarget_taxa:
    print(dict_ko_name[ko])
    for tid in dict_kotarget_taxa[ko]:
        print(f'\t{ncbi.get_taxid_translator([tid])}')

Which taxa (untrimmed?) are the siderophore KOs in?

In [138]:
kos = ['K04792','K15721','K22736']
# Untrimmed taxon (first element of the stored value) of the contigs carrying
# each target KO.
dict_kotarget_taxa = defaultdict(set)
for ko in kos:
    contigs = dict_ko_contigs[ko]
    for c in contigs:
        entry = dict_contig_taxon.get(c)
        # A contig with no taxonomy entry used to crash on `None[0]`; skip it.
        if not entry:
            continue
        tax = entry[0]
        if tax:
            dict_kotarget_taxa[ko].add(tax)

for ko, taxa in dict_kotarget_taxa.items():
    print(dict_ko_name[ko])
    for tid in taxa:
        print(f'\t{ncbi.get_taxid_translator([tid])}')

## Hamming distances between ko-taxon pairs

Get boolean array

In [141]:
# Row labels and boolean presence rows, both in dict iteration order.
index_kotax = [
    f"{ko}-{tax}"
    for ko, per_taxon in dict_ko_tax_estcounts.items()
    for tax in per_taxon
]
arr_kotax = np.array([
    ec.astype(bool)
    for per_taxon in dict_ko_tax_estcounts.values()
    for ec in per_taxon.values()
])
arr_kotax.shape

UMAP on array

In [149]:
nn = 7
# UMAP is stochastic; fix the seed so the embedding (and every plot drawn
# from it) is reproducible across kernel restarts.
umap_seed = 42
reducer = umap.UMAP(metric='hamming', n_neighbors=nn, random_state=umap_seed)
embedding = reducer.fit_transform(arr_kotax)

In [150]:
dims = (5,5)
alpha = 0.1
fig, ax = general_plot(dims=dims)
_ = ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    alpha=alpha
)
_ = ax.set_aspect('equal','datalim')

Cluster ko-taxon pairs

In [None]:
dists = distance.pdist(arr_kotax, metric='hamming')

In [188]:
# NOTE(review): `dists` is a Hamming distance vector. Per the SciPy docs the
# 'centroid' method is only correctly defined for Euclidean distances, and
# centroid linkage can produce non-monotonic merge heights, which affects
# distance-threshold cuts with fcluster. Consider 'average' or 'complete'.
# Left unchanged here because the cluster sizes hard-coded further down depend on it.
link = hierarchy.linkage(dists, method='centroid')

In [189]:
fig = plt.figure(figsize=(25, 10))
dn = hierarchy.dendrogram(link, truncate_mode='lastp')

In [200]:
t = 0.2
clust_hier = hierarchy.fcluster(link, t=t, criterion='distance')
# nclust = 6
# clust_hier = hierarchy.fcluster(link, t=nclust, criterion='maxclust')


In [201]:
dims = (5,5)
alpha = 0.1
fig, ax = general_plot(dims=dims)
_ = ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    alpha=alpha,
    c=clust_hier,
    cmap='tab10'
)
_ = ax.set_aspect('equal','datalim')

In [202]:
np.unique(clust_hier, return_counts=True)

Try with agglomerative clustering

In [175]:
dists_square = distance.squareform(dists)

In [183]:
clust_agg_fit = cluster.AgglomerativeClustering(metric='precomputed', linkage='complete').fit(dists_square)
clust_agg = clust_agg_fit.labels_

In [184]:
np.unique(clust_agg, return_counts=True)

In [185]:
dims = (5,5)
alpha = 0.1
fig, ax = general_plot(dims=dims)
_ = ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    alpha=alpha,
    c=clust_agg,
    cmap='tab10'
)
_ = ax.set_aspect('equal','datalim')

What is the distribution across samples for each cluster?

In [199]:
# Per-cluster mean presence in each sample (rows: samples, columns: clusters).
clusts = np.unique(clust_agg)
cl_df = [arr_kotax[clust_agg == cl].mean(axis=0).squeeze() for cl in clusts]

pd.DataFrame(np.array(cl_df).T, columns=clusts, index=samples).sort_index()

From the table above, cluster 1 appears to contain the KO-taxon pairs that are present in fewer samples, while cluster 0 contains those present in all or almost all of the samples.

### Find groups of taxon-ko pairs with very small hamming distance

What is the metric for 1, 2, 3 disagreements?

In [205]:
for i in range(10):
    print(i, i / len(samples))

Get 1 disagreement

In [208]:
t = 0.03
clust_hier = hierarchy.fcluster(link, t=t, criterion='distance')
clusts, counts = np.unique(clust_hier, return_counts=True)
print(counts[counts > 1])


#### Look at the big cluster

In [215]:
clids = clusts[counts >= 310]
clids

In [228]:
for clid in clids:
    bool_ind = (clust_hier == clid)
    inds = np.array(index_kotax)[bool_ind]
    arr_sub = arr_kotax[bool_ind]
    print(arr_sub.mean(axis=0))

^ This is the ko-taxa that are present across almost all samples

#### Look at the next biggest cluster

In [225]:
clids = clusts[counts == 31]
clids

In [227]:
for clid in clids:
    bool_ind = (clust_hier == clid)
    inds = np.array(index_kotax)[bool_ind]
    arr_sub = arr_kotax[bool_ind]
    print(arr_sub.mean(axis=0))

^ this is the almost empty pairs

#### Look at some other clusters

In [229]:
clids = clusts[(counts > 10) & (counts < 30)]
clids

In [231]:
for clid in clids:
    bool_ind = (clust_hier == clid)
    inds = np.array(index_kotax)[bool_ind]
    arr_sub = arr_kotax[bool_ind]
    print(clid)
    print(arr_sub.mean(axis=0))

^ this is mixed some are interesting

### Get 2 disagreements

In [232]:
t = 0.05
clust_hier = hierarchy.fcluster(link, t=t, criterion='distance')
clusts, counts = np.unique(clust_hier, return_counts=True)
print(counts[counts > 1])


Get those with count greater than three and present in a moderate number of samples

In [244]:
# Print clusters with more than three members whose mean presence exceeds 0.5
# in a moderate number of samples (4-9), grouped by taxon name.
for cl, cnt in zip(clusts, counts):
    if cnt <= 3:
        continue
    bool_ind = (clust_hier == cl)
    arr_sub = arr_kotax[bool_ind]
    mn = arr_sub.mean(axis=0)
    npos = int((mn > 0.5).sum())
    if not (3 < npos < 10):
        continue
    print('\n\n',cl, '\n', mn)

    inds = np.array(index_kotax)[bool_ind]
    dict_tk = defaultdict(list)
    for kotax in inds:
        ko, tax = kotax.split('-')
        # Local names chosen so the notebook-level clustering threshold `t`
        # is no longer overwritten by a taxon name.
        tax_name = list(ncbi.get_taxid_translator([tax]).values())[0]
        dict_tk[tax_name].append(dict_ko_name[ko])
    for tax_name, ko_names in dict_tk.items():
        print('\t', tax_name)
        for ko_name in ko_names:
            print('\t\t',ko_name)


# Merged samples tree

Get tensor filename

In [771]:
# fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy.csv'
# fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_9_minbatches_4.csv'
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'

os.path.exists(fn_tensor)

Get header

In [772]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Which csv reader is fastest?

In [773]:
with open(fn_tensor, 'r') as f:
    t0 = time()
    for i in range(10000):
        row = next(f)
        row = row.split(',')
        sample = row[0]
    t1 = time()
print(f"{(t1-t0) // 60} min {(t1-t0) % 60} sec")

In [263]:
with open(fn_tensor, 'r') as f:
    reader = csv.reader(f)
    t0 = time()
    for i in range(10000):
        row = next(reader)
        sample = row[0]
    t1 = time()
print(f"{(t1-t0) // 60} min {(t1-t0) % 60} sec")

In [264]:
with open(fn_tensor, 'r') as f:
    reader = csv.DictReader(f)
    t0 = time()
    for i in range(10000):
        row = next(reader)
        sample = row['assm_sample']
    t1 = time()
print(f"{(t1-t0) // 60} min {(t1-t0) % 60} sec")

Read in dict

In [774]:
# taxon -> KO -> "<sample>-<rep>" -> estcounts, read from the tidy tensor CSV.
dict_taxtrim_ko_sample_estcounts = defaultdict(
    lambda: defaultdict(dict)
)
i = 0  # number of data rows read
with open(fn_tensor, 'r') as f:
    _ = next(f)  # skip header
    for row in f:
        i += 1
        # Strip the line terminator first: `rep` is the last field, so without
        # this every sample key would end in "\n".
        sample, ko, tax, estcounts, rep = row.rstrip('\r\n').split(',')
        sam = f"{sample}-{rep}"
        dict_taxtrim_ko_sample_estcounts[tax][ko][sam] = float(estcounts)
i

## Raw numbers

How many KOs, taxa, samples, and taxon-KO pairs are there?

In [775]:
# Unique taxa, KOs, samples and taxon-KO labels present in the nested dict.
taxa_all = set(dict_taxtrim_ko_sample_estcounts)
kos_all, samples_all, taxko_all = set(), set(), set()
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    kos_all.update(dkse)
    taxko_all.update(f"{tax}-{ko}" for ko in dkse)
    for dse in dkse.values():
        samples_all.update(dse)

In [776]:
print(f"""
There are 
    {len(kos_all)} KOs 
    {len(taxa_all)} taxa
    {len(samples_all)} samples
    {len(taxko_all)} taxon-kos
    {len(kos_all)*len(taxa_all)*len(samples_all):,d} total cells
      """)

## Tree

Build tree

In [777]:
# Annotate every node with the number of KOs recorded for that taxon
# ('NA' for nodes that are not themselves in taxa_all).
tree = ncbi.get_topology(taxa_all)
for n in tree.traverse():
    # Use the string form for both the membership test and the lookup;
    # indexing the defaultdict with a non-string name would insert an empty
    # entry and report 0 KOs.
    taxid = str(n.name)
    if taxid in taxa_all:
        nkos = len(dict_taxtrim_ko_sample_estcounts[taxid])
    else:
        nkos = 'NA'
    n.add_props(
        nkos=nkos
    )


Plot tree

In [795]:
print(tree.to_str(props=['sci_name','name','nkos'], compact=True))

## Distributions

How many samples is each KO in?

In [751]:
# Sum counts per (KO, sample) over all taxa, then count samples with a nonzero total.
dict_ko_sam_ec = defaultdict(lambda: defaultdict(float))
for dkse in dict_taxtrim_ko_sample_estcounts.values():
    for ko, dse in dkse.items():
        for sam, ec in dse.items():
            dict_ko_sam_ec[ko][sam] += ec
dict_ko_nsams = {
    ko: sum(1 for ec in dse.values() if ec)
    for ko, dse in dict_ko_sam_ec.items()
}

In [457]:
fig, ax = general_plot()
_ = ax.hist(list(dict_ko_nsams.values()))
_ = ax.set_xlabel('Number of samples a KO is in')
_ = ax.set_ylabel('Number KOs')

How many taxa is each KO in?

In [458]:
# Taxa in which each KO appears, and their count.
dict_ko_taxa = defaultdict(list)
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    for ko in dkse:
        dict_ko_taxa[ko].append(tax)

dict_ko_ntax = {ko: len(taxa) for ko, taxa in dict_ko_taxa.items()}

In [459]:
fig, ax = general_plot()
_ = ax.hist(list(dict_ko_ntax.values()), bins=15)
_ = ax.set_xlabel('Number of taxa a KO is in')
_ = ax.set_ylabel('Number KOs')

How many samples is each taxon in?

In [460]:
# Sum counts per (taxon, sample) over all KOs, then count samples with a nonzero total.
dict_tax_sam_ec = defaultdict(lambda: defaultdict(float))
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    for dse in dkse.values():
        for sam, ec in dse.items():
            dict_tax_sam_ec[tax][sam] += ec
dict_tax_nsams = {
    tax: sum(1 for ec in dse.values() if ec)
    for tax, dse in dict_tax_sam_ec.items()
}



In [461]:
fig, ax = general_plot()
_ = ax.hist(list(dict_tax_nsams.values()), bins=20)
_ = ax.set_xlabel('Number of samples a taxon is in')
_ = ax.set_ylabel('Number taxa')

How many KOs does each taxon have?

In [462]:
# Number of KOs recorded under each taxon.
dict_tax_nkos = {
    tax: len(dkse) for tax, dkse in dict_taxtrim_ko_sample_estcounts.items()
}

In [463]:
fig, ax = general_plot()
_ = ax.hist(list(dict_tax_nkos.values()), bins=20)
_ = ax.set_xlabel('Number of KOs in a taxon')
_ = ax.set_ylabel('Number taxa')

How many samples is each KO-taxon pair present in?

In [464]:
# Per "taxon-KO" label: per-sample totals, then the number of nonzero samples.
dict_taxko_sam_ec = defaultdict(lambda: defaultdict(float))
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    for ko, dse in dkse.items():
        label = f"{tax}-{ko}"
        for sam, ec in dse.items():
            dict_taxko_sam_ec[label][sam] += ec
dict_taxko_nsams = {
    taxko: sum(1 for ec in dse.values() if ec)
    for taxko, dse in dict_taxko_sam_ec.items()
}

In [465]:
# Histogram over taxon-KO pairs (one value of dict_taxko_nsams per pair).
fig, ax = general_plot()
_ = ax.hist(list(dict_taxko_nsams.values()), bins=100)
_ = ax.set_xlabel('Number of samples a KO in a given taxon is in')
_ = ax.set_ylabel('Number KO-taxon pairs')

## Compare bacteria and eukaryotes

Get dict defining each taxon as bacteria or eukaryote

In [466]:
# Lineage taxids checked in priority order: 2759 -> Eukaryota, 2 -> Bacteria.
_domain_by_taxid = ((2759, 'Eukaryota'), (2, 'Bacteria'))
dict_taxtrim_bacteuk = {}
for tax in taxa_all:
    lineage = ncbi.get_lineage(tax)
    dict_taxtrim_bacteuk[tax] = next(
        (label for taxid, label in _domain_by_taxid if taxid in lineage),
        'other',
    )

How many bacterial vs eukaryota KOs?

In [467]:
# Domain labels of the taxa each KO occurs in, then joined into one string.
dict_ko_ebsset = defaultdict(set)
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    for ko in dkse:
        dict_ko_ebsset[ko].add(dict_taxtrim_bacteuk[tax])

dict_ko_ebs = {
    ko: (', '.join(str(x) for x in ebs) if ebs else 'none')
    for ko, ebs in dict_ko_ebsset.items()
}
    

In [468]:
names, counts = np.unique(list(dict_ko_ebs.values()), return_counts=True)

fig, ax = general_plot()
_ = ax.bar(names, counts)
_ = ax.set_ylabel('Number KOs')
# Rotate the labels the categorical axis already produced. Calling
# set_xticklabels here would install a FixedFormatter without a FixedLocator
# (matplotlib warns about that combination).
_ = plt.setp(ax.get_xticklabels(), rotation=45, ha='right')


Which KOs are only in bacteria?

In [469]:
for ko, ebs, in dict_ko_ebs.items():
    if (ebs == 'Bacteria'):
        print(dict_ko_name[ko])

Which KOs are only in eukaryota?

In [470]:
for ko, ebs, in dict_ko_ebs.items():
    if (ebs == 'Eukaryota'):
        print(dict_ko_name[ko])

## Make a tensor with only a few taxa

Get tensor filename

In [485]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy.csv'

os.path.exists(fn_tensor)

Get header

In [486]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [494]:
fn_tensor_sub = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy-subset_taxa_01.csv"

Get a list of taxa to subset

In [523]:
# Species, taxid, nkos
# Each entry is "name,taxid,nkos"; every larger subset extends the previous one.
nodes_sub_01 = [
    "Pelagomonadales,54409,211",
    "Thalassiosirales,33847,234",
    "Pseudomonadota,1224,326",
]

nodes_sub_02 = nodes_sub_01 + [
    "Pelagomonas calceolata,35677,252",
    "Bacillati,1783272,288",
    "Micromonas,38832,226",
]

nodes_sub_03 = nodes_sub_02 + [
    "Pseudomonadati,3379134,298",
    "Bacteria,2,298",
    "Karlodinium veneficum,407301,243",
    "Karenia brevis,156230,203",
    "Symbiodiniaceae,252141,265",
    "Pelagodinium beii,43686,221",
]

# Taxids (as strings) of the active subset; swap in nodes_sub_01 for the small one.
taxa_sub = {
    tid
    for _name, tid, _nkos in (entry.split(',') for entry in nodes_sub_03)
}
taxa_sub

Write rows to tidytable

In [477]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

### Subset 01

Component 16

In [479]:
print(dict_ko_name['K08906'])

taxa = [54409, 1224]

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[t])

Component 45

In [481]:
kos = [
    'K22341', 'K00532'
]
for ko in kos:
    print(dict_ko_name[ko])

taxa = [
    '33847', '1224'
]
dict_tax_name = ncbi.get_taxid_translator(taxa)
for t in taxa:
    print(t,dict_tax_name[int(t)])


### Sample clusters

In [None]:
dtn = {
    '54409' : "Pelagomonadales",
    '33847' : "Thalassiosirales",
    '1224' : "Pseudomonadota",
}

cluster 3

In [483]:
kos = [
    'K00216', 'K00362', 'K02364', 'K03839', 'K03840', 'K07243', 'K11532',
    'K12239', 'K14690', 'K17877', 'K21990',
]
# KO id -> full KEGG name for the KOs listed above.
dkn = {k: dict_ko_name[k] for k in kos}
dkn

## Subset batches in taxon subset

Output filename

In [524]:
fn_tensor_sub = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy-subset_taxa_03-subset_batches_01.csv"
# fn_tensor_sub = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy-subset_taxa_01-subset_batches_01.csv"

Get a list of batches to subset

In [525]:
# Species, taxid, nkos
# Batch identifiers to keep; the subset-writing cell compares them against
# the metadata 'assembly' value looked up for each row's assm_sample.
batches_sub_01 = [
    "D1PA",
    "G1PA",
    "G2PA",
    "G3PA"
]
batches_sub_02 = [
    "D1PA",
    "G1PA",
    "G2PA",
]

Get metadata

In [738]:
fn_meta = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-metadata.csv'
metadata = pd.read_csv(fn_meta)
metadata[:2]

Get dict mapping assm sample to batch

In [519]:
dict_assmsam_batch = dict(zip(
    metadata['assm_sample'],
    metadata['assembly']
))

Write rows to tidytable

In [526]:
i = 0  # number of data rows written
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    fw.write(next(fr))  # Write header
    for row in fr:
        sample, _ko, tax, _estcounts, _rep = row.split(",")
        # Keep rows whose batch and taxon are both in the selected subsets.
        in_batch = dict_assmsam_batch[sample] in batches_sub_01
        if in_batch and tax in taxa_sub:
            fw.write(row)
            i += 1

i

### Sub taxa 03, sub batches 01: Find component 1 latitudes

Component 1

In [758]:
kos = ['K07214', 'K02641', 'K02639', 'K00927', 'K03320']
taxa = ['35677', '2', '3379134']
samples = ['G3PA-UW32-7-3.0', 'G3PA-UW31#2-7-3.0', 'G2PA-S17C1-15m-3um', 'G3PA-UW32#3-7-3.0', 'G3PA-UW25-7-3.0', 'G2PA-S02C1-15m-3um', 'G2PA-S07C1-15m-3um', 'G1PA-S02C1-3um', 'G1PA-S14C1-3um', 'G3PA-UW40-7-3.0', 'G2PA-S18C1-15m-3um', 'G2PA-S11C1-15m-3um', 'G3PA-UW29-7-3.0', 'G2PA-S06C1-15m-3um', 'D1PA-S33C1-1800', 'G1PA-S13C1-3um', 'G3PA-UW35#2-7-3.0', 'D1PA-S35C1-200', 'G2PA-S15C1-15m-3um', 'G2PA-S16C1-15m-3um']


Get sample metadata

In [756]:
metadata[metadata.assembly == 'G3PA'][:3]

In [762]:
meta_sams = []
# Sample strings start with "<assembly>-<sample>-"; print the latitudes
# recorded in the metadata for each.
for s in samples:
    assm, sam = s.split('-')[:2]
    matches = (metadata['assembly'] == assm) & (metadata['sample'] == sam)
    row = metadata[matches]
    print(s, row.latitude.unique())

## Make a G3NS tensor with only a few taxa

Get tensor filename

In [352]:
fn_tensor = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/G3NS_kofam2021_ALL.csv-iron_KOs.txt-tidys/G3NS_kofam2021_ALL.csv-iron_KOs.txt-barnacle_tensor_tidy.csv"
os.path.exists(fn_tensor)

Get header

In [354]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Read in dict

In [355]:
# taxon -> KO -> "<sample>-<rep>" -> estcounts, read from the tidy tensor CSV.
dict_taxtrim_ko_sample_estcounts = defaultdict(
    lambda: defaultdict(dict)
)
i = 0  # number of data rows read
with open(fn_tensor, 'r') as f:
    _ = next(f)  # skip header
    for row in f:
        i += 1
        # Strip the line terminator first: `rep` is the last field, so without
        # this every sample key would end in "\n".
        sample, ko, tax, estcounts, rep = row.rstrip('\r\n').split(',')
        sam = f"{sample}-{rep}"
        dict_taxtrim_ko_sample_estcounts[tax][ko][sam] = float(estcounts)
i

How many KOs, taxa, samples, and taxon-KO pairs are there?

In [356]:
# Unique taxa, KOs, samples and taxon-KO labels present in the nested dict.
taxa_all = set(dict_taxtrim_ko_sample_estcounts)
kos_all, samples_all, taxko_all = set(), set(), set()
for tax, dkse in dict_taxtrim_ko_sample_estcounts.items():
    kos_all.update(dkse)
    taxko_all.update(f"{tax}-{ko}" for ko in dkse)
    for dse in dkse.values():
        samples_all.update(dse)

In [357]:
print(f"""
There are 
    {len(kos_all)} KOs 
    {len(taxa_all)} taxa
    {len(samples_all)} samples
    {len(taxko_all)} taxon-kos
      """)

Build tree

In [358]:
# Annotate every node with the number of KOs recorded for that taxon
# ('NA' for nodes that are not themselves in taxa_all).
tree = ncbi.get_topology(taxa_all)
for n in tree.traverse():
    # Use the string form for both the membership test and the lookup;
    # indexing the defaultdict with a non-string name would insert an empty
    # entry and report 0 KOs.
    taxid = str(n.name)
    if taxid in taxa_all:
        nkos = len(dict_taxtrim_ko_sample_estcounts[taxid])
    else:
        nkos = 'NA'
    n.add_props(
        nkos=nkos
    )


Plot tree

In [359]:
print(tree.to_str(props=['sci_name','name','nkos'], compact=True))

### 7 taxa

Output filename

In [362]:
fn_tensor_sub = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/G3NS_kofam2021_ALL.csv-iron_KOs.txt-tidys/G3NS_kofam2021_ALL.csv-iron_KOs.txt-barnacle_tensor_tidy-sub_taxa_01.csv"

Get a list of taxa to subset

In [363]:
# Species, taxid, nkos
# Each entry is "name,taxid,nkos".
nodes_sub_01 = [
    "Pseudoalteromonas sp. '520P1 No. 412',304208,47",
    "Alteromonas,226,86",
    "Roseobacteraceae,2854170,45",
    "Paracoccaceae,31989,52",
    "Pelagomonas calceolata,35677,134",
    "Thalassiosira,35127,42",
    "Stramenopiles MAST-4,1735725,60",
]

# Taxids (as strings) of the selected nodes.
taxa_sub = {
    tid
    for _name, tid, _nkos in (entry.split(',') for entry in nodes_sub_01)
}
taxa_sub

Write rows to tidytable

In [366]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

### 14 taxa

Output filename

In [378]:
fn_tensor_sub = "/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/G3NS_kofam2021_ALL.csv-iron_KOs.txt-tidys/G3NS_kofam2021_ALL.csv-iron_KOs.txt-barnacle_tensor_tidy-sub_taxa_02.csv"

Get a list of taxa to subset

In [379]:
# Species, taxid, nkos
# Each entry is "name,taxid,nkos".
nodes_sub_01 = [
    "Pseudoalteromonas sp. '520P1 No. 412',304208,47",
    "Alteromonas,226,86",
    "Roseobacteraceae,2854170,45",
    "Paracoccaceae,31989,52",
    "Pelagomonas calceolata,35677,134",
    "Thalassiosira,35127,42",
    "Stramenopiles MAST-4,1735725,60",
    "Flavobacteriaceae,49546,139",
    "Flavobacteria bacterium MS024-2A,487796,61",
    "Alphaproteobacteria,28211,143",
    "Vibrionales,135623,158",
    "Dinophyceae,2864,225",
    "Chlorophyta,3041,80",
    "Ochrophyta,2696291,167",
]

# Taxids (as strings) of the selected nodes.
taxa_sub = {
    tid
    for _name, tid, _nkos in (entry.split(',') for entry in nodes_sub_01)
}
taxa_sub

Write rows to tidytable

In [380]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

In [342]:
taxa = [      2,     976,    2759,   49546, 2696291,  131567,    2864,
         35677,    1236,   33313,   78238,    3041,    2916,    2836,
        200644, 2698737,   35675,   33656,  135623,   28211]

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[t])

Component 1

In [371]:
taxa = ['35677', '31989', '226', '304208']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [368]:
kos = ['K02722', 'K00372' ,'K07214', 'K02691', 'K00855', 'K02709']

for k in kos:
    print(dict_ko_name[k])

Component 2

In [377]:
taxa = ['35677', '226','35127']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [369]:
kos = ['K00532', 'K06441', 'K22338', 'K02705', 'K22341', 'K10850', 'K00372', 'K02598', 'K01602', 'K02567', 'K02709']
for k in kos:
    print(dict_ko_name[k])

Component 3

In [373]:

taxa = ['1735725', '31989', '226', '35127']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [375]:
kos = ['K00368', 'K16087', 'K22552', 'K02567', 'K19611', 'K04565']
for k in kos:
    print(dict_ko_name[k])

Component 4

In [374]:

taxa = ['35127', '226', '304208', '1735725']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [376]:
kos = ['K07214', 'K23723', 'K10850', 'K07243', 'K01595', 'K00855']
for k in kos:
    print(dict_ko_name[k])

## Subset 02

Component 23

In [390]:

taxa = ['135623', '28211', '49546', '2864']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [389]:
kos = ['K03320', 'K00362']
for k in kos:
    print(dict_ko_name[k])

In [400]:
samples = ['G3.UW.NS-UW32-32.93-3.7m-0.2um', 'G3.UW.NS-UW29-29.46-1.7m-0.2um', 'G3.UW.NS-UW25-25.87-1.7m-0.2um', 'G3.UW.NS-UW32-32.93-3.7m-3um', 'G3.UW.NS-UW31-31.43-2.7m-3um', 'G3.UW.NS-UW35-35.83-2.7m-0.2um', 'G3.UW.NS-UW32-32.3-1.7m-3um', 'G3.UW.NS-UW32-32.3-1.7m-0.2um', 'G3.UW.NS-UW37-37.0-1.7m-0.2um', 'G3.UW.NS-UW35-35.83-2.7m-3um']
# Sample ids are five '-'-separated fields; keep the last three as a short label
lats, infos = [], []
for s in samples:
    assm, sam, lat, depth, size = s.split('-')
    lats.append(lat)
    infos.append('-'.join((lat, depth, size)))

# [x for _, x in sorted(zip(lats, infos))]
infos

Component 58

In [396]:

taxa = ['487796', '226', '49546', '2696291', '3041']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [397]:
kos = ['K19611', 'K06503', 'K00264', 'K01851', 'K00265', 'K16090', 'K25224', 'K00615', 'K00134', 'K03832']

for k in kos:
    print(dict_ko_name[k])

In [399]:
samples = ['G3.UW.NS-UW32-32.93-3.7m-0.2um', 'G3.UW.NS-UW32-32.3-1.7m-0.2um', 'G3.UW.NS-UW29-29.46-1.7m-0.2um', 'G3.UW.NS-UW25-25.87-1.7m-0.2um', 'G3.UW.NS-UW35-35.96-1.7m-0.2um', 'G3.UW.NS-UW32-32.3-1.7m-3um', 'G3.UW.NS-UW31-31.43-2.7m-3um', 'G3.UW.NS-UW40-40.09-2.7m-0.2um', 'G3.UW.NS-UW32-32.93-3.7m-3um', 'G3.UW.NS-UW25-25.87-1.7m-3um']

lats = []
infos = []
for s in samples:
    assm, sam, lat, depth, size = s.split('-')
    lats.append(lat)
    infos.append(f"{lat}-{depth}-{size}")

infos

Component 62

In [401]:

taxa = ['2864', '2696291', '35677', '49546', '28211', '3041', '135623', '1735725', '35127', '226']

dict_tax_name = ncbi.get_taxid_translator(taxa)

for t in taxa:
    print(t,dict_tax_name[int(t)])

In [402]:
kos = ['K00264', 'K00265', 'K00284', 'K01673', 'K00266', 'K00615', 'K01624', 'K04759', 'K24034', 'K02364']

for k in kos:
    print(dict_ko_name[k])

In [403]:
samples = ['G3.UW.NS-UW40-40.88-1.7m-3um', 'G3.UW.NS-UW40-40.88-1.7m-0.2um', 'G3.UW.NS-UW38-38.97-1.7m-3um', 'G3.UW.NS-UW29-29.46-1.7m-3um', 'G3.UW.NS-UW38-38.97-1.7m-0.2um', 'G3.UW.NS-UW35-35.96-1.7m-3um', 'G3.UW.NS-UW40-40.09-2.7m-3um', 'G3.UW.NS-UW35-35.96-1.7m-0.2um', 'G3.UW.NS-UW32-32.3-1.7m-3um', 'G3.UW.NS-UW25-25.87-1.7m-3um']

lats = []
infos = []
for s in samples:
    assm, sam, lat, depth, size = s.split('-')
    lats.append(lat)
    infos.append(f"{lat}-{depth}-{size}")

infos

Taxa clusters

In [410]:
taxa = ['226', '2864', '3041', '28211', '31989', '35127', '35677', '49546',
       '135623', '304208', '487796', '1735725', '2696291', '2854170']

dict_tax_name = ncbi.get_taxid_translator(taxa)
dtn = {}
for t in taxa:
    print(t,dict_tax_name[int(t)])
    dtn[t] = dict_tax_name[int(t)]
dtn

KO clusters 2

In [405]:
kos = ['K07214', 'K10850', 'K04783', 'K00372', 'K23723', 'K16087', 'K04564',
       'K00855', 'K02217', 'K02705']

for k in kos:
    print(dict_ko_name[k])

KO clusters 3

In [406]:
kos = ['K19611', 'K06503', 'K00265', 'K16090', 'K01851', 'K08906', 'K11707',
       'K11709', 'K11710', 'K11951']

for k in kos:
    print(dict_ko_name[k])

KO clusters 8

In [407]:
kos = ['K01595', 'K00134', 'K01803', 'K01624', 'K16088', 'K00927', 'K00615',
       'K04565', 'K04782', 'K11951']

for k in kos:
    print(dict_ko_name[k])

KO clusters 9

In [408]:
kos = ['K00264', 'K01595', 'K16087', 'K02638', 'K03699', 'K00175', 'K00265',
       'K00179', 'K01601', 'K07684']

for k in kos:
    print(dict_ko_name[k])

Sample cluster 7 and 17 and 2 and 4

In [414]:
kos = ['K00265', 'K00368', 'K00615', 'K00855', 'K01012', 'K01601', 'K01602',
       'K01624', 'K01672', 'K01673', 'K02364', 'K02639', 'K02689', 'K02695',
       'K02699', 'K02703', 'K02704', 'K02705', 'K02706', 'K02708', 'K02711',
       'K02717', 'K02718', 'K02721', 'K02722', 'K02724', 'K03320', 'K03542',
       'K03594', 'K03839', 'K03841', 'K04564', 'K04565', 'K04755', 'K04759',
       'K04784', 'K04787', 'K05374', 'K05524', 'K07214', 'K08940', 'K10850',
       'K11645', 'K12237', 'K13859', 'K13860', 'K16087', 'K21567', 'K22336',
       'K22552', 'K23723', 'K24110',
       
       'K00264', 'K00265', 'K00266', 'K00284', 'K00362', 'K00522', 'K00532',
       'K00855', 'K01601', 'K01602', 'K02011', 'K02574', 'K02697', 'K02699',
       'K02706', 'K02714', 'K02716', 'K02722', 'K03320', 'K03594', 'K03841',
       'K04565', 'K04641', 'K04784', 'K04786', 'K06441', 'K06503', 'K08906',
       'K11959', 'K13575', 'K14578', 'K15579', 'K16087', 'K19611', 'K19791',
       'K21949', 'K22336', 'K22338', 'K22339', 'K23184', 'K23910', 'K25286',

       'K02012', 'K02689', 'K02690', 'K02711', 'K02717', 'K02724',

       'K00265', 'K00372', 'K01595', 'K02704', 'K02716', 'K02719', 'K02720',
       'K03839', 'K04564', 'K04783', 'K05710', 'K07214', 'K14718', 'K23725',
       'K24245'
]
dkn = {}
for k in kos:
    dkn[k] = dict_ko_name[k]
dkn

taxon cluster 1 and 4

In [413]:
kos = [
    'K00265', 'K01012', 'K01595', 'K01672', 'K01726', 'K02217', 'K02255',
    'K02364', 'K02638', 'K02639', 'K03320', 'K04564', 'K04641', 'K04783',
    'K04784', 'K05524', 'K07214', 'K12237', 'K16087', 'K19611', 'K22336',
    'K22552', 'K23910', 'K25224',

    'K03320'
]
dkn = {}
for k in kos:
    dkn[k] = dict_ko_name[k]
dkn

Sample

## Test batch minimum tree trimming

In [422]:
sys.path.append('../repo-armbrust-metat-search/functions')
from cl_tree_trim_02 import TreeTrim

### Write a test table with only two batches

get tensor filename

In [435]:
fn_tidy_merge = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-tidy_all.csv'
with open(fn_tidy_merge, 'r') as f:
    print(next(f))
    print(next(f))

Get metadata

In [428]:
fn_meta = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-metadata.csv'
metadata = pd.read_csv(fn_meta)
metadata[:2]

Output filename

In [436]:
merges = ['D1PA','G1PA']
fn_tidy_merge_sub = f"/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-tidy_merge_{merges[0]}_{merges[1]}.csv"
out_dir = os.path.split(fn_tidy_merge_sub)[0]
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    print(f"Made dir: {out_dir}")

Write new file

In [438]:
# Look-up from the metadata 'fn_sample_counts' value to its 'assembly' (used as the batch)
dict_sample_batch = {
    fn_counts: assembly
    for fn_counts, assembly in zip(metadata['fn_sample_counts'], metadata['assembly'])
}
merges = set(merges)  # faster "in"
i = 0  # rows written (header excluded)
with open(fn_tidy_merge, 'r') as fr, open(fn_tidy_merge_sub, 'w') as fw:
    # Header is copied through unchanged
    row = next(fr)
    fw.write(row)
    for row in fr:
        contig, sample, ko, tax, estcounts = row.split(",")
        batch = dict_sample_batch[sample]
        if batch not in merges:
            continue
        fw.write(row)
        i += 1

i

### Trim tree with subset merge 

Load object

In [440]:
tree = TreeTrim(fn_tidy_merge_sub)

Trim

In [442]:
tree.trim_tree(
    'nkos_in_gt_minbatches',
    thresh=30,
    dict_sample_batch=dict_sample_batch,
    minsamples=3,
    minbatches=2
)

Inspect tree

In [443]:
print(tree.treetrim.to_str(props=['sci_name','name'], compact=True))

Add info to trees

In [446]:
# Annotate every node of the trimmed tree with the number of its KOs that
# reach at least 3 samples in each batch where the KO has an entry.
for n in tree.treetrim.traverse():
    tid = n.name
    nkos = 0
    for ko, d in tree.dict_taxtrim_ko_sam_estcounts[tid].items():
        # Per batch: how many samples have truthy estcounts for this KO
        dbs = defaultdict(int)
        for sam, estcounts in d.items():
            batch = dict_sample_batch[sam]
            dbs[batch] += bool(estcounts)
        minb = all(nsam >= 3 for nsam in dbs.values())
        if minb:
            nkos += 1
    n.add_props(nkos=nkos)
print(tree.treetrim.to_str(props=['sci_name','name','nkos'], compact=True))

In [478]:
dict_ko_name['K08906']

In [521]:
kos = [
    'K07214', 'K02364', 'K02639'
]

for ko in kos:
    print(dict_ko_name[ko])

## Summarize barnacle fitting

Load csv

In [527]:
fn_table_summary = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/barnacle/table_summary_barnacle_fitting.csv'
df_summary = pd.read_csv(fn_table_summary)
df_summary

Remove failed rows and convert all to float

In [538]:
# Drop rows flagged FAILED in 'Column 1', strip thousands separators, cast to float
notfailed = df_summary['Column 1'] != 'FAILED'
df_summary_trim = (
    df_summary.loc[notfailed]
    .drop(columns='Column 1')
    .astype(str)
    .apply(lambda col: col.str.replace(',',''))
    .astype(float)
)
df_summary_trim

Correlations for each column

In [None]:
# shp = len(df_summary_trim.columns)
# arr_corr = np.zeros((shp,shp))
# arr_pval = np.zeros((shp,shp))
# for i, coli in enumerate(df_summary_trim.columns):
#     for j, colj in enumerate(df_summary_trim.columns):
#         if coli < colj:
#             ci = df_summary_trim[coli]
#             cj = df_summary_trim[colj]
#             corr = stats.spearmanr(ci, cj)
#             arr_corr[i,j] = corr.statistic
#             arr_pval[i,j] = corr.pvalue


In [544]:
# Pairwise (Pearson, the pandas default) correlation between summary columns
corr_raw = df_summary_trim.corr()
# Record undefined correlations BEFORE filling them: once the NaNs are
# replaced, `.isna()` is all-False and the heatmap mask would do nothing.
corr_nan_mask = corr_raw.isna()
# Linkage needs finite values, so undefined correlations are treated as 0
corr_df = corr_raw.replace({np.nan: 0})
# Precalculate linkage to extract clusters later
link = hierarchy.linkage(distance.pdist(np.asarray(corr_df)))
# make clustered heatmap
# using precalculated linkage; cells with undefined correlation are masked out
g = sns.clustermap(
    corr_df,
    row_linkage=link, col_linkage=link,
    mask=corr_nan_mask,
    cmap='PuOr_r', vmin=-1, vmax=1,
    cbar_kws={'shrink':0.5, 'label':'Pearson\nCorrelation'},
    xticklabels=True, yticklabels=True
)

Plot filled cells vs rank

In [545]:
df_summary_trim.columns

In [546]:
x = 'rank'
y = 'number of filled cells (tax-ko-sam_rep with estcounts > 0)'
fig, ax = general_plot()
ax.scatter(
    df_summary_trim[x],
    df_summary_trim[y]
)
ax.set_xlabel(x)
ax.set_ylabel(y)

Plot SSE vs taxon-kos

In [547]:
x = 'number of taxa-kos (with estcounts > 0)'
y = 'SSE'
fig, ax = general_plot()
ax.scatter(
    df_summary_trim[x],
    df_summary_trim[y]
)
ax.set_xlabel(x)
ax.set_ylabel(y)

total cells vs rank

In [548]:
x = 'rank'
y = 'Number of cells'
fig, ax = general_plot()
ax.scatter(
    df_summary_trim[x],
    df_summary_trim[y]
)
ax.set_xlabel(x)
ax.set_ylabel(y)

In [622]:
f"{5084*3*178:,d}"

In [621]:
f"{355*65*531:,d}"

In [624]:
355*65*531/500000*80

In [625]:
355*65

## Check correlation between enterobactin esterase and pelagomonas abundance

Table name

In [552]:
fn_tidy_merge_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tidy_tables/merge_all/iron_KOs.txt-barnacle_tensor_tidy-subset_taxa_03-subset_batches_01.csv'

Targets

In [553]:
ko = 'K07214'  # enterobactin esterase
tid = '35677'  # Pelagomonas calceolata

Get sample values

In [561]:
dict_sam_koestcounts = defaultdict(float)
dict_sam_tidestcounts = defaultdict(float)
with open(fn_tidy_merge_sub, 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row['taxon_trim'] == tid:
            sam = row['assm_sample']
            estcounts = float(row['estcounts'])
            ko_ = row['KO']
            dict_sam_tidestcounts[sam] += estcounts
            if ko_ == ko:
                dict_sam_koestcounts[sam] += estcounts

Get df of values

In [613]:
# All samples seen for either the target KO or the target taxon
sams = set(dict_sam_koestcounts) | set(dict_sam_tidestcounts)
# Parallel lists in the iteration order of `sams`; a missing sample counts as 0
koestcounts = [dict_sam_koestcounts.get(sam, 0) for sam in sams]
tidestcounts = [dict_sam_tidestcounts.get(sam, 0) for sam in sams]


Plot correlation

In [566]:
fig, ax = general_plot()
ax.scatter(tidestcounts, koestcounts)
ax.set_ylabel(f'{ko} counts')
ax.set_xlabel(f'{tid} counts')

Plot log scale

In [589]:
fig, ax = general_plot()
ax.scatter(tidestcounts, koestcounts)
ax.set_ylabel(f'Pelagomonas calceolata\nEnterobactin esterase counts')
ax.set_xlabel(f'Pelagomonas calceolata counts')
ax.set_yscale('log')
ax.set_xscale('log')
ax.axis('equal')
ax.plot([10,20,40,80,160,320,500],[10,20,40,80,160,320,500],'k')


In [588]:
fig, ax = general_plot()
ax.scatter(tidestcounts, koestcounts)
ax.set_ylabel(f'Pelagomonas calceolata\nEnterobactin esterase counts')
ax.set_xlabel(f'Pelagomonas calceolata counts')
# ax.set_yscale('log')
ax.set_xscale('log')
ax.plot([10,20,40,80,160,320,500],[10,20,40,80,160,320,500],'k')


In [576]:
any([x == 0 for x in koestcounts])

Fraction of target KO counts vs. total Pelagomonas counts

In [592]:
fig, ax = general_plot()
# y is the per-sample share of the taxon's estcounts that come from the target KO
ax.scatter(tidestcounts, np.array(koestcounts)/ np.array(tidestcounts))
# The label names the plotted quantity: a fraction, not raw counts
ax.set_ylabel('Enterobactin esterase fraction of\nPelagomonas calceolata counts')
ax.set_xlabel('Pelagomonas calceolata counts')
# ax.set_yscale('log')
ax.set_xscale('log')
# ax.plot([10,20,40,80,160,320,500],[10,20,40,80,160,320,500],'k')


Get dict for all kos in tid

In [593]:
# Nested totals for the target taxon: KO -> sample -> summed estcounts
dict_ko_sam_estcounts = defaultdict(lambda: defaultdict(float))
with open(fn_tidy_merge_sub, 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row['taxon_trim'] == tid:
            sam = row['assm_sample']
            estcounts = float(row['estcounts'])
            # Use `ko_`, not `ko`, so the notebook-level target KO is not
            # overwritten with whatever KO the last matching row holds.
            ko_ = row['KO']
            dict_ko_sam_estcounts[ko_][sam] += estcounts


Which ko is in most samples?

In [602]:
# Rank KOs by the number of samples in which they have estcounts > 0.
# The counting happens inside a generator so no loop variable leaks out:
# the notebook-level target `ko` must not be overwritten here.
nsams_kos_sorted = sorted(
    (sum(ec > 0 for ec in dict_sam_estcounts.values()), ko_)
    for ko_, dict_sam_estcounts in dict_ko_sam_estcounts.items()
)
nsams = [ns for ns, _ in nsams_kos_sorted]
kos = [ko_ for _, ko_ in nsams_kos_sorted]
dict_ko_nsams = dict(zip(kos, nsams))
# Top 20 KOs by sample count, plus the count for the esterase KO
pd.DataFrame({"kos":kos, "nsams":nsams})[-20:], dict_ko_nsams['K07214']

In [603]:
dict_ko_name['K02575']

Plot target vs common ko

In [608]:
kos = ['K02575', 'K07214']
vals = []
for ko in kos:
    # Read with .get(): indexing the nested defaultdict with [] would insert
    # an empty KO / a zero-valued sample and silently grow the data that
    # later cells iterate over.
    d = dict_ko_sam_estcounts.get(ko, {})
    v = [d.get(sam, 0.0) for sam in sams]
    vals.append(v)

# x: first KO in `kos`, y: second KO, one point per sample in `sams`
fig, ax = general_plot()
ax.scatter(vals[0], vals[1])
ax.set_xlabel(f"{dict_ko_name[kos[0]]}\nestcounts")
ax.set_ylabel(f"{dict_ko_name[kos[1]]}\nestcounts")
ax.set_xscale('log')
# ax.set_yscale('log')

In [609]:
kos = ['K02364', 'K07214']
vals = []
for ko in kos:
    d = dict_ko_sam_estcounts[ko]
    v = []
    for sam in sams:
        v.append(d[sam])
    vals.append(v)

fig, ax = general_plot()
ax.scatter(vals[0], vals[1])
ax.set_xlabel(f"{dict_ko_name[kos[0]]}\nestcounts")
ax.set_ylabel(f"{dict_ko_name[kos[1]]}\nestcounts")
ax.set_xscale('log')
# ax.set_yscale('log')

In [610]:
kos = ['K16267', 'K07214']
vals = []
for ko in kos:
    d = dict_ko_sam_estcounts[ko]
    v = []
    for sam in sams:
        v.append(d[sam])
    vals.append(v)

fig, ax = general_plot()
ax.scatter(vals[0], vals[1])
ax.set_xlabel(f"{dict_ko_name[kos[0]]}\nestcounts")
ax.set_ylabel(f"{dict_ko_name[kos[1]]}\nestcounts")
ax.set_xscale('log')
# ax.set_yscale('log')

Which samples have the most esterase?

In [614]:
frac_ko = np.array(koestcounts)/ np.array(tidestcounts)
sams_fracsort = [x for _, x in sorted(zip(frac_ko, sams))]
frac_ko_sort = sorted(frac_ko)
pd.DataFrame({"sams_fracsort":sams_fracsort, "frac_ko":frac_ko_sort})[-10:]

In [617]:
# KO name -> estcounts for every KO with nonzero counts in sample `s`
s = 'G3PA-UW32#3-7-3.0'
dke = {}
# `ko_` keeps the notebook-level target `ko` intact; .get() avoids inserting
# `s` with a zero count into every per-KO defaultdict.
for ko_, d in dict_ko_sam_estcounts.items():
    ec = d.get(s, 0.0)
    if ec > 0:
        dke[dict_ko_name[ko_]] = ec
dke

In [618]:
# KO name -> estcounts for every KO with nonzero counts in sample `s`
s = 'G3PA-UW32-7-3.0'
dke = {}
# `ko_` keeps the notebook-level target `ko` intact; .get() avoids inserting
# `s` with a zero count into every per-KO defaultdict.
for ko_, d in dict_ko_sam_estcounts.items():
    ec = d.get(s, 0.0)
    if ec > 0:
        dke[dict_ko_name[ko_]] = ec
dke

In [None]:
def get_meta(sn_type):
    """Read the sample-metadata CSV registered for a sample-name parse type.

    Looks up `sn_type` in the 'sn_type_parse_kallisto' column of the
    notebook-level `input_table` and reads the file named in the
    'fn_sample_metadata' column of the first matching row.

    Raises IndexError if no row of `input_table` matches `sn_type`.
    """
    is_type = input_table['sn_type_parse_kallisto'] == sn_type
    matching_fns = input_table.loc[is_type, 'fn_sample_metadata'].values
    return pd.read_csv(matching_fns[0])

def get_size_lat_depth_rep_timep(meta_sample, rnd_lat, skip=()):
    """Format descriptors from the first row of a sample's metadata.

    Parameters
    ----------
    meta_sample : table indexable by column name whose columns expose
        `.values` (e.g. a pandas DataFrame). Reads 'Filter', 'Latitude',
        'Depth', 'Replicate' and 'Datetime'; only the first row is used.
    rnd_lat : number of decimals the latitude is rounded to.
    skip : collection of field names to leave out, any of
        'size', 'lat', 'depth', 'rep', 'timep'.

    Returns
    -------
    list with the requested fields, always in the order
    size ('<float>um'), lat ('<float>deg'), depth ('<float>m'), rep,
    timep (Datetime with '/' -> '_' and each whitespace char -> '-').

    Raises
    ------
    IndexError if a requested column has no rows (no metadata row matched).
    """
    def _first(column):
        # Same exception type as a bare `.values[0]`, but it names the column
        values = meta_sample[column].values
        if len(values) == 0:
            raise IndexError(
                f"No metadata row available; cannot read column '{column}'"
            )
        return values[0]

    vals = []
    if 'size' not in skip:
        vals.append(str(float(_first('Filter'))) + 'um')
    if 'lat' not in skip:
        vals.append(str(round(float(_first('Latitude')), rnd_lat)) + 'deg')
    if 'depth' not in skip:
        vals.append(str(float(_first('Depth'))) + 'm')
    if 'rep' not in skip:
        vals.append(_first('Replicate'))
    if 'timep' not in skip:
        timep = re.sub('/','_', _first('Datetime'))
        timep = re.sub(r'\s','-', timep)
        vals.append(timep)
    return vals

def parse_fn_kallisto_sn(fn='', sn_type='', get_columns=False, rnd_lat=2):
    """Parse a kallisto output filename into sample descriptors.

    Parameters
    ----------
    fn : filename to parse.
    sn_type : naming scheme of `fn`; one of 'G1NS', 'G2NS', 'G3NS', 'G5',
        'D1', 'G1PA', 'G2PA', 'G3PA.UW', 'G3PA.diel', 'G3PA.PM'.
    get_columns : if True, ignore the other arguments and return the column
        names that correspond to the parsed values.
    rnd_lat : decimals used when rounding latitude (passed on to
        get_size_lat_depth_rep_timep).

    Returns
    -------
    list of 8 values: [assembly, sample, latitude, amendment, timepoint,
    depth, size, rep]. Fields a naming scheme does not provide stay ''.
    Every scheme except 'G5' and 'D1' looks values up in the metadata table
    returned by get_meta(sn_type).

    Raises
    ------
    ValueError if `sn_type` is not one of the configured schemes.
    """
    if get_columns:
        return ['assembly', 'sample', 'latitude','ammendment', 'timepoint', 'depth', 'size', 'rep']

    ass, sample, lat, ammend, timep, depth, size, rep = [''] * 8
    if sn_type == 'G1NS':
        splt = fn.split('.')
        ass, sm_sz, rp = splt[:3]
        sample, _ = sm_sz.split('_',1)
        # Metadata key is the sample_size token directly followed by the replicate
        alias2 = sm_sz + rp
        meta = get_meta(sn_type)
        meta_sample = meta.loc[meta['Alias2'] == alias2, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
    elif sn_type == 'G2NS':
        ass, sample, dp, sz, rp, _ = fn.split('.')
        meta = get_meta(sn_type)
        alias2 = f'{sample}.{dp}.{sz}.{rp}'
        meta_sample = meta.loc[meta['Alias2'] == alias2, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
    elif sn_type == 'G3NS':
        meta = get_meta(sn_type)
        # SampleID is the filename without its last extension
        sid = os.path.splitext(fn)[0]
        meta_sample = meta.loc[meta['SampleID'] == sid, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
        ass = re.match(r'.+NS', fn)[0]
        sample = re.search(r'UW\d+_\d', fn)[0]
    elif sn_type == 'G5':
        ass, sample, ammend, timep, rep, _ = fn.split('.')
    elif sn_type == 'D1':
        ass, sm_rep_tp, _, _ = fn.split('.')
        sample, rep, timep = sm_rep_tp.split('_')
    elif sn_type == 'G1PA':
        meta = get_meta(sn_type)
        ass, fn_ = fn.split('.', 1)
        # SampleID: everything before ".abundance", with '.' replaced by '_'
        sid = re.match(r'.+(?=\.abundance)', fn_)[0]
        sid = re.sub(r'\.','_',sid)
        meta_sample = meta.loc[meta['SampleID'] == sid, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
        sample, _ = fn_.split('_', 1)
    elif sn_type == 'G2PA':
        _, ass, sample, dp, sz, rp, _, _ = fn.split('.')
        meta = get_meta(sn_type)
        sid = f"{sample}.{dp}.{sz}.{rp}"
        meta_sample = meta.loc[meta['SampleID'] == sid, :]
        size, lat, rep = get_size_lat_depth_rep_timep(
            meta_sample,
            rnd_lat,
            skip=['depth','timep']
        )
        # Depth comes from the filename token (last character dropped), not the metadata
        depth = str(float(dp[:-1])) + 'm'
    elif sn_type == 'G3PA.UW':
        meta = get_meta(sn_type)
        ass, sample_ = fn.split('.')[:2]
        meta_sample = meta.loc[meta['Alias2'] == sample_, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
        sample_list = str(meta_sample['Alias1'].values[0]).split(' ')
        sample = sample_list[0]
        # Append a '#<n>' suffix only when Alias1 has a second token;
        # a single-token Alias1 must not raise IndexError.
        if len(sample_list) > 1 and '#' in sample_list[1]:
            sample += sample_list[1]
    elif sn_type == 'G3PA.diel':
        ass1, ass2, sample, rp, _, _, _, _ = fn.split('.')
        ass = f'{ass1}.{ass2}'
        meta = get_meta(sn_type)
        sid = f"{ass}.{sample}.{rp}"
        meta_sample = meta.loc[meta['SampleID'] == sid, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
    elif sn_type == 'G3PA.PM':
        ass = sn_type
        sample = re.search(r'UW\d+_\d', fn)[0]
        meta = get_meta(sn_type)
        # SampleID: everything before ".unstranded"
        sid = re.match(r'.+(?=\.unstranded)', fn)[0]
        meta_sample = meta.loc[meta['SampleID'] == sid, :]
        size, lat, depth, rep, timep = get_size_lat_depth_rep_timep(meta_sample, rnd_lat)
    else:
        raise ValueError(
            f"""
                Sample name parse type {sn_type} not configured or not provided 
                (sn_type_parse_kallisto column in file table)
                """
        )
    return [ass, sample, lat, ammend, timep, depth, size, rep]


In [737]:
input_table_fn = 'file_table.240210.kofam_filt.csv'
input_table = pd.read_csv(input_table_fn, keep_default_na=False)
# fn = 'G3.UW.NS.UW40_2.7m.0_2um.A.tsv'
# fn = 'G3.UW.NS.UW40_1.7m.3um.B.tsv'
# sn_type = 'G3NS'
# fn = 'G1NS.S13C1_3um.B.tsv'
# sn_type = 'G1NS'
# fn = 'G2NS.S18C1.15m.3um.B.tsv'
# sn_type = 'G2NS'
fn = 'G1PA.S10_0.2umA.abundance.tsv'
sn_type = 'G1PA'
# fn = 'G2PA.G2PA.S18C1.15m.3um.C.abundance.tsv'
# sn_type = 'G2PA'
# fn = 'G3PA.UW9.unstranded.abundance.tsv'
# sn_type = 'G3PA.UW'
# fn = 'G3PA.diel.S4C8.C.unstranded.abundance.tsv.gz'
# sn_type = 'G3PA.diel'
# fn = 'G3.UW.PA.UW42_1.7m_PM.3um.C.unstranded.abundance.tsv.gz'
# sn_type = 'G3PA.PM'
dict(zip(parse_fn_kallisto_sn(get_columns=True), parse_fn_kallisto_sn(fn, sn_type)))

In [688]:
datetime = '4/16/19 13:39'
datetime = re.sub('/','_', datetime)
datetime = re.sub(r'\s','-', datetime)
datetime

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only bacteria

### Write tensor

Get tensor filename

In [779]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)

Get header

In [780]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [781]:
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_bact.csv'

Get a list of taxa to subset

In [811]:
# Species, taxid, nkos
nodes_sub_bact = [
    "Bacteria,2,334",
    "Bacteroidota,976,277",
    "Pseudomonadota,1224,282",
    "Alphaproteobacteria,28211,270",
    "Gammaproteobacteria,1236,316"
]

taxa_sub = []
# for t in nodes_sub_01:
for t in nodes_sub_bact:
    _, tid, _ = t.split(',')
    taxa_sub.append(tid)

taxa_sub = set(taxa_sub)
taxa_sub

In [812]:
ncbi.get_taxid_translator(taxa_sub)

Write rows to tidytable

In [783]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

Tensor shape

In [791]:
shps = {"KO": 254, "taxon_trim": 5, "sample_replicate_id": 474}
f"{np.prod(list(shps.values())):,d}"

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only Ochrophyta

### Write tensor

Get tensor filename

In [None]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)

Get header

In [None]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [799]:
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_ochrophyta.csv'

Get a list of taxa to subset

In [813]:
# # Species, taxid, nkos
# nodes_sub_bact = [
#     "Bacteria,2,334",
#     "Bacteroidota,976,277",
#     "Pseudomonadota,1224,282",
#     "Alphaproteobacteria,28211,270",
#     "Gammaproteobacteria,1236,316"
# ]

# taxa_sub = []
# # for t in nodes_sub_01:
# for t in nodes_sub_bact:
#     _, tid, _ = t.split(',')
#     taxa_sub.append(tid)

# Collect the names of every node below the grouping taxon in `tree`, plus the
# grouping taxon itself, as the set of taxids to keep.
# NOTE(review): `tree` is also rebound to a TreeTrim object in the
# "Trim tree with subset merge" section; this cell indexes `tree[...]` and calls
# `.descendants()`, so it presumably needs the topology tree — confirm which
# object `tree` refers to on a fresh Restart & Run All.
tax_groupby = '2696291'  # Ochrophyta
taxa_sub = [n.name for n in tree[tax_groupby].descendants()]
taxa_sub.append(tax_groupby)

taxa_sub = set(taxa_sub)
taxa_sub

In [814]:
ncbi.get_taxid_translator(taxa_sub)

Write rows to tidytable

In [801]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

Tensor shape

In [803]:
shps = {"KO": 177, "taxon_trim": 5, "sample_replicate_id": 531}
f"{np.prod(list(shps.values())):,d}"

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only Dinophyceae

### Write tensor

Get tensor filename

In [804]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)

Get header

In [805]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [806]:
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_dinophyceae.csv'

Get a list of taxa to subset

In [807]:
# # Species, taxid, nkos
# nodes_sub_bact = [
#     "Bacteria,2,334",
#     "Bacteroidota,976,277",
#     "Pseudomonadota,1224,282",
#     "Alphaproteobacteria,28211,270",
#     "Gammaproteobacteria,1236,316"
# ]

# taxa_sub = []
# # for t in nodes_sub_01:
# for t in nodes_sub_bact:
#     _, tid, _ = t.split(',')
#     taxa_sub.append(tid)

tax_groupby = '2864'  # Dinophyceae
taxa_sub = [n.name for n in tree[tax_groupby].descendants()]
taxa_sub.append(tax_groupby)

taxa_sub = set(taxa_sub)
taxa_sub

Write rows to tidytable

In [808]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

Tensor shape

In [809]:
shps = {"KO": 243, "taxon_trim": 6, "sample_replicate_id": 531}
f"{np.prod(list(shps.values())):,d}"

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only Viridiplantae, Prymnesiophyceae

### Write tensor

Get tensor filename

In [815]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)

Get header

In [816]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [817]:
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_viridiplantae_prymnesiophyceae.csv'

Get a list of taxa to subset

In [821]:
# # Species, taxid, nkos
# nodes_sub_bact = [
#     "Bacteria,2,334",
#     "Bacteroidota,976,277",
#     "Pseudomonadota,1224,282",
#     "Alphaproteobacteria,28211,270",
#     "Gammaproteobacteria,1236,316"
# ]

# taxa_sub = []
# # for t in nodes_sub_01:
# for t in nodes_sub_bact:
#     _, tid, _ = t.split(',')
#     taxa_sub.append(tid)

tax_groupby = ['33090','2608131']  # Viridiplantae, Prymnesiophyceae
taxa_sub = []
for t in tax_groupby:
    taxa_sub += [n.name for n in tree[t].descendants()]
    taxa_sub.append(t)

taxa_sub = set(taxa_sub)
taxa_sub

In [825]:
ncbi.get_taxid_translator(taxa_sub)

Write rows to tidytable

In [822]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

Tensor shape

In [824]:
shps = {"KO": 203, "taxon_trim": 6, "sample_replicate_id": 531}
f"{np.prod(list(shps.values())):,d}"

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only Opisthokonta

### Write tensor

Get tensor filename

In [842]:
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)

Get header

In [843]:
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [844]:
# Output path for the Opisthokonta-only subset of the trimmed tensor table.
# NOTE(review): the filename ends in "sub_taxa_viridiplantae_opisthokonta" although this
# section subsets on Opisthokonta (taxid 33154) only — looks like a leftover from the
# previous section's filename; confirm before renaming, since files may already exist.
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_viridiplantae_opisthokonta.csv'

Get a list of taxa to subset

In [845]:
# # Species, taxid, nkos
# nodes_sub_bact = [
#     "Bacteria,2,334",
#     "Bacteroidota,976,277",
#     "Pseudomonadota,1224,282",
#     "Alphaproteobacteria,28211,270",
#     "Gammaproteobacteria,1236,316"
# ]

# taxa_sub = []
# # for t in nodes_sub_01:
# for t in nodes_sub_bact:
#     _, tid, _ = t.split(',')
#     taxa_sub.append(tid)

tax_groupby = ['33154']  # Opisthokonta
taxa_sub = []
for t in tax_groupby:
    taxa_sub += [n.name for n in tree[t].descendants()]
    taxa_sub.append(t)

taxa_sub = set(taxa_sub)
taxa_sub

In [846]:
ncbi.get_taxid_translator(taxa_sub)

Write rows to tidytable

In [847]:
i = 0
with open(fn_tensor, 'r') as fr, open(fn_tensor_sub, 'w') as fw:
    row = next(fr)
    fw.write(row)
    for row in fr:
        sample, ko, tax, estcounts, rep = row.split(",")
        if tax in taxa_sub:
            i += 1
            fw.write(row)

i

Tensor shape

In [832]:
# Hand-entered size of each tensor mode; the displayed value is the product of
# the three sizes, formatted with thousands separators.
# NOTE(review): these sizes are typed in, not derived from the file written
# in this section — confirm they match the subset.
shps = {
    "KO": 227,
    "taxon_trim": 4,
    "sample_replicate_id": 531,
}
format(np.prod(list(shps.values())), ",d")

## Subset tree_trim_thresh_60_minsamples_20_minbatches_4 to only high-level groups and Opisthokonta

### Write tensor

Get tensor filename

In [836]:
# Input: the tidy tensor CSV for tree_trim_thresh_60_minsamples_20_minbatches_4 (absolute path).
fn_tensor = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4.csv'
os.path.exists(fn_tensor)  # cell output shows whether the input file is present

Get header

In [837]:
# Peek at the first two lines of the input (the header line and the line after it).
# Each line keeps its own newline, so print() leaves a blank line after each.
with open(fn_tensor, 'r') as f:
    print(next(f))
    print(next(f))


Output filename

In [838]:
# Output CSV for the high-level + Opisthokonta subset, in the same directory as the input.
fn_tensor_sub = '/scratch/bgrodner/iron_ko_contigs/metat_search_results/dicts_iron_KO_contig/tree_trim/merge_all/iron_KOs.txt-barnacle_tensor_tidy-tree_trim_thresh_60_minsamples_20_minbatches_4-sub_taxa_highlevel_opisthokonta.csv'

Get a list of taxa to subset

In [839]:
# Hand-picked high-level nodes, one "name,taxid,nkos" string per node.
nodes_sub_high = [
    "cellular organisms,131567,351",
    "Eukaryota,2759,342",
    "Sar,2698737,287",
]

# Start with the taxid (second field) of each high-level node. The field is
# indexed rather than unpacked into `_`, because assigning `_` in the kernel
# namespace interferes with IPython's last-output variable.
taxa_sub = set()
for t in nodes_sub_high:
    tid = t.split(',')[1]
    taxa_sub.add(tid)

# Add every descendant of each root in tax_groupby, plus the root itself.
# Assumes node names in `tree` are taxid strings like the roots — TODO confirm.
tax_groupby = ['33154']  # Opisthokonta
for t in tax_groupby:
    taxa_sub.update(n.name for n in tree[t].descendants())
    taxa_sub.add(t)

taxa_sub

In [840]:
# Look up the selected taxids with the NCBITaxa instance so the subset can be checked by eye.
ncbi.get_taxid_translator(taxa_sub)

Write rows to tidytable

In [841]:
# Stream the input once: copy the first line through, then keep only the rows
# whose third field is in taxa_sub. `i` ends up as the number of data rows written.
i = 0
with open(fn_tensor, 'r') as fr:
    with open(fn_tensor_sub, 'w') as fw:
        row = next(fr)
        fw.write(row)
        for row in fr:
            sample, ko, tax, estcounts, rep = row.split(",")
            if tax not in taxa_sub:
                continue  # taxon outside the subset: drop the row
            i += 1
            fw.write(row)

i

Tensor shape

In [None]:
# Hand-entered size of each tensor mode; the displayed value is the product of
# the three sizes, formatted with thousands separators.
# NOTE(review): these sizes are typed in, not derived from the file written
# in this section — confirm they match the subset.
shps = {
    "KO": 227,
    "taxon_trim": 4,
    "sample_replicate_id": 531,
}
format(np.prod(list(shps.values())), ",d")

In [None]:
# Scratch arithmetic (the difference is 323,835). The notebook does not record
# what the two numbers are — TODO label them or remove this cell.
28164865-27841030

In [None]:
# NOTE: the sample input 'abcd_i1' ends in "_i1", not "_<digits>", so the pattern
# does not match and the string comes back unchanged; an input such as 'abcd_1'
# would give 'abcd'.
re.split(r'_\d+$','abcd_i1')[0]  # Remove "_{digit}" aa reading frame from the contig name

In [794]:
# NOTE: the sample input 'abcd_i1' ends in "_i1", not "_<digits>", so the pattern
# does not match and the string comes back unchanged; an input such as 'abcd_1'
# would give 'abcd'.
re.split(r'_\d+$','abcd_i1')[0]  # Remove "_{digit}" aa reading frame from the contig name

In [849]:
# Show the metadata rows for sample 'S02C1' (boolean-mask selection via .loc).
metadata.loc[metadata['sample'] == 'S02C1']