# Isolate PA selection of 3um filter

Given a table with sample x KO populated with normalized counts, make interesting plots

## Setup

In [1]:
import os 
import re
import glob
import math
import umap
import numpy as np
import pandas as pd
from time import time
from scipy import stats
from collections import * 
from sklearn import cluster
from sklearn import decomposition
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches

In [2]:
os.getcwd()

In [3]:
# NOTE(review): absolute, machine-specific path. The relative paths used by
# later cells resolve against this directory once the chdir has run.
workdir = '/scratch/bgrodner/iron_ko_contigs'
os.chdir(workdir)


In [288]:
os.getcwd()

In [5]:
os.listdir()

Plotting

In [15]:
def general_plot(
    xlabel="", ylabel="", ft=12, dims=(5, 3), col="k", lw=1, pad=0, tr_spines=True
):
    """Create a single-axes figure with uniform spine, tick and label styling.

    Parameters
    ----------
    xlabel, ylabel : axis label text.
    ft : font size used for tick labels and axis labels.
    dims : (width, height) passed to ``figsize``.
    col : color applied to spines, ticks, tick labels and axis labels.
    lw : line width applied to every spine.
    pad : padding handed to ``tight_layout``.
    tr_spines : when True the top/right spines are kept and colored ``col``;
        when False they are hidden.

    Returns
    -------
    (fig, ax) : the created figure and axes.
    """
    width, height = dims[0], dims[1]
    fig, ax = plt.subplots(figsize=(width, height), tight_layout={"pad": pad})

    for spine in ax.spines.values():
        spine.set_linewidth(lw)

    # Top/right spines are either recolored or removed, depending on tr_spines.
    for side in ("top", "right"):
        if tr_spines:
            ax.spines[side].set_color(col)
        else:
            ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_color(col)

    ax.tick_params(direction="in", labelsize=ft, color=col, labelcolor=col)
    ax.set_xlabel(xlabel, fontsize=ft, color=col)
    ax.set_ylabel(ylabel, fontsize=ft, color=col)
    # Fully transparent axes background.
    ax.patch.set_alpha(0)
    return (fig, ax)

def plot_umap(
    embedding,
    figsize=(10, 10),
    markersize=10,
    alpha=0.5,
    colors="k",
    xticks=[],
    yticks=[],
    markerstyle='o',
    cmap_name='tab20',
    cl_lab=False
):
    """Scatter-plot a two-column embedding on axes created by ``general_plot``.

    Parameters
    ----------
    embedding : 2-D array; column 0 is plotted on x and column 1 on y.
    figsize : (width, height) forwarded to ``general_plot`` as ``dims``.
    markersize, alpha : forwarded to ``Axes.scatter`` (``s`` and ``alpha``).
    colors : a single color spec, or one color per point.
    xticks, yticks : explicit tick positions; an empty sequence leaves the
        default ticks in place.
    markerstyle : a single marker string (all points drawn in one scatter
        call), or a sequence with one marker per point (points drawn one by one).
    cmap_name : colormap name; only passed to scatter in the single-marker branch.
    cl_lab : currently unused.

    Returns
    -------
    (fig, ax) : the created figure and axes.
    """
    fig, ax = general_plot(dims=figsize)
    if isinstance(markerstyle, str):
        ax.scatter(
            embedding[:, 0],
            embedding[:, 1],
            s=markersize,
            alpha=alpha,
            c=colors,
            edgecolors="none",
            marker=markerstyle,
            cmap=cmap_name
        )
    else:
        # A single color string (e.g. the default "k") must be repeated for
        # every point: zipping the bare string would iterate over its
        # characters and silently truncate the plot to len(colors) points.
        if isinstance(colors, str):
            point_colors = [colors] * len(embedding)
        else:
            point_colors = colors
        # Matplotlib accepts only one marker per scatter call, so each point
        # is drawn separately with its own marker and color.
        for e0, e1, c, m in zip(
            embedding[:, 0],
            embedding[:, 1],
            point_colors,
            markerstyle
        ):
            ax.scatter(
                e0,
                e1,
                s=markersize,
                alpha=alpha,
                c=c,
                edgecolors="none",
                marker=m
            )
    ax.set_aspect("equal")
    if len(xticks) > 0:
        ax.set_xticks(xticks)
    if len(yticks) > 0:
        ax.set_yticks(yticks)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    return fig, ax


## Load table and inspect

In [109]:
# Path (relative to the working directory) of the combined sample x KO table.
dir_combined = 'metat_search_results/dicts_iron_KO_contig/dicts_contig_count/tables_norm_count/combined'
fn_table_sample_ko_norm_counts = f'{dir_combined}/iron_KOs.txt-table_samples_KOs_norm_count.csv'

# Load the table and show its (rows, columns) shape as the cell output.
table = pd.read_csv(fn_table_sample_ko_norm_counts)
table.shape

In [7]:
table.columns[:11]

In [8]:
table[:3]

### Get KO dict

Get dataframe

In [9]:
ko_fn = "ko00001.json"
kegg_tree = pd.read_json(ko_fn)

# Flatten the nested "children" hierarchy into one [A, B, C, D] record per
# fourth-level entry. Third-level nodes without a "children" key contribute nothing.
database = [
    [node_a["name"], node_b["name"], node_c["name"], node_d["name"]]
    for node_a in kegg_tree["children"]
    for node_b in node_a["children"]
    for node_c in node_b["children"]
    for node_d in node_c.get("children", [])
]
df_kegg = pd.DataFrame(database, columns=["Level_A", "Level_B", "Level_C", "Level_D"])
df_kegg.shape


In [10]:
ld = df_kegg['Level_D'].values
ld[:5]

In [11]:
# Map the leading word of each Level_D entry (the KO identifier) to the full
# entry text; when an identifier repeats, the last entry wins.
ko_id_pattern = re.compile(r"^\w+")
dict_ko_name = {ko_id_pattern.search(name)[0]: name for name in ld}

## Inspect G1PA 3um

Subset table

In [110]:
# Boolean row masks: assembly 'G1PA' and size '3um'.
bool_ass = table['assembly'] == 'G1PA'
bool_size = table['size'] == '3um'
# Keep only the rows that satisfy both masks; the shape is the cell output.
table_sub = table[bool_ass & bool_size]
table_sub.shape

Number of nonzero samples for each KO

In [111]:
ko_list = table.columns[10:]
ko_list[:5]

In [63]:
filt_nsam = 15

# For every KO column, count the samples in the subset with a nonzero value.
n_nonzero = [table_sub[ko].values.astype(bool).sum() for ko in ko_list]

# Rank plot of the counts, with the filter threshold as a red horizontal line.
fig, ax = general_plot()
ko_rank = np.arange(len(n_nonzero))
ax.scatter(ko_rank, np.sort(n_nonzero))
xlims = ax.get_xlim()
ax.plot(xlims, [filt_nsam, filt_nsam], 'r')
ax.set_ylabel('Number of nonzero samples')
ax.set_xlabel('KO')

Number of nonzero KOs for each sample

In [21]:
# Value matrix of the subset: everything from column 10 onward, one row per sample.
table_sub_vals = table_sub.iloc[:,10:].values
# Number of nonzero KO entries in each sample row.
n_sam_nonzero = [sample_row.astype(bool).sum() for sample_row in table_sub_vals]

# Rank plot of the per-sample counts.
fig, ax = general_plot()
sample_rank = np.arange(len(n_sam_nonzero))
ax.scatter(sample_rank, np.sort(n_sam_nonzero))
ax.set_ylabel('Number of nonzero KOs')
ax.set_xlabel('Sample')

Filter for genes present in many samples

In [25]:
# Keep KOs that are nonzero in strictly more than `filt_nsam` samples
# (a KO with exactly `filt_nsam` nonzero samples is dropped).
bool_ko = np.array(n_nonzero) > filt_nsam
ko_list_filt = np.array(ko_list)[bool_ko]
# Apply the same column mask to the raw value matrix.
table_sub_vals_filt = table_sub_vals[:,bool_ko]

### Correlation between log fraction of total counts

In [26]:
# KO identifier -> array of that KO's values across the samples of table_sub.
dict_ko_counts = {ko: table_sub[ko].values for ko in ko_list_filt}

In [35]:
total_counts_sub = table_sub['sample_sum_counts'].values

nkos = len(ko_list_filt)

# Fraction of each sample's total counts per KO, and the matching nonzero mask.
# Both are computed once per KO rather than once per (i, j) pair.
list_frac = [dict_ko_counts[ko] / total_counts_sub for ko in ko_list_filt]
list_nonzero = [frac.astype(bool) for frac in list_frac]

# arr_corr_rel[i, j] = [Spearman statistic, p-value]. Only the strict upper
# triangle (i < j) is filled; the diagonal and lower triangle stay 0.
arr_corr_rel = np.zeros((nkos, nkos, 2))
for i in range(nkos):
    ci = list_frac[i]
    booli = list_nonzero[i]
    for j in range(i + 1, nkos):
        cj = list_frac[j]
        # Correlate only the samples in which both KOs are nonzero, so the
        # logarithm is taken of positive fractions only.
        bools = booli & list_nonzero[j]
        if bools.any():
            res = stats.spearmanr(np.log(ci[bools]), np.log(cj[bools]))
            stat = res.statistic
            pval = res.pvalue
        else:
            # No shared nonzero samples: record "no correlation".
            stat = 0
            pval = 1
        arr_corr_rel[i, j, :] = [stat, pval]

Get both corners of array

In [36]:
# Replace NaN entries with 0 so the matrix is fully numeric.
arr_cor_rel_nonan = np.nan_to_num(arr_corr_rel)
# Layer 0 of the last axis holds the correlation statistic (layer 1 the p-value).
arr_corr_rel_stat = arr_cor_rel_nonan[:,:,0]
# Only the upper triangle (i < j) was filled, so adding the transpose mirrors
# it into the lower triangle; the diagonal stays 0.
arr_corr_rel_stat_full = arr_corr_rel_stat + np.transpose(arr_corr_rel_stat)

Get linkage

In [37]:
# NOTE(review): the square correlation matrix is passed as-is, so linkage treats
# each row as an observation vector (scipy defaults: method='single',
# metric='euclidean') rather than as a precomputed distance matrix. Confirm
# that this is intended.
lnk = hierarchy.linkage(arr_corr_rel_stat_full)
# Wide figure so the dendrogram leaves stay legible.
fig = plt.figure(figsize=(40,10))
dn = hierarchy.dendrogram(lnk)
plt.show()

Get linkage order

In [38]:
ko_order = hierarchy.leaves_list(lnk)

Map index to KO

In [115]:
# Position in the dendrogram leaf order -> full KO name. `ogidx` is the KO's
# original index in ko_list_filt.
dict_idx_koname = {
    idx: dict_ko_name[ko_list_filt[ogidx]]
    for idx, ogidx in enumerate(ko_order)
}

Reorder matrix rows

In [116]:
# Reorder the rows into dendrogram leaf order; integer-array indexing returns
# a new array, so the full matrix is left untouched.
arr_new = arr_corr_rel_stat_full[ko_order]

Reorder matrix columns

In [117]:
# Reorder the columns by taking the transposed rows in leaf order. The result
# is the transpose of the fully reordered matrix.
arr_corr_rel_stat_full_order = arr_new.T[ko_order]

Plot clustered matrix

In [118]:
plt.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm')
plt.colorbar()

New cluster

In [119]:
tselect = 1.8

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [120]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [65, 76]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [121]:
tselect = 2.2

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [122]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [52, 95]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [123]:
tselect = 2.1

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [124]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [135, 145]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [125]:
tselect = 2.1

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [222]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [145, 160]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

### Plot clusters and view samples

In [127]:
# Hard-coded KO identifiers for the two groups compared in the cells below.
# NOTE(review): presumably copied by hand from the printed cluster listings;
# the lists are not derived programmatically, so re-check them if the
# clustering inputs change.
cluster_0 = [
    'K01623',
    'K02575',
    'K00266',
    'K03320',
    'K03737',
    'K08905',
    'K02717',
    'K02639',
    'K01624',
    'K02694',
]

cluster_1 = [
    'K14716',
    'K18245',
    'K13861',
    'K16627',
    'K00909',
    'K14688',
    'K14719',
    'K22070',
    'K14634',
]

Correlations within cluster 0

In [156]:
# Draw two distinct KOs at random from cluster 0 (unseeded: the pair changes on every run).
kos_ex = np.random.choice(cluster_0, 2, replace=False)

# Fraction of each sample's total counts contributed by each KO.
x = table_sub[kos_ex[0]].values / total_counts_sub
y = table_sub[kos_ex[1]].values / total_counts_sub

for ko_drawn in kos_ex[:2]:
    print(dict_ko_name[ko_drawn])

# Spearman correlation over the samples in which both KOs are nonzero.
booli = x.astype(bool)
boolj = y.astype(bool)
bools = booli * boolj
if not sum(bools):
    stat = 0
    pval = 1
else:
    cib = x[bools]
    cjb = y[bools]
    cibl = np.log(cib)
    cjbl = np.log(cjb)
    res = stats.spearmanr(cibl, cjbl)
    stat = res.statistic
    pval = res.pvalue

print(stat)
print(pval)

# Log-log scatter of the two fractions across samples.
plt.scatter(x, y, alpha=0.5)
plt.xlabel(f'Fraction of total counts\n{dict_ko_name[kos_ex[0]]}', fontsize=8)
plt.ylabel(f'Fraction of total counts\n{dict_ko_name[kos_ex[1]]}', fontsize=8)
plt.xscale('log')
plt.yscale('log')


Correlations within cluster 1

In [188]:
# Draw two distinct KOs at random from cluster 1 (unseeded: the pair changes on every run).
kos_ex = np.random.choice(cluster_1, 2, replace=False)

# Fraction of each sample's total counts contributed by each KO.
x = table_sub[kos_ex[0]].values / total_counts_sub
y = table_sub[kos_ex[1]].values / total_counts_sub

for ko_drawn in kos_ex[:2]:
    print(dict_ko_name[ko_drawn])

# Spearman correlation over the samples in which both KOs are nonzero.
booli = x.astype(bool)
boolj = y.astype(bool)
bools = booli * boolj
if not sum(bools):
    stat = 0
    pval = 1
else:
    cib = x[bools]
    cjb = y[bools]
    cibl = np.log(cib)
    cjbl = np.log(cjb)
    res = stats.spearmanr(cibl, cjbl)
    stat = res.statistic
    pval = res.pvalue

print(stat)
print(pval)

# Log-log scatter of the two fractions across samples.
plt.scatter(x, y, alpha=0.5)
plt.xlabel(f'Fraction of total counts\n{dict_ko_name[kos_ex[0]]}', fontsize=8)
plt.ylabel(f'Fraction of total counts\n{dict_ko_name[kos_ex[1]]}', fontsize=8)
plt.xscale('log')
plt.yscale('log')


Correlations between clusters

In [197]:
# Draw one KO at random from each cluster (unseeded: the pair changes on every run).
ko_0 = np.random.choice(cluster_0, 1, replace=False)[0]
ko_1 = np.random.choice(cluster_1, 1, replace=False)[0]

# Fraction of each sample's total counts contributed by each KO.
x = table_sub[ko_0].values / total_counts_sub
y = table_sub[ko_1].values / total_counts_sub

for ko_drawn in (ko_0, ko_1):
    print(dict_ko_name[ko_drawn])

# Spearman correlation over the samples in which both KOs are nonzero.
booli = x.astype(bool)
boolj = y.astype(bool)
bools = booli * boolj
if not sum(bools):
    stat = 0
    pval = 1
else:
    cib = x[bools]
    cjb = y[bools]
    cibl = np.log(cib)
    cjbl = np.log(cjb)
    res = stats.spearmanr(cibl, cjbl)
    stat = res.statistic
    pval = res.pvalue

print(stat)
print(pval)

# Log-log scatter of the two fractions across samples.
plt.scatter(x, y, alpha=0.5)
plt.xlabel(f'Fraction of total counts\n{dict_ko_name[ko_0]}', fontsize=8)
plt.ylabel(f'Fraction of total counts\n{dict_ko_name[ko_1]}', fontsize=8)
plt.xscale('log')
plt.yscale('log')


Ordered samples

In [None]:
# Fractions of each sample's total counts for the KOs of both clusters,
# re-attached to the first 10 columns of the subset table.
rel_fractions = table_sub[cluster_0 + cluster_1] / total_counts_sub[:,None]
table_nms = table_sub.iloc[:,:10]
table_sub_sub_rel = pd.concat([table_nms, rel_fractions], axis=1)

# Per-sample mean of the log fractions within each cluster. Non-finite logs
# (e.g. log of a zero fraction) are masked out before averaging.
for mean_col, cluster_kos in (('mean_cl_0', cluster_0), ('mean_cl_1', cluster_1)):
    log_fractions = np.log(table_sub_sub_rel[cluster_kos])
    table_sub_sub_rel[mean_col] = np.ma.masked_invalid(log_fractions).mean(axis=1)

# Order samples by how much cluster 0 exceeds cluster 1 on the log scale.
table_sub_sub_rel['diff_mean_cl'] = table_sub_sub_rel['mean_cl_0'] - table_sub_sub_rel['mean_cl_1']
table_sub_sub_rel_sort = table_sub_sub_rel.sort_values(by='diff_mean_cl', ascending=True)
table_sub_sub_rel_sort[:3]

Plot ordered samples

In [198]:
dims = (10, 4)
ft = 5
fig, ax = plt.subplots(figsize=dims)

# One entry per cluster: (KO list, color, marker, legend label).
dict_plot = [
    [cluster_0, 'k', 'o', 'cluster 1'],
    [cluster_1, 'r', 'x', 'cluster 2'],
]
x = np.arange(table_sub_sub_rel_sort.shape[0])
for kos, color, marker, label in dict_plot:
    # Rows = KOs, columns = samples in sorted order.
    vals = table_sub_sub_rel_sort[kos].values.T
    # Repeat the sample positions once per KO so x and y have matching shapes.
    x_ = np.tile(x, (vals.shape[0], 1))
    ax.scatter(x_, vals, color=color, marker=marker, label=label)
_ = ax.legend()
ax.set_yscale('log')
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(
    table_sub_sub_rel_sort['fn_sample_counts'],
    rotation=45,
    size=ft,
    ha='right',
    rotation_mode='anchor',
)
ax.set_ylabel('Fraction of sample counts')
ax.set_xlabel('Sample')


Get latitudes

In [199]:
samples = table.loc[bool_ass, 'sample'].values
np.unique(samples)

Load metadata table

In [200]:
# NOTE(review): the variables are named G3PA/diel but the path points at
# gradients1 / g1_station_pa_metat. Confirm which dataset this metadata
# belongs to before relying on the names.
dir_diel = '../repo-armbrust-metat/gradients1/g1_station_pa_metat'
fn_G3PA_diel_metadata = f'{dir_diel}/sample_metadata.csv'

# Load the metadata and show its shape together with the first three rows.
G3PA_diel_metadata = pd.read_csv(fn_G3PA_diel_metadata)
G3PA_diel_metadata.shape, G3PA_diel_metadata[:3]

Build sample latitude dict

In [202]:
# Sample identifier (the part of Alias2 before the first underscore) -> Latitude.
dict_sample_lat = {}
for row_idx, meta_row in G3PA_diel_metadata.iterrows():
    alias = meta_row.Alias2
    # Rows whose alias is not a string (presumably missing values) are skipped.
    if not isinstance(alias, str):
        continue
    sample_id = alias.split('_')[0]
    dict_sample_lat[sample_id] = meta_row.Latitude
dict_sample_lat

Plot ordered samples with latitude values

In [207]:
dims = (10, 4)
ft = 12
fig, ax = plt.subplots(figsize=dims)

# One entry per cluster: (KO list, color, marker, legend label).
dict_plot = [
    [cluster_0, 'k', 'o', 'cluster 1'],
    [cluster_1, 'r', 'x', 'cluster 2'],
]
x = np.arange(table_sub_sub_rel_sort.shape[0])
for l, c, m, n in dict_plot:
    # Rows = KOs, columns = samples in sorted order.
    vals = table_sub_sub_rel_sort[l].values.T
    # Repeat the sample positions once per KO so x and y have matching shapes.
    x_ = np.vstack([x[None,:]]*vals.shape[0])
    ax.scatter(
        x_, vals,
        color=c,
        marker=m,
        label=n
    )
_ = ax.legend()
ax.set_yscale('log')
_ = ax.set_xticks(x)
# Label each sample position with its latitude (rounded to 3 decimals)
# instead of the sample file name.
lats = [round(dict_sample_lat[s],3) for s in table_sub_sub_rel_sort['sample'].values]
_ = ax.set_xticklabels(lats, rotation=45, size=ft, ha='right', rotation_mode='anchor')
ax.set_ylabel('Fraction of sample counts')
ax.set_xlabel('Latitude')


Plot ordered by latitudes

In [221]:
dims = (4, 4)
ft = 8
alpha = 0.35
fig, ax = plt.subplots(figsize=dims)

# One entry per cluster: (KO list, color, marker, legend label).
dict_plot = [
    [cluster_0, 'k', 'o', 'cluster 1'],
    [cluster_1, 'r', 'x', 'cluster 2'],
]
# Latitude of every sample (rounded to 3 decimals), in table order.
lats = [round(dict_sample_lat[s],3) for s in table_sub_sub_rel['sample'].values]
for kos, color, marker, label in dict_plot:
    # Rows = KOs, columns = samples.
    vals = table_sub_sub_rel[kos].values.T
    x_ = np.vstack([lats] * vals.shape[0])
    ax.scatter(x_, vals, color=color, marker=marker, label=label, alpha=alpha)
    # Overlay the per-sample mean across the cluster's KOs, ordered by latitude.
    lat_sort = sorted(lats)
    sample_means = np.mean(vals, axis=0)
    mval_sort = [mean_val for _, mean_val in sorted(zip(lats, sample_means))]
    ax.plot(lat_sort, mval_sort, color=color)
_ = ax.legend()
ax.set_yscale('log')
_ = ax.set_xticks(np.unique(lats))
xlab = ax.get_xticklabels()
_ = ax.set_xticklabels(xlab, rotation=45, size=ft, ha='right', rotation_mode='anchor')
ax.set_ylabel('Fraction of sample counts')
ax.set_xlabel('Latitude')


## Inspect G1NS 3um

Subset table

In [223]:
# Boolean row masks: assembly 'G1NS' and size '3um'.
# These rebind bool_ass / bool_size / table_sub from the G1PA section above.
bool_ass = table['assembly'] == 'G1NS'
bool_size = table['size'] == '3um'
# Keep only the rows that satisfy both masks; the shape is the cell output.
table_sub = table[bool_ass & bool_size]
table_sub.shape

Number of nonzero samples for each KO

In [224]:
ko_list = table.columns[10:]
ko_list[:5]

In [225]:
filt_nsam = 15

# For every KO column, count the samples in the subset with a nonzero value.
n_nonzero = [table_sub[ko].values.astype(bool).sum() for ko in ko_list]

# Rank plot of the counts, with the filter threshold as a red horizontal line.
fig, ax = general_plot()
ko_rank = np.arange(len(n_nonzero))
ax.scatter(ko_rank, np.sort(n_nonzero))
xlims = ax.get_xlim()
ax.plot(xlims, [filt_nsam, filt_nsam], 'r')
ax.set_ylabel('Number of nonzero samples')
ax.set_xlabel('KO')

Number of nonzero KOs for each sample

In [226]:
# Value matrix of the subset: everything from column 10 onward, one row per sample.
table_sub_vals = table_sub.iloc[:,10:].values
# Number of nonzero KO entries in each sample row.
n_sam_nonzero = [sample_row.astype(bool).sum() for sample_row in table_sub_vals]

# Rank plot of the per-sample counts.
fig, ax = general_plot()
sample_rank = np.arange(len(n_sam_nonzero))
ax.scatter(sample_rank, np.sort(n_sam_nonzero))
ax.set_ylabel('Number of nonzero KOs')
ax.set_xlabel('Sample')

Filter for genes present in many samples

In [252]:
# Keep KOs that are nonzero in strictly more than `filt_nsam` samples
# (a KO with exactly `filt_nsam` nonzero samples is dropped).
bool_ko = np.array(n_nonzero) > filt_nsam
ko_list_filt = np.array(ko_list)[bool_ko]
# Apply the same column mask to the raw value matrix and show its shape.
table_sub_vals_filt = table_sub_vals[:,bool_ko]
table_sub_vals_filt.shape

### Correlation between log fraction of total counts

In [228]:
# KO identifier -> array of that KO's values across the samples of table_sub.
dict_ko_counts = {ko: table_sub[ko].values for ko in ko_list_filt}

In [229]:
total_counts_sub = table_sub['sample_sum_counts'].values

nkos = len(ko_list_filt)

# Fraction of each sample's total counts per KO, and the matching nonzero mask.
# Both are computed once per KO rather than once per (i, j) pair.
list_frac = [dict_ko_counts[ko] / total_counts_sub for ko in ko_list_filt]
list_nonzero = [frac.astype(bool) for frac in list_frac]

# arr_corr_rel[i, j] = [Spearman statistic, p-value]. Only the strict upper
# triangle (i < j) is filled; the diagonal and lower triangle stay 0.
arr_corr_rel = np.zeros((nkos, nkos, 2))
for i in range(nkos):
    ci = list_frac[i]
    booli = list_nonzero[i]
    for j in range(i + 1, nkos):
        cj = list_frac[j]
        # Correlate only the samples in which both KOs are nonzero, so the
        # logarithm is taken of positive fractions only.
        bools = booli & list_nonzero[j]
        if bools.any():
            res = stats.spearmanr(np.log(ci[bools]), np.log(cj[bools]))
            stat = res.statistic
            pval = res.pvalue
        else:
            # No shared nonzero samples: record "no correlation".
            stat = 0
            pval = 1
        arr_corr_rel[i, j, :] = [stat, pval]

Get both corners of array

In [230]:
# Replace NaN entries with 0 so the matrix is fully numeric.
arr_cor_rel_nonan = np.nan_to_num(arr_corr_rel)
# Layer 0 of the last axis holds the correlation statistic (layer 1 the p-value).
arr_corr_rel_stat = arr_cor_rel_nonan[:,:,0]
# Only the upper triangle (i < j) was filled, so adding the transpose mirrors
# it into the lower triangle; the diagonal stays 0.
arr_corr_rel_stat_full = arr_corr_rel_stat + np.transpose(arr_corr_rel_stat)

Get linkage

In [231]:
# NOTE(review): the square correlation matrix is passed as-is, so linkage treats
# each row as an observation vector (scipy defaults: method='single',
# metric='euclidean') rather than as a precomputed distance matrix. Confirm
# that this is intended.
lnk = hierarchy.linkage(arr_corr_rel_stat_full)
# Wide figure so the dendrogram leaves stay legible.
fig = plt.figure(figsize=(40,10))
dn = hierarchy.dendrogram(lnk)
plt.show()

Get linkage order

In [232]:
ko_order = hierarchy.leaves_list(lnk)

Map index to KO

In [233]:
# Position in the dendrogram leaf order -> full KO name. `ogidx` is the KO's
# original index in ko_list_filt.
dict_idx_koname = {
    idx: dict_ko_name[ko_list_filt[ogidx]]
    for idx, ogidx in enumerate(ko_order)
}

Reorder matrix rows

In [234]:
# Reorder the rows into dendrogram leaf order; integer-array indexing returns
# a new array, so the full matrix is left untouched.
arr_new = arr_corr_rel_stat_full[ko_order]

Reorder matrix columns

In [235]:
# Reorder the columns by taking the transposed rows in leaf order. The result
# is the transpose of the fully reordered matrix.
arr_corr_rel_stat_full_order = arr_new.T[ko_order]

Plot clustered matrix

In [236]:
plt.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm')
plt.colorbar()

New cluster

In [239]:
tselect = 3

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [241]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [50, 90]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [242]:
tselect = 2.5

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [245]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [119, 140]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [246]:
tselect = 2.25

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [248]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [174, 190]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [None]:
tselect = 2.25

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [249]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [195, 205]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [None]:
tselect = 2.25

# Flat cluster label for every KO, cutting the linkage at distance `tselect`.
clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')
# Labels in dendrogram leaf order, and each distinct label in the order it first appears.
clusters_sort = [clusters[idx] for idx in ko_order]
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap with exactly one entry per cluster: tab20 is tiled
# when there are more than 20 clusters, then cut to the cluster count (not the
# KO count, which would leave unused colors in the map).
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [250]:
dims = (10, 10)

# Duplicate the relabeled cluster row so imshow can draw it as a thin color strip.
clusters_plot = np.vstack([clusters_sort_new] * 2)
n_rows, n_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.955, 0.9, 0.025]),  # cluster color strip
    1: fig.add_axes([0.05, 0.05, 0.9, 0.9]),  # clustered correlation matrix
}
ax_strip, ax_heat = axs[0], axs[1]

ax_strip.imshow(clusters_plot, cmap=cmap, aspect='auto')
ax_strip.set_axis_off()

ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
ax_heat.set_yticks([])
ax_heat.set_xticks(np.arange(25, n_cols, 25))
ax_heat.xaxis.tick_top()
ax_heat.set_ylim(n_rows, 0)
ax_heat.set_xlim(0, n_cols)
for spine in ax_heat.spines.values():
    spine.set_visible(False)

# Vertical guide lines bracketing the index range of interest.
idxs = [201, 275]
for idx in idxs:
    ax_heat.plot([idx, idx], [1, n_rows], 'k', alpha=0.5)

plt.show()

# Print the cluster label and KO name for every position in the bracketed range (inclusive).
first, last = idxs[0], idxs[-1]
for i in range(first, last + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [None]:
# Cut the linkage `lnk` into flat clusters at distance threshold `tselect`,
# list each KO's cluster in the plotting order `ko_order`, renumber the
# clusters by first appearance, and build a colormap with one colour per cluster.
tselect = 2.25

clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')

# Cluster id of every KO, in `ko_order` order.
clusters_sort = [clusters[idx] for idx in ko_order]
# Distinct cluster ids in order of first appearance (dict keys keep insertion order).
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates neighbouring clusters better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap: tile tab20 until it covers every cluster, then keep
# exactly one colour per cluster. The cut must use the number of clusters, not
# the number of KOs, otherwise the colormap has more entries than there are
# cluster values and imshow no longer maps cluster i onto the i-th colour.
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [251]:
# Ordered correlation heatmap with a thin strip of cluster colours on top,
# two vertical guide lines at the positions in `idxs`, and a printout of the
# cluster id and KO name for every position between the guides (inclusive).
dims = (10, 10)
n_heat_rows, n_heat_cols = arr_corr_rel_stat_full_order.shape

# Two identical rows so the 1-D cluster labels can be drawn as an image.
clusters_plot = np.vstack([clusters_sort_new, clusters_sort_new])

fig = plt.figure(figsize=dims)
axs = dict(enumerate(
    fig.add_axes(rect)
    for rect in ([0.05, 0.955, 0.9, 0.025], [0.05, 0.05, 0.9, 0.9])
))
strip_ax, heat_ax = axs[0], axs[1]

# Cluster colour strip.
strip_ax.imshow(clusters_plot, cmap=cmap, aspect='auto')
strip_ax.set_axis_off()

# Correlation heatmap: x ticks every 25 columns along the top, no y ticks, no frame.
heat_ax.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
heat_ax.set_yticks([])
heat_ax.set_xticks(np.arange(25, n_heat_cols, 25))
heat_ax.xaxis.tick_top()
heat_ax.set_ylim(n_heat_rows, 0)
heat_ax.set_xlim(0, n_heat_cols)
for spine in heat_ax.spines.values():
    spine.set_visible(False)

# Vertical guides marking the block under inspection.
idxs = [272, 290]
for idx in idxs:
    heat_ax.plot([idx, idx], [1, n_heat_rows], 'k', alpha=0.5)

plt.show()

# Members of the marked block, both end positions included.
first_pos, last_pos = idxs[0], idxs[-1]
for i in range(first_pos, last_pos + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

New cluster

In [None]:
# Cut the linkage `lnk` into flat clusters at distance threshold `tselect`,
# list each KO's cluster in the plotting order `ko_order`, renumber the
# clusters by first appearance, and build a colormap with one colour per cluster.
tselect = 2.25

clusters = hierarchy.fcluster(lnk, tselect, criterion='distance')

# Cluster id of every KO, in `ko_order` order.
clusters_sort = [clusters[idx] for idx in ko_order]
# Distinct cluster ids in order of first appearance (dict keys keep insertion order).
cl_sort_order = list(dict.fromkeys(clusters_sort))

# Rename so that cluster values are increasing when sorted.
# This makes it so that tab20 separates neighbouring clusters better.
dict_cl_new_cl = {cl: i + 1 for i, cl in enumerate(cl_sort_order)}
clusters_sort_new = [dict_cl_new_cl[cl] for cl in clusters_sort]

# Now make a new colormap: tile tab20 until it covers every cluster, then keep
# exactly one colour per cluster. The cut must use the number of clusters, not
# the number of KOs, otherwise the colormap has more entries than there are
# cluster values and imshow no longer maps cluster i onto the i-th colour.
cmap_list = list(plt.get_cmap('tab20').colors)
factor = math.ceil(len(cl_sort_order) / 20)
cmap_list *= factor
cmap_list = cmap_list[:len(cl_sort_order)]
cmap = pltc.ListedColormap(cmap_list)


In [253]:
# Ordered correlation heatmap with a thin strip of cluster colours on top,
# two vertical guide lines at the positions in `idxs`, and a printout of the
# cluster id and KO name for every position between the guides (inclusive).
dims = (10, 10)
n_heat_rows, n_heat_cols = arr_corr_rel_stat_full_order.shape

# Two identical rows so the 1-D cluster labels can be drawn as an image.
clusters_plot = np.vstack([clusters_sort_new, clusters_sort_new])

fig = plt.figure(figsize=dims)
axs = dict(enumerate(
    fig.add_axes(rect)
    for rect in ([0.05, 0.955, 0.9, 0.025], [0.05, 0.05, 0.9, 0.9])
))
strip_ax, heat_ax = axs[0], axs[1]

# Cluster colour strip.
strip_ax.imshow(clusters_plot, cmap=cmap, aspect='auto')
strip_ax.set_axis_off()

# Correlation heatmap: x ticks every 25 columns along the top, no y ticks, no frame.
heat_ax.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
heat_ax.set_yticks([])
heat_ax.set_xticks(np.arange(25, n_heat_cols, 25))
heat_ax.xaxis.tick_top()
heat_ax.set_ylim(n_heat_rows, 0)
heat_ax.set_xlim(0, n_heat_cols)
for spine in heat_ax.spines.values():
    spine.set_visible(False)

# Vertical guides marking the block under inspection.
idxs = [290, 323]
for idx in idxs:
    heat_ax.plot([idx, idx], [1, n_heat_rows], 'k', alpha=0.5)

plt.show()

# Members of the marked block, both end positions included.
first_pos, last_pos = idxs[0], idxs[-1]
for i in range(first_pos, last_pos + 1):
    print(f'Cluster:: {clusters_sort[i]}\tKO: {dict_idx_koname[i]}')
    

## Correlation between G1PA and NS 3um

Get values from each

In [256]:
# For each assembly, restrict `table` to its samples of the chosen size
# fraction, keep the KO columns that are non-zero in more than `filt_nsam` of
# those samples, and store (in assembly order) the kept KO ids, their
# per-sample values, and the per-sample total counts.
asss = ['G1PA', 'G1NS']
size = '3um'
filt_nsam = 15
ko_lists, dicts, total_counts = [], [], []

# Columns from position 10 onward are treated as KO columns
# (presumably the first 10 hold sample metadata; verify against `table`).
ko_list = table.columns[10:]

for ass in asss:
    # Samples belonging to this assembly and size fraction
    table_sub = table[(table['assembly'] == ass) & (table['size'] == size)]

    # Number of samples with a non-zero value, per KO
    n_nonzero = [table_sub[ko].values.astype(bool).sum() for ko in ko_list]
    bool_ko = np.array(n_nonzero) > filt_nsam
    ko_list_filt = np.array(ko_list)[bool_ko]

    # Accumulate the per-assembly results
    ko_lists.append(ko_list_filt)
    dicts.append(table_sub[ko_list_filt].to_dict(orient='list'))
    total_counts.append(table_sub['sample_sum_counts'].values)



Get correlations

In [262]:
# Pairwise Spearman correlation between every filtered KO of the first
# assembly (axis 0) and every filtered KO of the second assembly (axis 1).
# arr_corr_rel[i, j] = [statistic, p-value], computed on log relative
# abundance over the samples where both KOs are non-zero.
#
# NOTE(review): samples of the two assemblies are paired by position, so the
# two subsets must list the same samples in the same order; confirm upstream.
if len(total_counts[0]) != len(total_counts[1]):
    raise ValueError(
        'Assemblies have different numbers of samples '
        f'({len(total_counts[0])} vs {len(total_counts[1])}); '
        'cannot pair them for correlation'
    )

# Fewer than two shared samples cannot yield a defined correlation.
min_shared = 2

shape = [len(l) for l in ko_lists]
arr_corr_rel = np.zeros(shape + [2])

# Relative abundances and non-zero masks are computed once per KO instead of
# once per (i, j) pair.
rel_i = [np.array(dicts[0][ko]) / total_counts[0] for ko in ko_lists[0]]
rel_j = [np.array(dicts[1][ko]) / total_counts[1] for ko in ko_lists[1]]
nonzero_i = [c.astype(bool) for c in rel_i]
nonzero_j = [c.astype(bool) for c in rel_j]

for i, (ci, booli) in enumerate(zip(rel_i, nonzero_i)):
    for j, (cj, boolj) in enumerate(zip(rel_j, nonzero_j)):
        bools = booli & boolj
        # Fallback when the correlation is undefined: no association.
        stat, pval = 0, 1
        if bools.sum() >= min_shared:
            cibl = np.log(ci[bools])
            cjbl = np.log(cj[bools])
            res = stats.spearmanr(cibl, cjbl)
            # spearmanr yields NaN for constant input; a NaN left in the
            # matrix would make the downstream hierarchy.linkage call fail.
            if np.isfinite(res.statistic):
                stat, pval = res.statistic, res.pvalue
        arr_corr_rel[i, j, :] = [stat, pval]

Get linkage in both axes

In [263]:
# Keep only the correlation statistic (index 0 of the last axis), then cluster
# the rows and draw their dendrogram. The method and metric are scipy's
# defaults, written out explicitly.
arr_corr_rel_stat_full = arr_corr_rel[..., 0]

lnki = hierarchy.linkage(
    arr_corr_rel_stat_full, method='single', metric='euclidean'
)
fig = plt.figure(figsize=(40, 10))
dn = hierarchy.dendrogram(lnki)
plt.show()

In [264]:
# Cluster the columns of the statistic matrix (rows of its transpose) and draw
# their dendrogram. The method and metric are scipy's defaults, written out
# explicitly.
lnkj = hierarchy.linkage(
    np.transpose(arr_corr_rel_stat_full), method='single', metric='euclidean'
)
fig = plt.figure(figsize=(40, 10))
dn = hierarchy.dendrogram(lnkj)
plt.show()

Get linkage order

In [265]:
# Dendrogram leaf order for the row linkage (lnki) and the column linkage (lnkj).
ko_orderi, ko_orderj = (hierarchy.leaves_list(lnk_axis) for lnk_axis in (lnki, lnkj))

Map index to KO

In [284]:
# Position in the clustered order -> KO name, for each axis.
# `ogidx` is the KO's index in the unordered filtered list for that assembly.
dict_idi_koname = {
    idx: dict_ko_name[ko_lists[0][ogidx]]
    for idx, ogidx in enumerate(ko_orderi)
}

dict_idj_koname = {
    idx: dict_ko_name[ko_lists[1][ogidx]]
    for idx, ogidx in enumerate(ko_orderj)
}

Reorder matrix rows

In [278]:
# Stack the rows of the statistic matrix in the leaf order `ko_orderi`.
dict_idx_row = dict(enumerate(arr_corr_rel_stat_full))
arr_new = np.array([dict_idx_row[idx] for idx in ko_orderi])

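Reorder matrix columns (note: the result is transposed, so its rows are the G1NS KOs in `ko_orderj` order and its columns are the G1PA KOs in `ko_orderi` order)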

In [279]:
# Take the columns of `arr_new` in the leaf order `ko_orderj` and stack them as
# ROWS. The result is therefore transposed relative to `arr_new`: axis 0 follows
# `ko_orderj` and axis 1 follows the row order already applied to `arr_new`.
dict_idx_row = dict(enumerate(arr_new.T))
arr_corr_rel_stat_full_order = np.array(
    [dict_idx_row[idx] for idx in ko_orderj]
)

Plot clustered matrix

In [280]:
# Quick look at the ordered correlation matrix with a colour scale.
ax_heat = plt.gca()
im_heat = ax_heat.imshow(arr_corr_rel_stat_full_order, cmap='coolwarm')
plt.colorbar(im_heat, ax=ax_heat)
ax_heat.set_xlabel('KOs G1PA 3um')
ax_heat.set_ylabel('KOs G1NS 3um')

Pick out genes

In [287]:
# Ordered G1NS x G1PA correlation heatmap with guide lines around a block of
# interest, followed by the KO names inside that block.
# Columns of the plotted matrix follow `ko_orderi` (names in dict_idi_koname);
# rows follow `ko_orderj` (names in dict_idj_koname).
dims = (10, 10)
n_heat_rows, n_heat_cols = arr_corr_rel_stat_full_order.shape

fig = plt.figure(figsize=dims)
axs = {
    0: fig.add_axes([0.05, 0.05, 0.9, 0.9]),
}

axs[0].imshow(arr_corr_rel_stat_full_order, cmap='coolwarm', origin='upper')
axs[0].set_ylim(n_heat_rows, 0)
axs[0].set_xlim(0, n_heat_cols)
# Same axis labels as the overview heatmap so the figure stands alone.
axs[0].set_xlabel('KOs G1PA 3um')
axs[0].set_ylabel('KOs G1NS 3um')

# Vertical guides: column range of interest.
idxcol = [135, 180]
for idx in idxcol:
    axs[0].plot([idx, idx], [1, n_heat_rows], 'k', alpha=0.5)

# Horizontal guides: row range of interest.
idxrow = [300, 323]
for idx in idxrow:
    axs[0].plot([1, n_heat_cols], [idx, idx], 'k', alpha=0.5)

plt.show()

# KO names inside the marked block (the end position is excluded).
print('\nG1PA')
for i in range(idxcol[0], idxcol[-1]):
    print(dict_idi_koname[i])

print('\nG1NS')
for i in range(idxrow[0], idxrow[-1]):
    print(dict_idj_koname[i])
    