In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering  as AC
from sklearn.manifold import MDS
from scripts.cluster_proteins import matrix_from_sparse

In [None]:
# Long-form pairwise table: one row per (qseqid, sseqid) pair with its
# dissimilarity value, indexed by the two sequence-id columns.
pair_columns = ['qseqid', 'sseqid', 'dissimilarity']
data = pd.read_table(
    'res/core.a.mags.muri.dbCAN-hits.blastp_dist.tsv',
    names=pair_columns,
    index_col=pair_columns[:2],
)

# matrix_from_sparse is a project helper; downstream cells index its result
# with .loc[ids, ids], so it presumably returns a square, label-indexed
# DataFrame -- TODO confirm against scripts/cluster_proteins.py.
dmatrix = matrix_from_sparse(data['dissimilarity'])

In [None]:
# Column labels for the header-less domain-hit table, in file order.
# NOTE(review): 'domain_' looks like a truncated name (perhaps a domain
# length, mirroring 'query_length') -- confirm before relying on it.
domain_columns = [
    'domain_id', 'domain_',
    'query_id', 'query_length',
    'evalue', 'domain_start',
    'domain_end', 'query_start',
    'query_end', 'score',
]

# One row per domain hit; later cells select rows by domain_id / query_id.
domains = pd.read_table(
    'res/core.a.mags.muri.dbCAN-hits.domains.tsv',
    names=domain_columns,
)

In [None]:
# Header-less two-column table mapping each query_id to its cluster label.
opu = pd.read_table(
    'res/core.a.mags.muri.dbCAN-hits.denovo-clust.tsv',
    names=['query_id', 'cluster'],
)

# Display the rows assigned to cluster Opu0972.
opu.loc[opu['cluster'] == 'Opu0972']

In [None]:
from collections import Counter

# Tally how often each domain_id occurs among the domain hits belonging to
# sequences assigned to cluster Opu0972.
opu0972_members = opu.loc[opu['cluster'] == 'Opu0972', 'query_id']
opu0972_hits = domains.loc[domains['query_id'].isin(opu0972_members)]
Counter(opu0972_hits['domain_id'])

In [None]:
# Unique query ids that have at least one hit whose domain_id is 'GH13';
# used below to subset the dissimilarity matrix.
is_gh13 = domains['domain_id'] == 'GH13'
ii = domains.loc[is_gh13, 'query_id'].unique()

In [None]:
# Subset the dissimilarity matrix once, so the clustering input and the label
# index below are guaranteed to come from the same object (same row order).
sub_dmatrix = dmatrix.loc[ii, ii]

# Average-linkage agglomerative clustering on the precomputed dissimilarities.
# n_clusters=1 is used because only the merge tree (ac.children_) is needed
# by the dendrogram cell, not a flat partition.
#
# scikit-learn renamed the `affinity` keyword to `metric` (deprecated in 1.2,
# removed in 1.4). Try the current spelling first and fall back to the legacy
# one so the cell runs on both old and new installations.
try:
    ac = AC(n_clusters=1, metric='precomputed', linkage='average')
except TypeError:
    ac = AC(n_clusters=1, affinity='precomputed', linkage='average')
ac = ac.fit(sub_dmatrix)

# Flat labels per sequence (all identical with n_clusters=1); the index keeps
# the sequence ids in the order the model saw them, for labelling leaves.
cluster = pd.DataFrame({'cluster': ac.labels_}, index=sub_dmatrix.index)

In [None]:
from scipy.cluster.hierarchy import dendrogram

# Merge tree from the fitted model: row i holds the two nodes joined at step i.
children = ac.children_

# SciPy's dendrogram expects a 4-column linkage matrix
# (child_a, child_b, distance, count). The model was not asked to record real
# merge distances, so placeholders are used: the merge step number stands in
# for the distance and a simple increasing sequence for the count.
# Consequently the dendrogram's distance axis shows merge ORDER, not actual
# dissimilarity.
distance = np.arange(children.shape[0])
position = np.arange(2, children.shape[0] + 2)

linkage_matrix = np.column_stack([children, distance, position]).astype(float)

fig, ax = plt.subplots(figsize=(10, 15))
ax.set_frame_on(False)

# Pass labels as a plain list: array-like labels (such as a pandas Index) can
# trip ambiguous truth-value checks in some SciPy releases.
dgrm = dendrogram(linkage_matrix, orientation='left', ax=ax,
                  labels=list(cluster.index))

# Leaf labels in plotted order; the integer index is the leaf's slot number.
leaf_order = pd.Series(dgrm['ivl'])

# First and last leaf slots occupied by members of cluster Opu0972.
in_opu0972 = leaf_order.isin(opu[opu.cluster == 'Opu0972'].query_id)
cluster_start, cluster_stop = list(leaf_order[in_opu0972].iloc[[0, -1]].index)

# SciPy centres leaf k at 10*k + 5, i.e. leaf k occupies [10*k, 10*(k + 1)].
# The bracket must therefore start at 10*cluster_start; starting at
# 10*(cluster_start + 1) would leave the first member outside the bracket.
ax.vlines(0, cluster_start * 10, (cluster_stop + 1) * 10)

fig.tight_layout()
fig.savefig('test.pdf')



In [None]:
# Heatmap of similarity (1 - dissimilarity) with rows and columns arranged in
# dendrogram leaf order.
fig, ax = plt.subplots(figsize=(10, 10))
similarity = 1 - dmatrix.loc[leaf_order, leaf_order]
ax.imshow(similarity)

# Outline the Opu0972 span with a red square whose sides sit at the first and
# last leaf slots found in the dendrogram cell.
box_lo, box_hi = cluster_start, cluster_stop
for x_edge in (box_lo, box_hi):
    ax.vlines(x_edge, box_lo, box_hi, color='r')
ax.hlines(box_lo, box_lo, box_hi, color='r')
ax.hlines(box_hi, box_lo, box_hi, color='r')

TODO: Check out OTU-7.vA.a.scaffolds.pilon.ctrim_01769 and OTU-1.vB.a.scaffolds.pilon.ctrim_01382,
as well as OTU-7.vA.a.scaffolds.pilon.ctrim_02383 and OTU-1.vB.a.scaffolds.pilon.ctrim_00001.
Each of these pairs has perfect identity despite its members being in different backgrounds.

In [None]:
# Display the rows assigned to cluster Opu0819. The cluster table `opu` was
# already loaded from the denovo-clust TSV earlier in the notebook, so it is
# reused here instead of re-reading the same file into the same name.
opu[opu.cluster == 'Opu0819']