In [1]:
# Load the pickled DCT df from disk
import pickle

# dct_df is the input to the clustering step further down
with open('data/dct_df.pkl', mode='rb') as dct_file:
    dct_df = pickle.load(dct_file)

In [3]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Run hierarchical clustering (Ward method, Euclidean metric) on the DCT df,
# then cache the resulting linkage on disk so later cells can reload it
linkage_data = linkage(
    dct_df,
    method='ward',
    metric='euclidean',
)
with open('data/linkage_data.pkl', mode='wb') as linkage_file:
    pickle.dump(linkage_data, linkage_file)

In [6]:
# Graph linkage data as dendrogram.
# dendrogram() draws the tree and also returns a dict of plot data (one entry
# per leaf/link). Binding the result to a name keeps the notebook from echoing
# that whole dict as the cell's output; the figure is still rendered.
dendrogram_data = dendrogram(linkage_data)

In [13]:
from scipy.cluster.hierarchy import fcluster

# Distance cutoff handed to fcluster (criterion='distance').
# Change this value to experiment with coarser or finer flat clusters.
CLUSTER_DISTANCE_THRESHOLD = 2500

# Load linkage data from pickle and get clusters
# Linkage function takes a minute to run, so if we're just testing different
# fcluster params it's faster to load the cached result
with open('data/linkage_data.pkl', 'rb') as f:
    linkage_data = pickle.load(f)

# The cached linkage may have been computed from a different dct_df than the
# one currently loaded. A linkage matrix for n observations has n - 1 rows, so
# verify it matches before pairing cluster labels with families by position;
# otherwise families would be silently mislabelled or dropped.
dct_families = dct_df.index
if len(linkage_data) + 1 != len(dct_families):
    raise ValueError(
        f"data/linkage_data.pkl describes {len(linkage_data) + 1} observations "
        f"but dct_df has {len(dct_families)} rows; recompute the linkage."
    )

clusters = fcluster(linkage_data, CLUSTER_DISTANCE_THRESHOLD, criterion='distance')

# Indices of clusters match the indices of the families (checked above)
# Make a dictionary of clusters with cluster as key and list of families as value
cluster_dict = {}
for cluster_id, family in zip(clusters, dct_families):
    cluster_dict.setdefault(cluster_id, []).append(family)

In [14]:
# Create len(fam) x len(fam) matrix
import numpy as np
import pandas as pd

# Build the matrix as a plain NumPy array and wrap it in a DataFrame at the
# end. Writing one cell at a time through DataFrame.loc is extremely slow for
# a matrix of this size; block assignment on the array does the same work with
# one operation per cluster.
n_families = len(dct_families)
same_cluster = np.zeros((n_families, n_families), dtype=np.int64)

# Fill matrix with 1s where both families are in the same cluster
for members in cluster_dict.values():
    # Translate family labels to integer positions in dct_families
    # (get_indexer requires the index labels to be unique)
    positions = dct_families.get_indexer(members)
    # np.ix_ selects the full members x members sub-block in one assignment
    same_cluster[np.ix_(positions, positions)] = 1

# copy=False: the array is not used again, so avoid duplicating a large matrix
cluster_df = pd.DataFrame(
    same_cluster, index=dct_families, columns=dct_families, copy=False
)

# Save matrix as pickle file
cluster_df.to_pickle('data/cluster_df.pkl')

In [15]:
# Preview the first five rows of the same-cluster matrix
cluster_df.head(n=5)

Unnamed: 0,PF00001,PF00002,PF00003,PF00004,PF00005,PF00006,PF00007,PF00008,PF00009,PF00010,...,PF20615,PF20616,PF20617,PF20618,PF20619,PF20620,PF20621,PF20622,PF20623,PF20624
PF00001,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00003,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00004,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00005,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Spot-check a single cell: take only the first family of the first cluster,
# show its row, then set its own (family, family) entry to 1
first_members = next(iter(cluster_dict.values()), [])
if first_members:
    family1 = family2 = first_members[0]
    row = cluster_df.loc[family1].values
    print(row)
    print(family1)
    print(family2)
    cluster_df.loc[family1, family2] = 1

[1 0 0 ... 0 0 0]
PF00001
PF00001


In [None]:
'''TODO
Convert family names to a simple ascending list of integers (0, 1, 2, ...).
cluster_dict will then contain these integers instead of family names.
When creating cluster_df, build each row as an array, set it to 1 at the indices of the families in the same cluster, then assign the whole row to the df.
This should be faster than assigning 1s to each cell individually with the .loc function.
Once all rows have been assigned, convert the integers back to family names.
'''