In [1]:
# Load the pickled DCT dataframe
import pickle

# dct_df is the table that the clustering cells below operate on
# NOTE: pickle.load can execute arbitrary code, so only read files you trust
with open('data/dct_df.pkl', 'rb') as dct_file:
    dct_df = pickle.load(dct_file)

In [2]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Run Ward / Euclidean hierarchical clustering on dct_df, then pickle the
# resulting linkage data so later cells can reload it instead of recomputing
linkage_data = linkage(dct_df, 'ward', 'euclidean')
with open('data/linkage_data.pkl', 'wb') as linkage_file:
    pickle.dump(linkage_data, linkage_file)

In [6]:
# Graph linkage data as dendrogram
# dendrogram() returns a dict of plot coordinates and leaf labels; the trailing
# semicolon stops Jupyter from echoing that dict underneath the figure
dendrogram(linkage_data);

In [34]:
from scipy.cluster.hierarchy import fcluster

# Load linkage data from pickle and cut the tree into flat clusters
# linkage() takes a minute to run, so if we're just testing different fcluster
# params it's faster to reload the saved result than to recompute it
with open('data/linkage_data.pkl', 'rb') as f:
    linkage_data = pickle.load(f)

# With criterion='distance', observations in one flat cluster have a cophenetic
# distance no greater than this threshold. Tune it here.
CLUSTER_DISTANCE_THRESHOLD = 1000
clusters = fcluster(linkage_data, CLUSTER_DISTANCE_THRESHOLD, criterion='distance')

# Map each family name to its row position in dct_df so families can be
# addressed by integer index
dct_families = dct_df.index
dct_fam_ints = {fam: i for i, fam in enumerate(dct_families)}

# fcluster returns one label per observation, in the same order as the rows of
# dct_df, so position i in `clusters` belongs to dct_families[i].
# Build {cluster label: [row positions of the families in that cluster]}.
# The position is used directly; re-materialising list(dct_fam_ints.values())
# on every iteration was quadratic and only ever yielded i again.
cluster_dict = {}
for i, cluster in enumerate(clusters):
    cluster_dict.setdefault(cluster, []).append(i)

In [35]:
# Create len(fam) x len(fam) matrix: cell (a, b) is 1 when families a and b
# were assigned to the same cluster, 0 otherwise
import numpy as np
import pandas as pd

# cluster_dict stores row positions, so fill a plain NumPy array by position
# and wrap it in a DataFrame once. This avoids a name lookup and a .loc row
# assignment for every single family.
n_families = len(dct_families)
co_membership = np.zeros((n_families, n_families), dtype=np.int64)
for members in cluster_dict.values():
    # np.ix_ addresses every (row, column) pair among the cluster's members
    co_membership[np.ix_(members, members)] = 1

cluster_df = pd.DataFrame(co_membership, index=dct_families, columns=dct_families)
cluster_df.to_pickle('data/cluster_df.pkl')

In [36]:
# Number of distinct clusters (cluster_dict has one key per cluster label)
len(cluster_dict)

2591

In [11]:
# Preview the first rows of the family-by-family cluster membership matrix
cluster_df.head()

Unnamed: 0,PF00001,PF00002,PF00003,PF00004,PF00005,PF00006,PF00007,PF00008,PF00009,PF00010,...,PF20615,PF20616,PF20617,PF20618,PF20619,PF20620,PF20621,PF20622,PF20623,PF20624
PF00001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00003,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00004,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF00005,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
