# Cluster Analyses


In [None]:
import pandas as pd
import numpy as np
%cd Kinase_Enrichment_Comparisons

# % cd X2K_Web/Kinase_Enrichment_Comparisons
# Choose which file to evaluate
gmtPath = '../../X2K_Databases/KINASE/KEA_2018/KEA2018_Mouse-Human_merged_KINASES_unfiltered.gmt'
#gmtPath = '../../X2K_Databases/TF/ENCODE_ChEA_Consensus/ENCODE-CHEA_Consensus_UnknownSpecies_TF.gmt'
#gmtPath = '../../X2K_Databases/TF/ENCODE_ChEA_Consensus/Processing/ENCODE-CHEA_Consensus_UnknownSpecies_unfiltered_TF.gmt'


## Adjacency Matrices: TF/Kinase Similarity Based on Substrates

In [None]:

def makeSubstrateDict(inputGMT):
    with open(inputGMT) as GMT:
        gmt = GMT.readlines()
    subDict={}
    for line in gmt:
        lineSp = line.split("\t")
        target = lineSp[0]
        substrates = lineSp[2:]
        substrates[-1] = substrates[-1].strip("\n")
        subDict[target] = np.array(substrates)
    return subDict

subDict = makeSubstrateDict(gmtPath)

def manual_Jaccard(listA, listB):
    Intersection = set(listA).intersection(set(listB))
    Union = set(listA).union(set(listB))
    return len(Intersection) / len(Union)

# Jaccard Index
## Dict/array method
def jaccard_adjacency_matrix(subDict, saveMatrix=False):
    from sklearn.metrics import jaccard_similarity_score
    jaccardDict={}
    for key in subDict.keys():
        print(key)
        
        jaccardScores=[]
        for target in subDict.keys():
            #print(target)
            np.array(subDict[key])
            geneListA = subDict[key]
            geneListB =  subDict[target]
            """
            # Make lists the same length by filling shorter one with NAs
            NAs = [np.NaN] * abs(len(geneListA)-len(geneListB))
            if len(geneListA) > len(geneListB):
                geneListB = np.append(geneListB, NAs)
            else:
                geneListA =  np.append(geneListA, NAs)
            # Compute jaccard index
            jaccardIndex = jaccard_similarity_score(geneListA, geneListB, normalize=True)
            """
            # Compute jaccard index
            jaccardIndex = manual_Jaccard(geneListA, geneListB)
            jaccardScores.append(jaccardIndex)
        # Add new entry for the key kinase
        jaccardDict[key] = dict(zip(subDict.keys(), jaccardScores))
    # Construct DF from Dict
    jaccardDF = pd.DataFrame.from_dict(jaccardDict)
    if saveMatrix!=False:
        jaccardDF.to_csv('Results/Adjacency_Matrices/'+saveMatrix+".txt", sep="\t")
    return jaccardDF

 
adjMatrix_name = gmtPath.split("/")[-1].strip('.gmt')
jaccardDF = jaccard_adjacency_matrix(subDict=subDict, saveMatrix=adjMatrix_name)

## K-means Clustering

In [None]:
X = pd.read_table('Results/Adjacency_Matrices/'+ adjMatrix_name +'.txt', index_col=0)

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
 

# Determine K:
## Elbow method
def elbowMethod(X):
    distortions = []
    K = range(1,10)
    for k in K:
        kmeanModel = KMeans(n_clusters=k).fit(X)
        kmeanModel.fit(X)
        distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    return distortions
# distortions = elbowMethod(X)

def optimalK(data, nrefs=3, maxClusters=15): 
    # Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    # Params:
    #     data: ndarry of shape (n_samples, n_features)
    #     nrefs: number of sample reference datasets to create
    #     maxClusters: Maximum number of clusters to test for
    # Returns: (gaps, optimalK)
    gaps = np.zeros((len(range(1, maxClusters)),))
    resultsdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    for gap_index, k in enumerate(range(1, maxClusters)):
        print("Testing "+str(gap_index)+" clusters....")
        # Holder for reference dispersion results
        refDisps = np.zeros(nrefs)
        # For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
        for i in range(nrefs):
            # Create new random reference set
            randomReference = np.random.random_sample(size=data.shape)
            # Fit to it
            km = KMeans(k)
            km.fit(randomReference)
            refDisp = km.inertia_
            refDisps[i] = refDisp
        # Fit cluster to original data and create dispersion
        km = KMeans(k)
        km.fit(data)
        origDisp = km.inertia_
        # Calculate gap statistic
        gap = np.log(np.mean(refDisps)) - np.log(origDisp)
        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        resultsdf = resultsdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)
    return (gaps.argmax() + 1, resultsdf)  # Plus 1 because index of 0 means 1 cluster is optimal, index 2 = 3 clusters are optimal

def iterate_optimalK(X):
    # get optimal K for raw data
    n, gapdf = optimalK(X, nrefs=3, maxClusters=15)
    print('Optimal K for raw data is: ', n)
    # create KMeans given optimal n and fit
    km = KMeans(n_clusters=n)
    km.fit(X)
    # Find optimal clusters for cluster centers from above
    n, gapdf = optimalK(km.cluster_centers_, nrefs=3, maxClusters=len(km.cluster_centers_))
    print('Optimal K for first clusters is: ', n)
    return n, km
 
n, k_means = iterate_optimalK(X)
# nClusters = n
# k_means = KMeans(n_clusters=nClusters)
# k_means.fit(X) 
# k_means.labels_


"""
# # Cluster by linkage instead (clusters a bit closer to k-means)
# from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance
from scipy.cluster import hierarchy
correlations = X.corr()
correlations_array = np.asarray(X.corr())
row_linkage = hierarchy.linkage( distance.pdist(correlations_array), method='average')
col_linkage = hierarchy.linkage( distance.pdist(correlations_array.T), method='average')
from scipy.cluster import hierarchy
cluster_assignments = hierarchy.fcluster(Z, 1.15, criterion='inconsistent', depth=2, R=None, monocrit=None)

sns.clustermap(adjMatrix,z_score=None, cmap="RdBu", col_colors=row_colors, row_colors=row_colors) #, col_linkage=col_linkage, row_linkage=row_linkage
plt.title('Normalized Jaccard Index')

# Sort matrix by k-means clusters
kMeans_clusters = pd.DataFrame(np.column_stack([list(X.columns), list(k_means.labels_)]), columns=['Kinase', 'Cluster'])
kMeans_clusters = kMeans_clusters.sort_values(by='Cluster')

## Sort cols
ordered_adjMatrix = X.sort_values(by=list(kMeans_clusters['Kinase']), axis=1)
## Sort rows
ordered_adjMatrix = ordered_adjMatrix.sort_values(by=list(kMeans_clusters['Kinase']), axis=0) 

colors = sns.color_palette("cubehelix", nClusters)
colorDict = dict(zip( range(0,nClusters), colors))
kMeanDict = dict(zip(kMeans_clusters['Kinase'], kMeans_clusters['Cluster']))
row_colors = [colorDict[int( kMeanDict[x] ) ] for x in ordered_adjMatrix.columns]

   
g = sns.clustermap(ordered_adjMatrix, cmap='RdBu', row_colors=row_colors, col_colors=row_colors, row_cluster=False, col_cluster=False)
for cluster in colorDict.keys():
    color = colorDict[int(cluster)]
    g.ax_col_dendrogram.bar(0, 0, color=color, label=cluster, linewidth=0)
g.ax_col_dendrogram.legend(loc="center", ncol=6, title='K-means Cluster')
plt.title('Normalized Jaccard Index')



# Hierarchical clustering
g = sns.clustermap(ordered_adjMatrix,  cmap='RdBu', row_colors=row_colors, col_colors=row_colors)
for cluster in colorDict.keys():
    color = colorDict[int(cluster)]
    g.ax_col_dendrogram.bar(0, 0, color=color, label=cluster, linewidth=0)
g.ax_col_dendrogram.legend(loc="center", ncol=6, title='K-means Cluster')
plt.title('Normalized Jaccard Index')
"""

## Perform Clustering on Adjacency Matrix

In [1]:
adjMatrix = pd.read_csv('Results/Adjacency_Matrices/'+adjMatrix_name+'.txt', sep='\t', index_col=0)


def adjacency_matrix_clustering(adjMatrix, nClusters, plot=True):
    X = adjMatrix.copy()
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # Perform  clustering on adjacency matrix using similar methodology as Clustergrammer
    # 1. Calculate cosine similarity 
    from sklearn.metrics.pairwise import cosine_similarity
    row_similarities = cosine_similarity(X)
    col_similarities = cosine_similarity(X.transpose())
    # 2. Perform hierarchical clustering
    from scipy.cluster.hierarchy import linkage
    row_linkage = linkage(row_similarities, method='average')
    col_linkage = linkage(col_similarities, method='average')
    # 3. Calculate clusters from linkage 
    from scipy.cluster.hierarchy import fcluster
    row_clusters = fcluster(row_linkage, t=nClusters, criterion='maxclust', depth=2, R=None, monocrit=None)
    col_clusters = fcluster(col_linkage, t=nClusters, criterion='maxclust', depth=2, R=None, monocrit=None)
    # 4. Create colors for each cluster
    colors = sns.color_palette("hls", nClusters) #cubehelix
    colorDict = dict(zip( range(1, nClusters+1), colors))
    row_colors = [colorDict[x] for x in row_clusters]
    col_colors = [colorDict[x] for x in col_clusters]
    if plot==True:
        # 5. Plot
        # Hierarchical clustering
        g = sns.clustermap(X,  cmap='RdBu', row_linkage=row_linkage, col_linkage=col_linkage, row_colors=row_colors, col_colors=col_colors)
        for cluster in colorDict.keys():
            color = colorDict[int(cluster)]
            g.ax_col_dendrogram.bar(0, 0, color=color, label=cluster, linewidth=0)
        g.ax_col_dendrogram.legend(loc="center", ncol=5, title='Cluster', bbox_to_anchor=(.5, 1.35), borderaxespad=1)
        plt.title('Normalized Jaccard Index')
    return row_clusters, col_clusters

row_clusters, col_clusters = adjacency_matrix_clustering(adjMatrix, nClusters=13)

# Extract genes from a cluster
def extract_genes_from_cluster(X, clusters, save=True):
    clustDict={}
    for cluster in clusters:
        genes = list(X.index[row_clusters==cluster])
        clustDict[cluster] = genes
    if save==True:
        np.save('Results/adjMatrix_clusters/'+adjMatrix_name+'_clustDict.npy', clustDict)
    return clustDict

clustDict = extract_genes_from_cluster(adjMatrix, set(row_clusters), save=True)


IndentationError: expected an indented block (<ipython-input-1-365fcf072d2c>, line 47)

## Compare genes in red cluster from heatmap to clusters in adjacency matrix

In [None]:
def redCluster_vs_adjMatrixClusters(redCluster_name):
    redCluster = pd.read_csv('Results/Red_Clusters/'+redCluster_name, index_col=0)
    overlapDict={}
    redClust_genes = set(redCluster.index)
    for c in clustDict.keys():
        clust_genes = clustDict[c]
        overlap = set(clust_genes).intersection(redClust_genes)
        overlapDict[c] = len(overlap) / len(clust_genes) * 100
    return overlapDict

def plot_cluster_overlap(overlap_Dict, redCluster, ax):
    df = pd.DataFrame([overlap_Dict]).T.reset_index()
    df.columns = ['Cluster','Percent Overlap']
    sns.barplot(data=df, x='Cluster', y='Percent Overlap', ax=ax).set_title(redCluster.strip('.csv'))

def subplot_cluster_overlap(redCluster_list):
    f, axs =plt.subplots(2, 2, sharex=False, sharey='all')
    axs = axs.ravel()
    for i,rc in enumerate(redCluster_list):
        redCluster_overlap = redCluster_vs_adjMatrixClusters(redCluster_name=rc)
        plot_cluster_overlap(overlap_Dict=redCluster_overlap, redCluster=rc, ax=axs[i])


In [None]:
#KEA 2018
redCluster_CSVs = ['X2K_UP_nLog_ranks_redCluster.csv', 'X2K_DN_nLog_ranks_redCluster.csv', \
                   'KEA_UP_nLog_ranks_redCluster.csv', 'KEA_DN_nLog_ranks_redCluster.csv']
subplot_cluster_overlap(redCluster_CSVs) 

# TFs
