In [4]:
from scipy.cluster import hierarchy
from scipy.stats import spearmanr
from statsmodels.formula.api import quantreg
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def get_hierarchical_clustering(adata):
    """Cluster the columns of ``adata.X`` by average linkage on correlation distance.

    The similarity between two columns is their Pearson correlation
    (``np.corrcoef``); the dissimilarity fed to the clustering is
    ``1 - correlation``, so merge heights lie in ``[0, 2]``.

    Args:
        adata: Object exposing a 2-D array ``X`` whose columns are the
            variables to cluster (presumably an AnnData object with a dense
            ``X`` -- confirm against the caller).

    Returns:
        The SciPy linkage matrix, shape ``(n_columns - 1, 4)``.
    """
    corr_matrix = np.corrcoef(adata.X.T)
    corr_dist = 1 - corr_matrix

    # SciPy's linkage helpers interpret a 2-D input as raw observation vectors
    # and would compute Euclidean distances between the *rows* of the distance
    # matrix. Pass the condensed form (row-major strict upper triangle) so the
    # tree is built on the correlation distances themselves.
    condensed_dist = corr_dist[np.triu_indices_from(corr_dist, k=1)]
    return hierarchy.average(condensed_dist)





def fit_regression_clusterings(corr_matrix, dist_linkage, threshold=2):
    """Fit a 10th-percentile quantile regression of correlation on cophenetic distance.

    Args:
        corr_matrix: Pairwise correlation matrix of the clustered variables.
        dist_linkage: SciPy linkage matrix for the same variables.
        threshold: Only pairs whose cophenetic distance is strictly below this
            value are kept for the fit.

    Returns:
        A DataFrame sorted by ``cophenet`` with columns ``cophenet``, ``corr``
        and ``linear_model`` (the fitted 0.1-quantile prediction).
    """
    # NOTE(review): assumes the helper returns the strict upper triangle in the
    # same pair order as hierarchy.cophenet -- confirm against its definition.
    pairs = pd.DataFrame({
        'cophenet': hierarchy.cophenet(dist_linkage),
        'corr': flatten_upper_triangular_excluding_diagonal(corr_matrix),
    })
    pairs = pairs[pairs['cophenet'] < threshold]

    fitted = quantreg('corr ~ cophenet', pairs).fit(q=0.1)

    # Sort by distance so the fitted line is drawn smoothly from left to right.
    pairs = pairs.sort_values('cophenet')
    pairs['linear_model'] = fitted.predict({'cophenet': pairs['cophenet']})
    return pairs

def plot_regression(df):
    """Show correlation against cophenetic distance with the fitted quantile line.

    Args:
        df: DataFrame with columns ``cophenet``, ``corr`` and ``linear_model``.

    The figure also carries a dashed horizontal reference line at a
    correlation of 0.6. The plot is displayed with ``plt.show()``; nothing is
    returned.
    """
    _fig, ax = plt.subplots(figsize=(10, 6))

    # Raw pairs, the fitted line, then the fixed reference level.
    ax.scatter(df['cophenet'], df['corr'], alpha=0.7, label='Data Points')
    ax.plot(
        df['cophenet'],
        df['linear_model'],
        color='red',
        linewidth=2,
        label='10th Percentile Quantile Regression Line',
    )
    ax.axhline(y=0.6, color='r', linestyle='--', label='y = 0.6')

    ax.set_title('Quantile Regression with correlation threshold')
    ax.set_xlabel('Cophenet')
    ax.set_ylabel('Correlation')
    ax.legend()
    ax.grid(True)
    plt.show()




def cut_clustering_and_gene_mapping(corr_matrix, dist_linkage, threshold, variable_names):
    """Cut the dendrogram at a distance threshold and report multi-gene clusters.

    Every cluster with more than one member is printed together with the mean
    of its pairwise correlations. No similarity filter is applied: all
    multi-member clusters are returned.

    Args:
        corr_matrix: Square correlation matrix, indexed in the same order as
            ``variable_names``.
        dist_linkage: SciPy linkage matrix for the same variables.
        threshold: Distance at which the tree is cut
            (``fcluster(..., criterion='distance')``).
        variable_names: Names of the variables, one per leaf, in leaf order.
            Any iterable works; names need not be unique.

    Returns:
        A tuple ``(indices_by_cluster, names_by_cluster)``: two dicts keyed by
        cluster id, holding the member positions and member names of each
        cluster that has at least two members.

    Raises:
        ValueError: If the number of names differs from the number of leaves.
    """
    cluster_ids = hierarchy.fcluster(dist_linkage, t=threshold, criterion='distance')
    names = list(variable_names)
    if len(names) != len(cluster_ids):
        raise ValueError(
            f"variable_names has {len(names)} entries but the linkage has "
            f"{len(cluster_ids)} leaves"
        )

    # Group leaf positions by cluster id. Using the enumeration position
    # directly (rather than looking each name up again) keeps this correct for
    # duplicated names and avoids requiring a pandas Index.
    members = {}
    for position, cluster_id in enumerate(cluster_ids):
        members.setdefault(cluster_id, []).append(position)

    indices_by_cluster = {}
    names_by_cluster = {}
    for cluster_id, gene_indices in members.items():
        if len(gene_indices) < 2:
            # A singleton has no pair to average over.
            continue

        gene_list = [names[i] for i in gene_indices]

        # Mean of the strict upper triangle of the cluster's correlation
        # sub-matrix, i.e. the average over all distinct member pairs.
        cluster_corr_matrix = corr_matrix[np.ix_(gene_indices, gene_indices)]
        upper_triangle_indices = np.triu_indices_from(cluster_corr_matrix, k=1)
        average_similarity = np.mean(cluster_corr_matrix[upper_triangle_indices])

        print(f"Cluster {cluster_id}: {gene_list}")
        print(f"  Average Similarity: {average_similarity:.4f}")
        print("-" * 45)
        names_by_cluster[cluster_id] = gene_list
        indices_by_cluster[cluster_id] = gene_indices
    return indices_by_cluster, names_by_cluster


# NOTE: `adata` is not created in this cell; it must already exist (e.g. loaded
# in an earlier cell), otherwise the next line raises NameError.
corr_matrix = np.corrcoef(adata.X.T)
corr_dist = 1 - corr_matrix
# hierarchy.average treats a 2-D input as observation vectors and would cluster
# on Euclidean distances between rows of the distance matrix. Pass the
# condensed strict upper triangle so the tree uses the correlation distances,
# which keeps merge heights in [0, 2] as the thresholds below assume.
dist_linkage = hierarchy.average(corr_dist[np.triu_indices_from(corr_dist, k=1)])
df = fit_regression_clusterings(corr_matrix, dist_linkage, threshold=2)
plot_regression(df)
cluster_mapping, cluster_mapping_genes = cut_clustering_and_gene_mapping(corr_matrix, dist_linkage, 0.75, adata.var_names)


NameError: name 'adata' is not defined