In [6]:
import os, json, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer

In [5]:
data = pd.read_csv("temporal_data_with_patient_ready_normalized_full_genes.csv", index_col=0)
data = data.dropna(axis=0)

# remove a layer of complexity by averaging the data across patients for each gene and celltype
data.index = data.index.str.split('_').map(lambda x: (x[0], x[-1]))
data.index = pd.MultiIndex.from_tuples(data.index, names=["gene", "celltype"])
data_avg = data.groupby(level=["gene", "celltype"]).mean()

# fix the index
data_avg = data_avg.reset_index()
data_avg["gene_celltype"] = data_avg["gene"] + "_" + data_avg["celltype"]
data_avg = data_avg.set_index("gene_celltype").drop(columns=["gene", "celltype"])

data = data_avg

columns = data.columns
genes = np.array([name.split("_")[0] for name in list(data.index)])
cells = np.array([name.split("_")[1] for name in list(data.index)])

enc_genes = LabelEncoder()
enc_cells = LabelEncoder()

enc_genes.fit(genes)
enc_cells.fit(cells)

X = data.values[:, np.newaxis, :]

y_genes = enc_genes.transform(genes)
y_cells = enc_cells.transform(cells)

data.head()

Unnamed: 0_level_0,d0,d1,d2,d5,d9,d15
gene_celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1BG_Bcells,0.196337,0.235422,0.229986,0.329739,0.465127,0.365523
A1BG_CD4cytotoxicT,0.0,0.265965,0.218069,0.269721,0.445299,0.266964
A1BG_CD4memoryT,0.211437,0.208521,0.167145,0.225112,0.305198,0.187713
A1BG_CD4memoryactivatedT,0.223909,0.211382,0.18342,0.380808,0.379326,0.353488
A1BG_CD4naiveT,0.178656,0.179129,0.212789,0.09534,0.089639,0.111972


In [None]:
# set folder with results
results_folder = 
plot_subnetworks = False

In [15]:
def aggregate_clusters(results_dir, output_file):
    aggregated_data = []
    
    # Find all cluster result files
    cluster_files = glob.glob(os.path.join(results_dir, "clusters_*.tsv"))
    
    for file_path in cluster_files:
        with open(file_path, "r") as f:
            lines = f.readlines()
        
        # Extract network and score number from filename
        filename = os.path.basename(file_path)
        parts = filename.replace("clusters_", "").replace(".tsv", "").split("_scores_")
        if len(parts) != 2:
            continue
        network = parts[0]
        score_number = parts[1]  # Capture score number
        
        # Find clusters section and p-value
        cluster_start = next((i for i, line in enumerate(lines) if line.strip() == "# Clusters:"), None)
        p_value = None
        for line in lines:
            if line.startswith("# p-value:"):
                p_value = float(line.strip().split(": ")[1])  # Convert to float for sorting
                break
        
        if cluster_start is None:
            continue
        
        # Read clusters, excluding single-gene subnetworks
        for line in lines[cluster_start + 1:]:
            genes = line.strip().split("\t")
            if len(genes) > 1:  # Exclude single-gene subnetworks
                aggregated_data.append([network, score_number, p_value, ",".join(genes)])
    
    # Create DataFrame
    df = pd.DataFrame(aggregated_data, columns=["Network", "Shapelet Number", "P-Value", "Genes"])
    
    # Sort by p-value (ascending order)
    df = df.sort_values(by="P-Value")
    
    # Save aggregated results
    df.to_csv(output_file, sep="\t", index=False)
    print(f"Aggregated results saved to {output_file}")
    
    return df

In [16]:
# Example usage
results_directory = "hierarchical-hotnet-python3-fixed/examples/shapelet_results"  # Change if necessary
output_file = "hierarchical-hotnet-python3-fixed/examples/shapelet_results/aggregated_clusters.tsv"
results = aggregate_clusters(results_directory, output_file)

Aggregated results saved to hierarchical-hotnet-python3-fixed/examples/shapelet_results/aggregated_clusters.tsv


In [17]:
results

Unnamed: 0,Network,Shapelet Number,P-Value,Genes
254,string,68,0.00,"POLR3B,POLR3C,POLR3E"
285,string,60,0.00,"KCNN4,PRKCB,PRKCQ,PRKD2"
284,string,60,0.00,"DDX1,DDX21,HEXIM1,PSPC1,SFPQ"
283,string,60,0.00,"AKIRIN2,B2M,CD244,CD247,CD3E,CD74,CSK,HLA-A,HL..."
281,string,4,0.00,"IL36A,IL36RN"
...,...,...,...,...
169,string,61,0.94,"MCOLN2,TFEB"
166,string,61,0.94,"BTN3A1,BTN3A2,BTN3A3"
171,string,61,0.94,"NLRC5,RNF135"
168,string,61,0.94,"AKIRIN2,PSMA1"


In [13]:
# list files in results folder
significant_sizes = pd.DataFrame(columns=["shapelet", "delta", "size", "pval", "subnetwork"])

for shapelet in os.listdir(results_folder):
  # iterate over every delta in shapelet
  for delta in os.listdir(os.path.join(results_folder, shapelet)):
    if os.path.isdir(os.path.join(results_folder, shapelet, delta)):
      significance_file = os.path.join(results_folder, shapelet, delta, "results.json")
      
      with open(significance_file) as f:
        results_json = json.load(f)
        max_size = max(map(int, results_json['statistics'].keys()))

        for size in results_json['statistics'].keys():
          pval = results_json['statistics'][size]['pval']
          significant_subnetworks = []

          if pval < 0.05:

            for subnetwork in results_json['components']:
              if len(subnetwork) == int(size):
                significant_subnetworks.append(subnetwork)
              if len(subnetwork) > int(size) and max_size == int(size):
                significant_subnetworks.append(subnetwork)
            
            # plot significant subnetworks based on data and genes
            if plot_subnetworks:
              # Create a PdfPages object
              with PdfPages('plots/significant_subnetworks_plots_' + shapelet + '_size_' + size + '.pdf') as pdf:
                # get data rows corresponding to the first significant subnetwork

                for genes_list in significant_subnetworks:
                  # Define the list of genes

                  # Filter rows where the index contains any of the genes in the list and "CIKcells"
                  filtered_data = data[data.index.str.contains("CIKcells") & data.index.str.contains('|'.join(genes_list))]

                  # Modify index to include only gene names
                  filtered_data.index = [next(gene for gene in genes_list if gene in name) for name in filtered_data.index]

                  # Plot the data as lines with color coding based on genes
                  plt.figure(figsize=(10, 5))
                  sns.lineplot(data=filtered_data.T, dashes=False)
                  plt.title(f'Pvalues: {pval}')
                  pdf.savefig()  # Save the current figure into the pdf
                  plt.close()  # Close the figure to avoid displaying it in the notebook

            significant_sizes = pd.concat([significant_sizes, 
                                          pd.DataFrame([{"shapelet": shapelet, 
                                            "delta": delta, 
                                            "size": size, 
                                            "pval": pval,
                                            "subnetwork": significant_subnetworks}])], ignore_index=True)

# group significant sizes by shapelet and by delta, and keep the delta with the highest number of significant sizes
significant_sizes = significant_sizes.sort_values(by=["shapelet", "delta", "size", "pval"], ascending=[True, True, True, True])

# Group by shapelet and delta, and keep the delta with the highest number of rows for each shapelet
significant_sizes = significant_sizes.groupby('shapelet').apply(lambda x: x.loc[x['delta'] == x['delta'].value_counts().idxmax()]).reset_index(drop=True)

# Remove rows with empty subnetworks
significant_sizes = significant_sizes[significant_sizes["subnetwork"].apply(lambda x: len(x) > 0)]

  significant_sizes = pd.concat([significant_sizes,
  significant_sizes = significant_sizes.groupby('shapelet').apply(lambda x: x.loc[x['delta'] == x['delta'].value_counts().idxmax()]).reset_index(drop=True)


In [14]:
significant_sizes

Unnamed: 0,shapelet,delta,size,pval,subnetwork
0,string-scores_row_1,delta_2.0097827106593018e-05,6,0.012,"[[AKIRIN2, EIF2AK4, PPP6C, PSMA1, TRIM14, USP1..."
1,string-scores_row_1,delta_2.0097827106593018e-05,8,0.020,"[[CRCP, POLR3A, POLR3B, POLR3C, POLR3D, POLR3E..."
2,string-scores_row_1,delta_2.0097827106593018e-05,9,0.018,"[[AIM2, ARHGEF2, CASP4, GSDMD, NLRP1, NLRP2, N..."
3,string-scores_row_10,delta_1.0101953752965914e-05,8,0.018,"[[CGAS, NONO, PQBP1, PSPC1, RBM14, SFPQ, TRIM3..."
4,string-scores_row_11,delta_4.0429165392845634e-05,2,0.037,"[[ADAR, ZBP1], [ALPK1, TIFA], [C1S, COLEC11], ..."
...,...,...,...,...,...
72,string-scores_row_66,delta_4.330989494819436e-06,10,0.014,"[[CLU, DCLRE1C, DDX1, DDX17, DDX21, DDX3X, DDX..."
74,string-scores_row_68,delta_3.1830071319854182e-06,3,0.018,"[[IFI27, IFITM1, ISG20], [KCNN4, PRKCB, PRKD2]..."
75,string-scores_row_68,delta_3.1830071319854182e-06,4,0.016,"[[CRCP, POLR3C, POLR3H, POLR3K], [DDX1, DDX17,..."
76,string-scores_row_69,delta_1.017753572507336e-05,7,0.023,"[[CYLD, ITCH, OPTN, PCBP2, SQSTM1, TFE3, TFEB]..."
