In [1]:
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Load the expression table (row labels look like "gene_patient_celltype")
# and discard every row that has a missing value.
data = pd.read_csv(
    "temporal_data_with_patient_ready_normalized_full_genes.csv", index_col=0
).dropna(axis=0)

columns = data.columns

# Split each row label once on "_" and expose its first three fields as
# parallel arrays aligned with the rows of `data`.
label_parts = [label.split("_") for label in data.index]
genes = np.array([parts[0] for parts in label_parts])
patients = np.array([parts[1] for parts in label_parts])
cells = np.array([parts[2] for parts in label_parts])

data

Unnamed: 0_level_0,d0,d1,d2,d5,d9,d15
gene_patient_celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL627309.1_at_Bcells,0.000000,0.000000,0.0,0.004712,0.000000,0.000000
AL627309.1_at_CD4memoryactivatedT,0.000000,0.000000,0.0,0.000000,0.002348,0.000000
AL627309.1_at_CD4memoryT,0.000000,0.007536,0.0,0.000000,0.000000,0.000000
AL627309.1_at_CD4naiveT,0.002829,0.000000,0.0,0.000000,0.000000,0.000000
AL627309.1_at_CD8naiveactivatedT,0.000000,0.000000,0.0,0.006132,0.000000,0.002049
...,...,...,...,...,...,...
AL008638.5_ev_CD8cytotoxicT,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
AL008638.5_gz_CD8cytotoxicT,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
AL008638.5_im_CD8cytotoxicT,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
AL008638.5_rs_CD8cytotoxicT,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [3]:
# Root folder of the results to scan. The following cell lists it expecting one
# sub-folder per shapelet, each containing per-delta folders with a `results.json`.
results_folder = "hotnet2/paper/results_average_string"
# When True, the following cell also writes line plots of every significant
# subnetwork into PDF files under `plots/`.
plot_subnetworks = False


In [13]:
# Scan `results_folder` (layout: <shapelet>/<delta>/results.json) and build
# `significant_sizes`: one row per (shapelet, delta, size) whose p-value is below
# the threshold, with the subnetworks (components) of that size. For each shapelet
# only the delta with the most significant sizes is kept.
significance_threshold = 0.05
result_columns = ["shapelet", "delta", "size", "pval", "subnetwork"]
records = []  # collected as plain dicts and converted to a DataFrame once, after the loop

if plot_subnetworks:
  os.makedirs("plots", exist_ok=True)  # PdfPages does not create missing folders
  # Gene prefix of every row label (same "_" split as the data-loading cell), so genes
  # are matched exactly rather than through an unescaped regex / substring search
  # (gene names contain "." and one gene name can be a substring of another).
  index_genes = data.index.str.split("_").str[0].to_numpy()
  is_cik_row = data.index.str.contains("CIKcells", regex=False)

for shapelet in sorted(os.listdir(results_folder)):
  shapelet_dir = os.path.join(results_folder, shapelet)
  if not os.path.isdir(shapelet_dir):
    continue  # a stray file here would make os.listdir() raise

  # iterate over every delta in shapelet
  for delta in sorted(os.listdir(shapelet_dir)):
    if not os.path.isdir(os.path.join(shapelet_dir, delta)):
      continue
    significance_file = os.path.join(shapelet_dir, delta, "results.json")

    # Only the load needs the file handle; close it before processing.
    with open(significance_file) as f:
      results_json = json.load(f)

    statistics = results_json['statistics']
    if not statistics:
      continue  # nothing was tested for this delta; max() below would raise
    max_size = max(map(int, statistics))

    for size_key, size_stats in statistics.items():
      pval = size_stats['pval']
      if not pval < significance_threshold:
        continue

      # JSON keys are strings; use an int so comparisons and sorting are numeric.
      size = int(size_key)

      # Components of exactly this size; components larger than the largest tested
      # size are attributed to that largest size.
      significant_subnetworks = [
        subnetwork
        for subnetwork in results_json['components']
        if len(subnetwork) == size or (size == max_size and len(subnetwork) > size)
      ]

      # plot significant subnetworks based on data and genes
      if plot_subnetworks and significant_subnetworks:
        # The delta is part of the file name: without it, different deltas of the same
        # shapelet and size would overwrite each other's PDF.
        pdf_path = f'plots/significant_subnetworks_plots_{shapelet}_{delta}_size_{size}.pdf'
        with PdfPages(pdf_path) as pdf:
          for genes_list in significant_subnetworks:
            # Rows of "CIKcells" whose gene prefix belongs to this subnetwork.
            row_mask = is_cik_row & np.isin(index_genes, genes_list)
            if not row_mask.any():
              continue  # nothing to draw for this subnetwork

            # Relabel the rows with the gene name only, for the plot legend.
            filtered_data = data.loc[row_mask].set_axis(index_genes[row_mask], axis=0)

            fig, ax = plt.subplots(figsize=(10, 5))
            sns.lineplot(data=filtered_data.T, dashes=False, ax=ax)
            ax.set_title(f'Pvalues: {pval}')
            pdf.savefig(fig)  # Save the figure into the pdf
            plt.close(fig)  # Close the figure to avoid displaying it in the notebook

      records.append({"shapelet": shapelet,
                      "delta": delta,
                      "size": size,
                      "pval": pval,
                      "subnetwork": significant_subnetworks})

significant_sizes = (
  pd.DataFrame(records, columns=result_columns)
  .sort_values(by=["shapelet", "delta", "size", "pval"])
  .reset_index(drop=True)
)

if not significant_sizes.empty:
  # For each shapelet keep only the delta with the highest number of significant sizes.
  # Ties are broken deterministically by taking the first delta in sorted order.
  delta_counts = (
    significant_sizes.groupby(["shapelet", "delta"]).size().reset_index(name="n_sizes")
  )
  best_deltas = (
    delta_counts
    .sort_values(["shapelet", "n_sizes", "delta"], ascending=[True, False, True])
    .drop_duplicates("shapelet")[["shapelet", "delta"]]
  )
  # An inner merge keeps the row order of the left frame and renumbers the index.
  significant_sizes = significant_sizes.merge(best_deltas, on=["shapelet", "delta"], how="inner")

  # Remove rows with empty subnetworks (done after the delta selection, so such rows
  # still count towards a delta's number of significant sizes).
  significant_sizes = significant_sizes[significant_sizes["subnetwork"].map(len).gt(0)]


In [14]:
# Display the table of significant sizes built in the previous cell.
significant_sizes

Unnamed: 0,shapelet,delta,size,pval,subnetwork
0,string-scores_row_1,delta_2.0097827106593018e-05,6,0.012,"[[AKIRIN2, EIF2AK4, PPP6C, PSMA1, TRIM14, USP1..."
1,string-scores_row_1,delta_2.0097827106593018e-05,8,0.020,"[[CRCP, POLR3A, POLR3B, POLR3C, POLR3D, POLR3E..."
2,string-scores_row_1,delta_2.0097827106593018e-05,9,0.018,"[[AIM2, ARHGEF2, CASP4, GSDMD, NLRP1, NLRP2, N..."
3,string-scores_row_10,delta_1.0101953752965914e-05,8,0.018,"[[CGAS, NONO, PQBP1, PSPC1, RBM14, SFPQ, TRIM3..."
4,string-scores_row_11,delta_4.0429165392845634e-05,2,0.037,"[[ADAR, ZBP1], [ALPK1, TIFA], [C1S, COLEC11], ..."
...,...,...,...,...,...
72,string-scores_row_66,delta_4.330989494819436e-06,10,0.014,"[[CLU, DCLRE1C, DDX1, DDX17, DDX21, DDX3X, DDX..."
74,string-scores_row_68,delta_3.1830071319854182e-06,3,0.018,"[[IFI27, IFITM1, ISG20], [KCNN4, PRKCB, PRKD2]..."
75,string-scores_row_68,delta_3.1830071319854182e-06,4,0.016,"[[CRCP, POLR3C, POLR3H, POLR3K], [DDX1, DDX17,..."
76,string-scores_row_69,delta_1.017753572507336e-05,7,0.023,"[[CYLD, ITCH, OPTN, PCBP2, SQSTM1, TFE3, TFEB]..."
