# Does grouping genes into modules improve performance?

`Supplementary_Data_2.xlsx` contains the gene groups and can be downloaded from https://www.nature.com/articles/s41588-021-00840-z ('Supplementary Material 2').

In [1]:
!pip install openpyxl

import pandas as pd
import numpy as np
from pygsva import gsva, gsvaParam
import pandas as pd
import numpy as np





### Create a dictionary that maps modules to list of genes

In [2]:
# Load the co-essential module sheet; the first two rows of the sheet are skipped
# so that the row after them is used as the header.
df = pd.read_excel('Supplementary_Data_2.xlsx', sheet_name='Co-essential Modules', skiprows=2)
# Module identifiers serve as dictionary keys below, so keep them as strings.
df['Module #'] = df['Module #'].astype(str)

# Positional index of the first column whose values are collected as genes.
gene_start_col = 13

# Map each module id to the non-missing values found in its gene columns.
module_to_genes = {}
for row_label, module_row in df.iterrows():
    member_genes = [entry for entry in module_row.iloc[gene_start_col:] if pd.notna(entry)]
    module_to_genes[module_row['Module #']] = member_genes

# Same mapping with keys passed through str().
gene_sets = {str(module_id): members for module_id, members in module_to_genes.items()}

In [3]:
# Number of distinct module ids collected from the spreadsheet.
len(gene_sets)

5229

In [None]:
# # Flatten dictionary to two-column format
# rows = [(module, gene) for module, genes in module_to_genes.items() for gene in genes]
# df_mod_to_gene = pd.DataFrame(rows, columns=["module", "gene"])

# # Save to CSV
# df_mod_to_gene.to_csv("module_to_genes/module_to_genes.csv", index=False)

### (optional) Filter the dictionary to only include genes/modules present in your expression data
If this is not done, just use `module_to_genes.csv`

In [4]:
""" MODIFY """
# Which modules to keep when the module table is narrowed down further below:
#   "all"           - keep every module that remains after matching genes to the data
#   "most-enriched" - keep modules with 'Most-enriched enrichment' > 100 and 'Most-enriched p' < 0.05
#   int N           - among modules with 'Most-enriched p' < 0.05, keep the N with the
#                     highest 'Most-enriched enrichment'
number_of_modules = "all"

In [6]:
"""MODIFY"""
# Dataset name: selects the input CSV here and is reused later in output file names.
data_name = "methylation_imputed"

# Read the table (first CSV column becomes the index) and transpose it, so that the
# resulting row labels are what gets matched against the module genes afterwards.
expression_data = pd.read_csv(f"../data/Cleveland/{data_name}.csv", index_col=0).transpose()
# expression_data.drop(['Entrez_Gene_Id'], inplace=True, axis=1)

# Alternative inputs, kept for reference:
# expression_data = pd.read_csv("../data/Imputed/proteomics_imputed.csv", index_col=0).T
# expression_data = pd.read_csv('aggreagated_ccle_meth.csv', index_col=0)
# expression_data = pd.read_csv("C:/Users/mmarc/Documents/code/P-Net-Reproducibility-Paper-Fork/Radiosensitivity Prediction/data/Cleveland/cleveland_gene_expression.csv", index_col=0).T
# expression_data = pd.read_csv("C:/Users/mmarc/Documents/code/P-Net-Reproducibility-Paper-Fork/Radiosensitivity Prediction/data/Cleveland/CCLE_RNAseq_rsem_genes_tpm_20180929 (1).txt.gz", sep='\t', index_col=0)

# Rows are reported as genes; fewer rows than columns is treated as a hint that the
# table is oriented the wrong way round.
n_rows, n_cols = expression_data.shape
print(f"Expression data has {n_rows} genes.")
if n_rows < n_cols:
    print("Transpose data.")

Expression data has 14608 genes.


In [7]:
# Filter gene sets to match genes in the expression data (optional)
genes_in_data = set(expression_data.index)
filtered_gene_sets = {
    str(k): [g for g in v if g in genes_in_data]
    for k, v in module_to_genes.items()
}
filtered_gene_sets = {k: v for k, v in filtered_gene_sets.items() if len(v) > 1}

if len(filtered_gene_sets) == 0:
    print("WARNING: no genes mathed. Try transposing data.")
else:
    print("Its fine.")

Its fine.


In [13]:
# Number of modules that still contain at least two genes after filtering.
len(filtered_gene_sets)

4800

'Most-enriched enrichment' → fold enrichment (you want >100).

'Most-enriched p' → statistical significance (you want Bonferroni-corrected p < 0.05).

Together, these help you find the 1,269 strongest, most reliable modules.

In [None]:
import os  # local import: only needed here, to create the output folder

# Start from the modules that are still present after the gene filtering step.
df_most = df[df["Module #"].isin(filtered_gene_sets.keys())]

# NOTE(review): the 0.05 threshold is applied to the 'Most-enriched p' column as stored
# in the spreadsheet — confirm that this column is already multiple-testing corrected.
if number_of_modules == "most-enriched":
    # Keep modules that are both strongly enriched (> 100) and significant (p < 0.05).
    df_most = df_most[
        (df_most['Most-enriched enrichment'] > 100) & (df_most['Most-enriched p'] < 0.05)
    ]
elif number_of_modules == "all":
    # Nothing to narrow down: every filtered module is used.
    pass
else:
    # Anything else has to be a module count. Reject other values up front instead of
    # failing inside the slice (strings) or silently cutting rows off the end (negatives).
    if not isinstance(number_of_modules, (int, np.integer)) or number_of_modules < 0:
        raise ValueError(
            'number_of_modules must be "all", "most-enriched" or a non-negative integer, '
            f"got {number_of_modules!r}"
        )
    # Significant modules only, ordered by enrichment, then the top N.
    df_most = (
        df_most[df_most['Most-enriched p'] < 0.05]
        .sort_values(by='Most-enriched enrichment', ascending=False)
        .iloc[:number_of_modules]
    )

print(f" number of modules: {df_most.shape[0]}")

chosen_modules = df_most['Module #']

# Gene lists of the selected modules, in the order of the selection above.
gene_sets = {k: filtered_gene_sets[k] for k in chosen_modules}

# Flatten the dictionary to a two-column (module, gene) table.
rows = [(module, gene) for module, genes in gene_sets.items() for gene in genes]
df_module_to_genes = pd.DataFrame(rows, columns=["module", "gene"])

# Save to CSV. to_csv does not create missing folders, so make sure the target exists.
output_dir = "module_to_genes"
output_name = f"module_to_genes_{data_name}_{number_of_modules}.csv"
os.makedirs(output_dir, exist_ok=True)
df_module_to_genes.to_csv(f"{output_dir}/{output_name}", index=False)

print(f"module_to_genes file has been saved to file '{output_name}'")

 number of modules: 4970
module_to_genes file ahs been saved to file 'module_to_genes_methylation_imputed_all.csv'


: 

# Now: run the 'calculate_GSVA.R' file 
GSVA stands for Gene Set Variation Analysis.
It’s a method used in biology to turn a group of genes (like a pathway or a module) into a single score that reflects how active that group is in a sample (like a specific cell or tissue).

...

# (optional) postprocess the computed GSVA

### Methylation data:

In [None]:
# NOTE(review): this cell sits under the "Methylation data" heading but selects
# data_name = "rna_imputed" — confirm which dataset is intended.
number_of_modules = "most-enriched"
data_name = "rna_imputed"

# NOTE(review): this path has no 'gsva_scores/' folder prefix, unlike the read in the
# RNA seq cell further down — confirm where the R script writes its output.
scores_path = f'gsva_scores_{data_name}_{number_of_modules}.csv'
df = pd.read_csv(scores_path, index_col=0)

# Force every column except the first one to numeric; values that cannot be parsed
# become NaN.
# NOTE(review): the first column is left out of the conversion (iloc[:, 1:]) — confirm
# that this is deliberate, since index_col=0 already moved the label column to the index.
numeric_block = df.iloc[:, 1:].apply(lambda column: pd.to_numeric(column, errors="coerce"))
df.iloc[:, 1:] = numeric_block.fillna(np.nan)

# Transpose, then shorten each row label to the part before its first underscore.
df = df.T
df.index = [label.partition("_")[0] for label in df.index]

# When several rows end up with the same shortened label, keep the first one.
repeated_label = df.index.duplicated(keep="first")
df = df.loc[~repeated_label]
df.head()

In [None]:
# Write the post-processed score table to the Cleveland data folder.
# NOTE(review): the file name says "meth" while the preceding cell sets
# data_name = "rna_imputed" — confirm the name matches the data being saved.
df.to_csv('../data/Cleveland/modules_gsva_meth_imputed_full_processed2.csv')

### RNA seq data:

In [10]:
# Read the GSVA score table for the currently selected dataset / module selection.
# NOTE(review): `gsva` is also the name of the function imported from pygsva in the
# first code cell; once this cell has run, that function is no longer reachable under
# this name. Consider a different variable name if the function is needed afterwards.
scores_file = f'gsva_scores/gsva_scores_{data_name}_{number_of_modules}.csv'
gsva = pd.read_csv(scores_file, index_col=0)
# gsva = gsva.T
# gsva.to_csv('../data/Mice/modules_gsva_meth_mice.csv')
gsva

Unnamed: 0,8,14,38,39,43,47,48,49,50,51,...,5126,5127,5130,5140,5143,5145,5148,5168,5190,5229
22RV1,0.436903,0.000555,-0.026934,-0.236241,-0.001655,0.041915,0.365020,-0.466141,0.147907,0.236512,...,-0.380139,-0.074167,-0.099280,-0.049004,0.478495,-0.016778,0.445481,0.667144,-0.022923,0.222467
42MGBA,-0.097515,-0.062043,0.178450,0.251463,-0.020668,-0.024889,-0.158225,0.336751,0.419054,0.355672,...,0.268576,-0.071268,0.252576,-0.055258,0.045041,-0.091967,0.231136,-0.055885,-0.479137,0.586704
5637,0.477699,-0.024133,-0.138267,0.146309,-0.043991,-0.076821,-0.456752,0.406816,0.288758,-0.044227,...,0.826131,0.830637,0.503483,0.179647,-0.576237,-0.334832,0.187405,-0.200009,0.189729,-0.017562
647V,0.171252,0.339635,0.305165,0.440819,0.177682,0.268122,0.141566,0.356177,0.167331,-0.055671,...,0.706533,0.650215,0.265021,0.272398,-0.639342,-0.147391,-0.217292,0.506946,-0.287527,-0.317061
769P,0.336379,0.402448,0.060402,0.446750,0.260591,0.173967,-0.232140,0.377659,0.161707,-0.154431,...,0.661838,0.718206,0.280468,-0.384513,-0.028050,-0.120623,-0.186102,-0.552712,0.082844,-0.036330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YD8,-0.483967,0.288934,0.001948,0.394939,0.362212,0.318332,0.177699,0.668636,-0.371708,-0.522793,...,0.153489,0.046488,-0.042233,0.073169,-0.415541,0.481167,0.356197,-0.316274,0.419842,0.021585
YH13,-0.326772,0.074079,0.164898,0.041260,0.207960,0.037269,-0.182792,0.370366,-0.053202,-0.041202,...,-0.239143,-0.206505,0.114032,-0.237639,0.167308,-0.008762,0.493482,0.676151,-0.129363,0.479305
YKG1,-0.312181,0.216063,0.214252,-0.218352,0.227827,0.046648,0.127099,0.081832,-0.188130,-0.364585,...,0.339886,0.601853,-0.557499,0.014484,-0.467137,0.433590,-0.026872,0.652544,0.175716,-0.016452
ZR751,-0.521838,0.018009,-0.117763,0.102080,0.062596,0.058556,0.092437,-0.414914,-0.162324,0.193928,...,-0.684734,-0.625831,-0.347253,-0.593862,0.488493,0.649129,-0.383704,0.689982,0.135850,-0.166684
