Step 1: Loading the gene sets

In [1]:
from pathlib import Path
import gseapy as gp
import pandas as pd
import pickle
import anndata
import json


In [2]:
# Project root under the current user's home directory.
BASE_DIR = Path.home().joinpath("Thesis")
# Directory the notebook reads its input files from and writes its exports to.
DATA_DIR = BASE_DIR.joinpath("Data")
CODE_DIR = BASE_DIR.joinpath("Code")
# JSON-lines gene vocabulary; each record is read for "ensembl_id" and "gene_symbol".
gene_vocab_path = DATA_DIR.joinpath("gene_vocabulary.jsonl")

In [3]:

# Build two lookups from the JSON-lines gene vocabulary:
#   ens_to_symbol : upper-cased Ensembl ID -> upper-cased gene symbol
#   valid_symbols : set of all upper-cased gene symbols that were kept
ens_to_symbol = {}
valid_symbols = set()

# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform/locale default used by open().
with open(gene_vocab_path, encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. an extra empty line at the end of the
        # file), which json.loads would otherwise reject with an error.
        if not line.strip():
            continue

        record = json.loads(line)
        ens = record["ensembl_id"].upper()
        symbol = record["gene_symbol"].upper()

        # drop fake symbols (symbol == ensembl)
        if symbol == ens:
            continue

        ens_to_symbol[ens] = symbol
        valid_symbols.add(symbol)

print(f"Valid Ensembl → symbol mappings: {len(ens_to_symbol)}")
print(f"Unique valid gene symbols: {len(valid_symbols)}")


Valid Ensembl → symbol mappings: 41780
Unique valid gene symbols: 41780


In [4]:
def normalize_lncrna_genes_with_stats(genes, ens_to_symbol, valid_symbols):
    """Resolve gene identifiers to known symbols and count what happened.

    Every identifier is upper-cased and then resolved in this order:
    kept unchanged if it is in ``valid_symbols``; otherwise replaced by
    ``ens_to_symbol[identifier]`` if it is a key of that mapping; otherwise
    dropped.

    Args:
        genes: Iterable of identifier strings.
        ens_to_symbol: Mapping from upper-cased identifier to gene symbol.
        valid_symbols: Container of accepted upper-cased gene symbols.

    Returns:
        A ``(symbols, stats)`` tuple. ``symbols`` is the set of resolved
        symbols. ``stats`` is a dict with the keys ``"kept_symbols"``,
        ``"converted_ensembl"`` and ``"dropped"``; the counts are per input
        element, so a repeated identifier is counted every time it occurs
        even though the returned set holds each symbol once.
    """
    resolved = set()
    kept = converted = dropped = 0

    for raw in genes:
        identifier = raw.upper()

        # Already a recognised symbol: keep it as it is.
        if identifier in valid_symbols:
            resolved.add(identifier)
            kept += 1
            continue

        # Known mapping key: substitute the mapped symbol.
        if identifier in ens_to_symbol:
            resolved.add(ens_to_symbol[identifier])
            converted += 1
            continue

        dropped += 1

    stats = {
        "kept_symbols": kept,
        "converted_ensembl": converted,
        "dropped": dropped,
    }
    return resolved, stats


In [5]:
# lncRNA gene-set GMT files, keyed by the source label used in the set keys.
lncrna_files = dict(
    LNCRNA_GO=DATA_DIR / "GO_Biological_Process_2021_lncRNA.gmt",
    LNCRNA_REACTOME=DATA_DIR / "Reactome_2022_lncRNA.gmt",
)

# "<source>::<set name>" -> {"genes": set of resolved symbols, "source": label}
lncrna_sets = {}

# Same keys as lncrna_sets -> per-set counters returned by the normaliser.
conversion_log = {}

for source, path in lncrna_files.items():
    sets = gp.parser.read_gmt(str(path))

    for name, genes in sets.items():
        symbol_genes, stats = normalize_lncrna_genes_with_stats(genes, ens_to_symbol, valid_symbols)

        # Only keep sets in which at least one gene resolved to a symbol.
        if symbol_genes:
            key = f"{source}::{name}"
            lncrna_sets[key] = dict(genes=symbol_genes, source=source)
            conversion_log[key] = stats


In [6]:
# One row per retained lncRNA set; the columns are the normalisation counters.
df_stats = pd.DataFrame.from_dict(conversion_log, orient="index")

# Display the ten sets with the most identifiers converted via ens_to_symbol.
(
    df_stats
    .sort_values(by="converted_ensembl", ascending=False)
    .head(10)
)

Unnamed: 0,kept_symbols,converted_ensembl,dropped
LNCRNA_REACTOME::Disease R-HSA-1643685,664,26,1046
LNCRNA_REACTOME::Metabolism R-HSA-1430728,748,22,1279
LNCRNA_REACTOME::Gene Expression (Transcription) R-HSA-74160,547,22,880
LNCRNA_REACTOME::Metabolism Of Proteins R-HSA-392499,777,21,1092
LNCRNA_REACTOME::RNA Polymerase II Transcription R-HSA-73857,479,21,812
LNCRNA_REACTOME::Generic Transcription Pathway R-HSA-212436,372,17,801
LNCRNA_REACTOME::Post-translational Protein Modification R-HSA-597592,609,16,758
LNCRNA_REACTOME::Signal Transduction R-HSA-162582,584,15,1866
LNCRNA_REACTOME::Immune System R-HSA-168256,571,14,1358
"LNCRNA_GO::positive regulation of transcription, DNA-templated (GO:0045893)",378,14,791


In [7]:
# Pool the genes of every retained lncRNA set into one de-duplicated set.
merged_lncrna_genes = {
    gene
    for entry in lncrna_sets.values()
    for gene in entry["genes"]
}

print(f"Unique lncRNA gene symbols: {len(merged_lncrna_genes)}")


Unique lncRNA gene symbols: 3290


In [8]:
# MSigDB gene-set GMT files, keyed by the source label used in the set keys.
# NOTE(review): the "c5.go" file name looks like the full GO collection rather
# than a biological-process-only subset — confirm that the "GO_BP" label is
# accurate for this file.
msigdb_files = dict(
    GO_BP=DATA_DIR / "c5.go.v2023.1.Hs.symbols.gmt",
    REACTOME=DATA_DIR / "c2.cp.reactome.v2023.1.Hs.symbols.gmt",
)

# "<source>::<set name>" -> {"genes": set of raw identifiers, "source": label}
msigdb_sets = {}

for source, path in msigdb_files.items():
    sets = gp.parser.read_gmt(str(path))
    for name, genes in sets.items():
        msigdb_sets[f"{source}::{name}"] = dict(genes=set(genes), source=source)

In [9]:
# Normalise the MSigDB sets against the same gene vocabulary. The helper is
# reused unchanged here despite its lncRNA-specific name.
msigdb_sets_clean = {}
msigdb_conversion_log = {}

for key, info in msigdb_sets.items():
    symbol_genes, stats = normalize_lncrna_genes_with_stats(info["genes"], ens_to_symbol, valid_symbols)

    # Keep a set only if at least five of its genes resolved to a symbol;
    # smaller (or empty) sets are discarded together with their stats.
    if len(symbol_genes) >= 5:
        msigdb_sets_clean[key] = dict(genes=symbol_genes, source=info["source"])
        msigdb_conversion_log[key] = stats


In [12]:
# Rebind msigdb_sets to the cleaned collection, so the later cells that read
# msigdb_sets see only the normalised, size-filtered sets. After this line the
# raw (un-normalised) dictionary is no longer reachable under this name.
msigdb_sets = msigdb_sets_clean

In [14]:
# Pool the genes of every MSigDB set into one de-duplicated set.
merged_msigdb_genes = {
    gene
    for entry in msigdb_sets.values()
    for gene in entry["genes"]
}

print(f"Unique msigdb gene symbols: {len(merged_msigdb_genes)}")


Unique msigdb gene symbols: 19396


In [15]:
# Combine both collections into one dictionary: MSigDB entries first, then the
# lncRNA entries (which would win if a key were present in both).
all_sets = dict(msigdb_sets)
all_sets.update(lncrna_sets)
len(all_sets)

19926

In [24]:
# Pool the genes of every set in the combined collection into one set.
merged_all_genes = {
    gene
    for entry in all_sets.values()
    for gene in entry["genes"]
}

print(f"Unique all gene symbols: {len(merged_all_genes)}")


Unique all gene symbols: 22607


In [None]:
# Preview the first few entries only: displaying the whole dictionary would
# flood the notebook output with every retained set and its full gene list.
dict(list(lncrna_sets.items())[:3])

In [27]:
lncrna_txt_path = DATA_DIR / "lncrna_sets.txt"

# One line per set, tab-separated: set key, source label, and the set's genes
# sorted and joined with commas.
with lncrna_txt_path.open("w") as f:
    for name, info in lncrna_sets.items():
        genes_str = ",".join(sorted(info["genes"]))
        f.write(f"{name}\t{info['source']}\t{genes_str}\n")

print(f"lncrna_sets saved to {lncrna_txt_path}")

lncrna_sets saved to /home/a/aangelopa/Thesis/Data/lncrna_sets.txt


In [28]:
msigdb_txt_path = DATA_DIR / "msigdb_sets.txt"

# One line per cleaned set, tab-separated: set key, source label, and the
# set's genes sorted and joined with commas.
with msigdb_txt_path.open("w") as f:
    for name, info in msigdb_sets_clean.items():
        genes_str = ",".join(sorted(info["genes"]))
        f.write(f"{name}\t{info['source']}\t{genes_str}\n")

print(f"msigdb_sets saved to {msigdb_txt_path}")


msigdb_sets saved to /home/a/aangelopa/Thesis/Data/msigdb_sets.txt


In [26]:
# Final overview of the gene and set counts computed in the cells above.
print(
    "Summary: \n"
    f" lncRNA genes: {len(merged_lncrna_genes)} \n"
    f" lncRNA sets: {len(lncrna_sets)} \n"
    f" msigdb genes: {len(merged_msigdb_genes)} \n"
    f" all unique genes {len(merged_all_genes)}  "
)

Summary: 
 lncRNA genes: 3290 
 lncRNA sets: 7743 
 msigdb genes: 19396 
 all unique genes 22607  
