In [1]:
import torch
from cellmaps_imagedownloader.runner import CellmapsImageDownloader
from cellmaps_imagedownloader.runner import MultiProcessImageDownloader
from cellmaps_imagedownloader.gene import ImageGeneNodeAttributeGenerator as IGen 
from cellmaps_imagedownloader.proteinatlas import ProteinAtlasReader, ProteinAtlasImageUrlReader, ImageDownloadTupleGenerator
import json
import os
import pandas as pd
from glob import glob
#import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import json
from collections import Counter
import requests

In [None]:
ls

In [18]:
BASE_PATH = "data_green"
# CHANNELS = ["blue", "green", "red", "yellow"]
CHANNELS = ["green"]

In [19]:
def collect_image_paths(base_path=BASE_PATH):
    records = []
    for treatment_folder in os.listdir(base_path):
        treatment_path = os.path.join(base_path, treatment_folder)
        if not os.path.isdir(treatment_path):
            continue

        treatment = treatment_folder.split("-")[-1].lower()

        image_dict = {}
        for channel in CHANNELS:
            channel_path = os.path.join(treatment_path, channel)
            for img_path in glob(os.path.join(channel_path, "*.jpg")):
                # Extract base ID (strip _blue, _red, etc.)
                basename = os.path.basename(img_path).replace(f"_{channel}.jpg", "")
                image_dict.setdefault(basename, {"id": basename, "treatment": treatment})
                image_dict[basename][channel] = img_path

        records.extend(image_dict.values())

    return pd.DataFrame(records)

In [20]:
def load_rocrate_metadata_with_antibodies(base_path=BASE_PATH):
    print('base path: ',base_path)
    
    metadata_records = []

    for treatment_folder in os.listdir(base_path):
        crate_path = os.path.join(base_path, treatment_folder, "ro-crate-metadata.json")
        if not os.path.isfile(crate_path):
            continue

        with open(crate_path, "r") as f:
            crate = json.load(f)

        # --- Build antibody/stain index ---
        antibody_index = {}
        for entry in crate.get("@graph", []):
            if entry.get("@type") == "BioChemEntity":
                stain_id = entry["@id"]
                identifiers = entry.get("identifier", [])
                if isinstance(identifiers, dict):
                    identifiers = [identifiers]

                id_map = {i.get("name"): i.get("value") for i in identifiers}

                antibody_index[stain_id] = {
                    "name": entry.get("name"),
                    "description": entry.get("description"),
                    "hpa_id": id_map.get("HPA Antibody ID"),
                    "ensembl": id_map.get("ENSEMBL"),
                    "uniprot": id_map.get("Uniprot"),
                    "pubchem": id_map.get("PubChem"),
                    "subcellular_location": (
                        entry.get("isLocatedInSubcellularLocation", {}).get("name")
                        if isinstance(entry.get("isLocatedInSubcellularLocation"), dict)
                        else None
                    )
                }

        # --- Process each dataset (image) entry ---
        for entry in crate.get("@graph", []):
            if entry.get("@type") != "EVI:Dataset":
                continue

            content_url = entry.get("contentUrl", "")
            filename = os.path.basename(content_url.replace("file://", "")).strip("/")
            if not filename.endswith(".jpg"):
                continue

            base_id = filename.replace(".jpg", "").rsplit("_", 1)[0]
            channel = filename.replace(".jpg", "").rsplit("_", 1)[-1].lower()

            stain_ref = entry.get("usedStain", {}).get("@id", "")
            stain_key = stain_ref.split("/")[-1].replace("stain-", "")
            ab_meta = antibody_index.get(stain_ref, {})

            metadata_records.append({
                "id": base_id,
                "channel": channel,
                "antibody_stain": stain_key,
                "antibody_name": ab_meta.get("name"),
                "antibody_hpa_id": ab_meta.get("hpa_id"),
                "antibody_ensembl": ab_meta.get("ensembl"),
                "antibody_uniprot": ab_meta.get("uniprot"),
                "antibody_pubchem": ab_meta.get("pubchem"),
                "subcellular_location": ab_meta.get("subcellular_location"),
                "cell_line": entry.get("usedCellLine", {}).get("@id", "").split("/")[-1].replace("cell-line-", ""),
                "treatment": entry.get("usedTreatment", {}).get("@id", "").split("/")[-1].replace("treatment-", ""),
                "description": entry.get("description", ""),
                "filename": filename
            })

    return pd.DataFrame(metadata_records)

In [21]:
def batch_lookup_ensembl_symbols(ensembl_ids, batch_size=1000):
    """
    Look up gene symbols from Ensembl using batched POST requests.
    Returns a dict {ensembl_id: gene_symbol}
    """
    url = "https://rest.ensembl.org/lookup/id"
    headers = {"Content-Type": "application/json"}
    id_to_symbol = {}

    for i in range(0, len(ensembl_ids), batch_size):
        batch = ensembl_ids[i:i + batch_size]
        payload = {"ids": batch}
        try:
            response = requests.post(url, headers=headers, json=payload)
            if response.status_code == 200:
                results = response.json()
                for eid, info in results.items():
                    id_to_symbol[eid] = info.get("display_name", None)
            else:
                print(f"⚠️ Error {response.status_code}: {response.text}")
        except Exception as e:
            print(f"⚠️ Request failed for batch starting at {i}: {e}")
    
    return id_to_symbol

In [22]:
def load_multichannel_image(row):
    """
    Loads a 4-channel immunofluorescence image from separate grayscale files.

    Args:
        row (pd.Series): A row from df_images with keys: blue, green, red, yellow.

    Returns:
        np.ndarray: H x W x 4 array with channels in the order [blue, green, red, yellow]
    """
    img_channels = []
#     for ch in ["blue", "green", "red", "yellow"]:
    for ch in ["green"]:    
        path = row[ch]
        img = Image.open(path).convert("L")  # Load as 8-bit grayscale
        img_array = np.array(img)
        img_channels.append(img_array)

    stacked = np.stack(img_channels, axis=-1)  # Shape: H x W x 4
    return stacked


In [23]:
def print_summary_report(df_merged, n_jobs=4):
    print("🧬🔬 CM4AI Immunofluorescence Dataset Summary\n" + "="*45, flush=True)

    # 1. Number of treatments
    n_treatments = df_merged["treatment"].nunique()
    print(f"\n💊 Number of treatments: {n_treatments}", flush=True)
    for cond, count in df_merged["treatment"].value_counts().items():
        print(f"  - {cond}: {count} image-channel combinations", flush=True)

    # 2. Number of samples (unique image IDs) per treatment
    print("\n🧪 Number of unique samples per treatment:", flush=True)
    samples_per_treatment = (
        df_merged[["id", "treatment"]]
        .drop_duplicates()
        .groupby("treatment")
        .size()
    )
    for cond, count in samples_per_treatment.items():
        print(f"  - {cond}: {count} samples", flush=True)

    # 3. Image size distribution (parallelized)
    print("\n🖼 Image size distribution:", flush=True)

    # Reconstruct wide format for loading multichannel images
    df_channels = df_merged[["id", "channel", "filepath"]].drop_duplicates()
    df_shapes = df_channels.pivot(index="id", columns="channel", values="filepath").reset_index()
    df_treatments = df_merged[["id", "treatment"]].drop_duplicates()
    df_shapes = df_shapes.merge(df_treatments, on="id", how="left")

    def safe_load_shape(row):
        try:
            img = load_multichannel_image(row)
            return img.shape[:2]
        except Exception as e:
            print(f"  ⚠️ Error loading image for ID {row['id']}: {e}", flush=True)
            return None

    print("🔄 Computing image shapes in parallel...", flush=True)
    shapes = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(safe_load_shape)(row) for _, row in tqdm(df_shapes.iterrows(), total=len(df_shapes))
    )
    df_shapes["shape"] = shapes
    shape_counts = Counter([s for s in shapes if s is not None])
    for shape, count in shape_counts.items():
        print(f"  - {shape[0]}x{shape[1]}: {count} composite/multi-channel images", flush=True)

    # 4. Green channel antibody diversity
    green_df = df_merged[df_merged["channel"] == "green"]
    unique_green = sorted(set(green_df["antibody_hpa_id"].dropna().tolist()))
    print(f"\n🟩 Number of unique antibodies in green channel (protein target): {len(unique_green)}", flush=True)

    # 5. Red, Blue, Yellow antibody/stain names with icons
    print("\n🎯 Antibodies/stains used in other channels:", flush=True)

#     channel_icons = {
#         "red": "🟥",
#         "blue": "🟦",
#         "yellow": "🟨"
#     }

#     for ch in ["red", "blue", "yellow"]:
#         ch_df = df_merged[df_merged["channel"] == ch]
#         unique_ab = sorted(set(
#             ch_df["antibody_hpa_id"].dropna().tolist() +
#             ch_df["antibody_name"].dropna().tolist()
#         ))
#         icon = channel_icons.get(ch, "🔹")
#         print(f"\n  {icon} {ch.upper()} channel antibodies/stains ({len(unique_ab)}):", flush=True)
#         for ab in unique_ab:
#             print(f"    - {ab}", flush=True)

    print("\n✅ Summary complete.\n", flush=True)

In [24]:
def save_image_gene_node_attributes(df_merged, base_output_dir="data/raw"):
    print('save_image_gene_node')
    # Filter to green channel (protein target)
    df_green = df_merged[df_merged["channel"] == "green"].copy()

    # Normalize treatment label: "control" becomes "untreated"
    df_green["treatment"] = df_green["treatment"].replace("control", "untreated")

    # Drop exact duplicates across key fields
    df_green = df_green.drop_duplicates(subset=["id", "treatment", "antibody_hpa_id", "antibody_ensembl"])

    # Group by treatment
    treatments = df_green["treatment"].dropna().unique()

    for treatment in treatments:
        print(treatment)
        
        df_t = df_green[df_green["treatment"] == treatment]

        df_out = pd.DataFrame({
            "name": df_t["antibody_name"],
            "represents": "ensembl:" + df_t["antibody_ensembl"].fillna(""),
            "ambiguous": df_t["antibody_hpa_id"],
            "antibody": df_t["antibody_hpa_id"],
            "filename": df_t["id"].astype(str) + "_",
            "imageurl": "no image url found"
        })

        unique_ensembl_ids = (
            df_out["represents"]
            .dropna()
            .str.replace("ensembl:", "", regex=False)
            .loc[lambda s: s.str.match(r"ENSG\d+")]  # keep only valid Ensembl Gene IDs
            .unique()
            .tolist()
        )

        ensembl_to_name = batch_lookup_ensembl_symbols(unique_ensembl_ids)

        df_out["name"] = (
            df_out["represents"]
            .str.replace("ensembl:", "", regex=False)
            .map(ensembl_to_name)
        )

        df_out["name"] = df_out["name"].fillna("NEGATIVE")

        # Save to the appropriate treatment folder
        treatment_folder = os.path.join(base_output_dir, treatment)
        os.makedirs(treatment_folder, exist_ok=True)

        out_path = os.path.join(treatment_folder, "1_image_gene_node_attributes.tsv")
        df_out.to_csv(out_path, sep="\t", index=False)

        print(f"✅ Saved: {out_path}")

In [25]:
df_images = collect_image_paths()
df_images

Unnamed: 0,id,treatment,green
0,B2AI_2_untreated_D7_R12_z00,untreated,data_green/untreated/green/B2AI_2_untreated_D7...
1,B2AI_2_untreated_F6_R5_z01,untreated,data_green/untreated/green/B2AI_2_untreated_F6...
2,B2AI_5_untreated_B3_R12_z00,untreated,data_green/untreated/green/B2AI_5_untreated_B3...
3,B2AI_4_untreated_F8_R8_z02,untreated,data_green/untreated/green/B2AI_4_untreated_F8...
4,B2AI_5_untreated_F12_R3_z01,untreated,data_green/untreated/green/B2AI_5_untreated_F1...
...,...,...,...
12855,B2AI_3_Paclitaxel_G6_R5_z00,paclitaxel,data_green/paclitaxel/green/B2AI_3_Paclitaxel_...
12856,B2AI_2_Paclitaxel_H11_R16_z00,paclitaxel,data_green/paclitaxel/green/B2AI_2_Paclitaxel_...
12857,B2AI_5_Paclitaxel_D3_R2_z01,paclitaxel,data_green/paclitaxel/green/B2AI_5_Paclitaxel_...
12858,B2AI_3_Paclitaxel_D12_R14_z00,paclitaxel,data_green/paclitaxel/green/B2AI_3_Paclitaxel_...


In [26]:
with pd.option_context('display.max_colwidth', None):
    display(df_images[df_images.duplicated(subset="id", keep=False)].head())

Unnamed: 0,id,treatment,green
3880,B2AI_3_untreated_C2_R3_z01,untreated,data_green/untreated/green/B2AI_3_untreated_C2_R3_z01_green.jpg
12743,B2AI_3_untreated_C2_R3_z01,paclitaxel,data_green/paclitaxel/green/B2AI_3_untreated_C2_R3_z01_green.jpg


In [27]:
df_meta = load_rocrate_metadata_with_antibodies()
df_images_melted = df_images.melt(
    id_vars=["id"],  # remove "treatment" here
#     value_vars=["blue", "green", "red", "yellow"],
    value_vars=["green"],
    var_name="channel",
    value_name="filepath"
)

df_merged = df_images_melted.merge(df_meta, on=["id", "channel"], how="left")
print(df_merged)

base path:  data_green
                                  id channel  \
0        B2AI_2_untreated_D7_R12_z00   green   
1         B2AI_2_untreated_F6_R5_z01   green   
2        B2AI_5_untreated_B3_R12_z00   green   
3         B2AI_4_untreated_F8_R8_z02   green   
4        B2AI_5_untreated_F12_R3_z01   green   
...                              ...     ...   
12855    B2AI_3_Paclitaxel_G6_R5_z00   green   
12856  B2AI_2_Paclitaxel_H11_R16_z00   green   
12857    B2AI_5_Paclitaxel_D3_R2_z01   green   
12858  B2AI_3_Paclitaxel_D12_R14_z00   green   
12859    B2AI_4_Paclitaxel_F8_R1_z01   green   

                                                filepath  \
0      data_green/untreated/green/B2AI_2_untreated_D7...   
1      data_green/untreated/green/B2AI_2_untreated_F6...   
2      data_green/untreated/green/B2AI_5_untreated_B3...   
3      data_green/untreated/green/B2AI_4_untreated_F8...   
4      data_green/untreated/green/B2AI_5_untreated_F1...   
...                                     

In [28]:
save_image_gene_node_attributes(df_merged, base_output_dir=BASE_PATH)

save_image_gene_node
untreated
✅ Saved: data_green/untreated/1_image_gene_node_attributes.tsv
vorinostat
✅ Saved: data_green/vorinostat/1_image_gene_node_attributes.tsv
paclitaxel
✅ Saved: data_green/paclitaxel/1_image_gene_node_attributes.tsv


In [31]:
import os

from cellmaps_image_embedding.runner import DensenetEmbeddingGenerator
from cellmaps_image_embedding.runner import CellmapsImageEmbedder

input_base_path = "data_green"
image_interim_base_path = "pipeline_images"
embedding_base_path = "embedding_old_green"

for treatment_folder in os.listdir(input_base_path):
    input_path = os.path.join(input_base_path, treatment_folder)
    if not os.path.isdir(input_path):
        continue
    manifest_path = os.path.join(input_path, "manifest.csv")
    image_interim_path = os.path.join(image_interim_base_path, treatment_folder)
    embedding_path = os.path.join(embedding_base_path, treatment_folder)

    gen = DensenetEmbeddingGenerator(
        input_path,
        outdir=embedding_path,
        model_path="https://github.com/CellProfiling/densenet/releases/download/v0.1.0/external_crop512_focal_slov_hardlog_class_densenet121_dropout_i768_aug2_5folds_fold0_final.pth",
        fold=1
    )
    embedder = CellmapsImageEmbedder(
        outdir=embedding_path,
        inputdir=input_path,
        embedding_generator=gen,
        name=f"{treatment_folder} IF Embedding",
        organization_name="CM4AI",
        project_name="CM4AI IF Embedding Tutorial"
    )
    embedder.run()

The project name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/untreated is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
The organization name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/untreated is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
Downloading external_crop512_focal_slov_hardlog_class_densenet121_dropout_i768_aug2_5folds_fold0_final.pth: 100%|██████████| 66.1M/66.1M [00:00<00:00, 80.6MB/s]


load model: /data/user/home/hnguye24/cm4ai/embedding_old_green/untreated/model.pth


100%|██████████| 3976/3976 [2:31:17<00:00,  2.28s/it]  
The project name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/vorinostat is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
The organization name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/vorinostat is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
Downloading external_crop512_focal_slov_hardlog_class_densenet121_dropout_i768_aug2_5folds_fold0_final.pth: 100%|██████████| 66.1M/66.1M [00:00<00:00, 83.4MB/s]


load model: /data/user/home/hnguye24/cm4ai/embedding_old_green/vorinostat/model.pth


100%|██████████| 4462/4462 [1:25:05<00:00,  1.14s/it]  
The project name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/paclitaxel is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
The organization name for RO-Crate /data/user/home/hnguye24/cm4ai/data_green/paclitaxel is missing from the metadata. Please provide a name to uphold FAIR principles. Execution will proceed without the  name.
Downloading external_crop512_focal_slov_hardlog_class_densenet121_dropout_i768_aug2_5folds_fold0_final.pth: 100%|██████████| 66.1M/66.1M [00:00<00:00, 85.9MB/s]


load model: /data/user/home/hnguye24/cm4ai/embedding_old_green/paclitaxel/model.pth


100%|██████████| 4422/4422 [36:21<00:00,  2.03it/s]


In [32]:
import pandas as pd

# Step 1: Load TSV file
input_file = "embedding_old_green/vorinostat/image_emd.tsv"   # replace with your filename
df = pd.read_csv(input_file, sep="\t")

# Step 2: Compute mean per gene (assumes first column is "id" or "Gene")
mean_df = df.groupby(df.columns[0]).mean(numeric_only=True).reset_index()

# Step 3: Save to a new file
output_file = "embedding_old_green/vorinostat/gene_means.tsv"
mean_df.to_csv(output_file, sep="\t", index=False)

print("Mean values saved to:", output_file)

Mean values saved to: embedding_old_green/vorinostat/gene_means.tsv


In [None]:
rm -r data/.ipynb_checkpoints

In [33]:
from cellmaps_ppi_embedding.runner import Node2VecEmbeddingGenerator
from cellmaps_ppi_embedding.runner import CellMapsPPIEmbedder
import networkx as nx

In [None]:
inputdir = '1.ppi_download'
outdir = '2.ppi_embedding'
gen = Node2VecEmbeddingGenerator(nx_network=nx.read_edgelist(CellMapsPPIEmbedder.get_apms_edgelist_file(inputdir),
                                                             delimiter='\t'))

x =CellMapsPPIEmbedder(outdir=outdir,
                       embedding_generator=gen,
                      inputdir=inputdir)
x.run()

In [36]:
# cell map paclitaxel

from cellmaps_coembedding.runner import MuseCoEmbeddingGenerator
from cellmaps_coembedding.runner import CellmapsCoEmbedder

ppi_embeddingdir = '2.ppi_embedding'
image_embeddingdir = 'embedding_old_green/vorinostat'
outdir = '3_old_green.coembedding_vorinostat'
gen = MuseCoEmbeddingGenerator(ppi_embeddingdir=ppi_embeddingdir,
                               image_embeddingdir=image_embeddingdir,
                               outdir=os.path.abspath(outdir))

x = CellmapsCoEmbedder(outdir=outdir,
                      inputdirs=[ppi_embeddingdir, image_embeddingdir],
                      embedding_generator=gen)
x.run()

Saving embedding: 0it [00:00, ?it/s]

Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.25135207176208496 seconds
Jaccard graph constructed in 1.012181282043457 seconds
Wrote graph to binary file in 0.0030052661895751953 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.66654
After 5 runs, maximum modularity is Q = 0.667759
Louvain completed 25 runs in 0.1549065113067627 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 2.418247699737549 seconds
Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.01608872413635254 seconds
Jaccard graph constructed in 0.9773478507995605 seconds
Wrote graph to binary file in 0.002326488494873047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.688086
Louvain completed 21 runs in 0.0819997787475586 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 2.0767061710357666 seconds

Saving embedding: 97it [00:34,  2.85it/s]


0

In [37]:
import csv
import numpy as np
from collections import defaultdict
input_file = "embedding_old_green/vorinostat/image_emd.tsv"
# Store embeddings per gene
data = defaultdict(list)
# Read the original file
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    header = next(reader) # store header
    for row in reader:
        gene_id = row[0]
        embedding = list(map(float, row[1:]))
        data[gene_id].append(embedding)
# Compute mean and overwrite the same file
with open(input_file, "w", newline='') as out:
    writer = csv.writer(out, delimiter='\t')
    writer.writerow(["id"] + header[1:]) # write header back
    for gene_id, vectors in data.items():
        arr = np.array(vectors)
        mean_vector = np.mean(arr, axis=0)
        writer.writerow([gene_id] + mean_vector.tolist())

In [38]:
import csv
input_file = "embedding_old_green/vorinostat/image_emd.tsv"
unique_genes = set()
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader) # skip header
    for row in reader:
        gene_id = row[0]
        unique_genes.add(gene_id)
print(f"Number of unique genes: {len(unique_genes)}")

Number of unique genes: 461


In [40]:
from cellmaps_generate_hierarchy.ppi import CosineSimilarityPPIGenerator
from cellmaps_generate_hierarchy.hierarchy import CDAPSHiDeFHierarchyGenerator
from cellmaps_generate_hierarchy.maturehierarchy import HiDeFHierarchyRefiner
from cellmaps_generate_hierarchy.hcx import HCXFromCDAPSCXHierarchy
from cellmaps_generate_hierarchy.runner import CellmapsGenerateHierarchy

inputdir = '3_old_green.coembedding_vorinostat'
outdir = '5.2_old_green_hierarchy_vorinostat'
ppigen = CosineSimilarityPPIGenerator(embeddingdirs=[inputdir])

refiner = HiDeFHierarchyRefiner()

converter = HCXFromCDAPSCXHierarchy()

hiergen = CDAPSHiDeFHierarchyGenerator(refiner=refiner,
                                       hcxconverter=converter)

x = CellmapsGenerateHierarchy(outdir=outdir,
                              inputdirs=inputdir,
                              ppigen=ppigen,
                              hiergen=hiergen)
x.run()

Generating hierarchy: 15it [00:00, 53.13it/s]

Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX





0

In [41]:
import os
import ndex2
from ndex2.cx2 import RawCX2NetworkFactory
from cellmaps_generate_hierarchy.ndexupload import NDExHierarchyUploader

#Specify NDEx server
ndexserver = 'public.ndexbio.org'
ndexuser = ''
ndexpassword = ''
    
# Specify paths to hierarchy and its parent (you can find example files in examples directory in cellmaps_generate_hierarchy_repo)
hierarchy_path = './5.2_old_green_hierarchy_vorinostat/hierarchy.cx2'
parent_network_path = './5.2_old_green_hierarchy_vorinostat/hierarchy_parent.cx2'

# Load the hierarchy and parent network CX2 files into network objects
factory = RawCX2NetworkFactory()
hierarchy_network = factory.get_cx2network(hierarchy_path)
parent_network = factory.get_cx2network(parent_network_path)

# Initialize NDExHierarchyUploader with the specified NDEx server and credentials
uploader = NDExHierarchyUploader(ndexserver, ndexuser, ndexpassword, visibility=True)

# Upload the hierarchy and parent network to NDEx
parent_uuid, parenturl, hierarchy_uuid, hierarchyurl = uploader.save_hierarchy_and_parent_network(hierarchy_network, parent_network)

print(f"Parent network UUID is {parent_uuid} and its URL in NDEx is {parenturl}")
print(f"Hierarchy network UUID is {hierarchy_uuid} and its URL in NDEx is {hierarchyurl}")

# # Another option is to just specify the directory where the files are placed
# _, _, _, hierarchyurl = uploader.upload_hierary_and_parent_network_from_files('./examples/')
# print(f'Hierarchy uploaded. To view the hierarchy, paste this URL in your browser: {hierarchyurl}')

Parent network UUID is 8b7f887a-911f-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/8b7f887a-911f-11f0-a218-005056ae3c32
Hierarchy network UUID is 8b9c112c-911f-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/8b9c112c-911f-11f0-a218-005056ae3c32


In [43]:
import pandas as pd

# Step 1: Load TSV file
input_file = "embedding_old_green/untreated/image_emd.tsv"   # replace with your filename
df = pd.read_csv(input_file, sep="\t")

# Step 2: Compute mean per gene (assumes first column is "id" or "Gene")
mean_df = df.groupby(df.columns[0]).mean(numeric_only=True).reset_index()

# Step 3: Save to a new file
output_file = "embedding_old_green/untreated/gene_means.tsv"
mean_df.to_csv(output_file, sep="\t", index=False)

print("Mean values saved to:", output_file)

Mean values saved to: embedding_old_green/untreated/gene_means.tsv


In [44]:
import os

# Paths
old_dir = "embedding_old_green/untreated"
old_image = os.path.join(old_dir, "image_emd.tsv")
new_image = os.path.join(old_dir, "image_emd2.tsv")
gene_means = os.path.join(old_dir, "gene_means.tsv")
new_gene_means = os.path.join(old_dir, "image_emd.tsv")

# Rename image_emd.tsv → image_emd2.tsv
os.rename(old_image, new_image)

# Rename gene_means.tsv → image_emd.tsv
os.rename(gene_means, new_gene_means)

print("Files renamed successfully.")


Files renamed successfully.


In [45]:
# cell map untreated

from cellmaps_coembedding.runner import MuseCoEmbeddingGenerator
from cellmaps_coembedding.runner import CellmapsCoEmbedder

ppi_embeddingdir = '2.ppi_embedding'
image_embeddingdir = 'embedding_old_green/untreated'
outdir = '3_old_green.coembedding_untreated'
gen = MuseCoEmbeddingGenerator(ppi_embeddingdir=ppi_embeddingdir,
                               image_embeddingdir=image_embeddingdir,
                               outdir=os.path.abspath(outdir))

x = CellmapsCoEmbedder(outdir=outdir,
                      inputdirs=[ppi_embeddingdir, image_embeddingdir],
                      embedding_generator=gen)
x.run()

Saving embedding: 0it [00:00, ?it/s]

Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.028494834899902344 seconds
Jaccard graph constructed in 1.2302742004394531 seconds
Wrote graph to binary file in 0.0028908252716064453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.66567
After 2 runs, maximum modularity is Q = 0.666853
After 4 runs, maximum modularity is Q = 0.668028
Louvain completed 24 runs in 0.10589838027954102 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 2.5843708515167236 seconds
Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.02761077880859375 seconds
Jaccard graph constructed in 1.2105543613433838 seconds
Wrote graph to binary file in 0.001458883285522461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.679937
After 3 runs, maximum modularity is Q = 0.681284
Louvain completed 23 runs in 0.09343242645263672 

Saving embedding: 97it [00:30,  3.17it/s]


0

In [46]:
import csv
import numpy as np
from collections import defaultdict
input_file = "embedding_old_green/untreated/image_emd.tsv"
# Store embeddings per gene
data = defaultdict(list)
# Read the original file
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    header = next(reader) # store header
    for row in reader:
        gene_id = row[0]
        embedding = list(map(float, row[1:]))
        data[gene_id].append(embedding)
# Compute mean and overwrite the same file
with open(input_file, "w", newline='') as out:
    writer = csv.writer(out, delimiter='\t')
    writer.writerow(["id"] + header[1:]) # write header back
    for gene_id, vectors in data.items():
        arr = np.array(vectors)
        mean_vector = np.mean(arr, axis=0)
        writer.writerow([gene_id] + mean_vector.tolist())

In [47]:
import csv
input_file = "embedding_old_green/untreated/image_emd.tsv"
unique_genes = set()
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader) # skip header
    for row in reader:
        gene_id = row[0]
        unique_genes.add(gene_id)
print(f"Number of unique genes: {len(unique_genes)}")

Number of unique genes: 461


In [48]:
from cellmaps_generate_hierarchy.ppi import CosineSimilarityPPIGenerator
from cellmaps_generate_hierarchy.hierarchy import CDAPSHiDeFHierarchyGenerator
from cellmaps_generate_hierarchy.maturehierarchy import HiDeFHierarchyRefiner
from cellmaps_generate_hierarchy.hcx import HCXFromCDAPSCXHierarchy
from cellmaps_generate_hierarchy.runner import CellmapsGenerateHierarchy

inputdir = '3_old_green.coembedding_untreated'
outdir = '5.2_old_green_hierarchy_untreated'
ppigen = CosineSimilarityPPIGenerator(embeddingdirs=[inputdir])

refiner = HiDeFHierarchyRefiner()

converter = HCXFromCDAPSCXHierarchy()

hiergen = CDAPSHiDeFHierarchyGenerator(refiner=refiner,
                                       hcxconverter=converter)

x = CellmapsGenerateHierarchy(outdir=outdir,
                              inputdirs=inputdir,
                              ppigen=ppigen,
                              hiergen=hiergen)
x.run()

Generating hierarchy: 15it [00:00, 85.55it/s] 


Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX




0

In [49]:
import os
import ndex2
from ndex2.cx2 import RawCX2NetworkFactory
from cellmaps_generate_hierarchy.ndexupload import NDExHierarchyUploader

#Specify NDEx server
ndexserver = 'public.ndexbio.org'
ndexuser = ''
ndexpassword = ''
    
# Specify paths to hierarchy and its parent (you can find example files in examples directory in cellmaps_generate_hierarchy_repo)
hierarchy_path = './5.2_old_green_hierarchy_untreated/hierarchy.cx2'
parent_network_path = './5.2_old_green_hierarchy_untreated/hierarchy_parent.cx2'

# Load the hierarchy and parent network CX2 files into network objects
factory = RawCX2NetworkFactory()
hierarchy_network = factory.get_cx2network(hierarchy_path)
parent_network = factory.get_cx2network(parent_network_path)

# Initialize NDExHierarchyUploader with the specified NDEx server and credentials
uploader = NDExHierarchyUploader(ndexserver, ndexuser, ndexpassword, visibility=True)

# Upload the hierarchy and parent network to NDEx
parent_uuid, parenturl, hierarchy_uuid, hierarchyurl = uploader.save_hierarchy_and_parent_network(hierarchy_network, parent_network)

print(f"Parent network UUID is {parent_uuid} and its URL in NDEx is {parenturl}")
print(f"Hierarchy network UUID is {hierarchy_uuid} and its URL in NDEx is {hierarchyurl}")

# # Another option is to just specify the directory where the files are placed
# _, _, _, hierarchyurl = uploader.upload_hierary_and_parent_network_from_files('./examples/')
# print(f'Hierarchy uploaded. To view the hierarchy, paste this URL in your browser: {hierarchyurl}')

Parent network UUID is 6205f09e-9121-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/6205f09e-9121-11f0-a218-005056ae3c32
Hierarchy network UUID is 62218ef0-9121-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/62218ef0-9121-11f0-a218-005056ae3c32


In [50]:
import pandas as pd

# Step 1: Load TSV file
input_file = "embedding_old_green/paclitaxel/image_emd.tsv"   # replace with your filename
df = pd.read_csv(input_file, sep="\t")

# Step 2: Compute mean per gene (assumes first column is "id" or "Gene")
mean_df = df.groupby(df.columns[0]).mean(numeric_only=True).reset_index()

# Step 3: Save to a new file
output_file = "embedding_old_green/paclitaxel/gene_means.tsv"
mean_df.to_csv(output_file, sep="\t", index=False)

print("Mean values saved to:", output_file)

Mean values saved to: embedding_old_green/paclitaxel/gene_means.tsv


In [51]:
import os

# Paths
old_dir = "embedding_old_green/paclitaxel"
old_image = os.path.join(old_dir, "image_emd.tsv")
new_image = os.path.join(old_dir, "image_emd2.tsv")
gene_means = os.path.join(old_dir, "gene_means.tsv")
new_gene_means = os.path.join(old_dir, "image_emd.tsv")

# Rename image_emd.tsv → image_emd2.tsv
os.rename(old_image, new_image)

# Rename gene_means.tsv → image_emd.tsv
os.rename(gene_means, new_gene_means)

print("Files renamed successfully.")


Files renamed successfully.


In [56]:
# cell map vorinostat

from cellmaps_coembedding.runner import MuseCoEmbeddingGenerator
from cellmaps_coembedding.runner import CellmapsCoEmbedder

ppi_embeddingdir = '2.ppi_embedding'
image_embeddingdir = 'embedding_old_green/paclitaxel'
outdir = '3_old_green.coembedding_paclitaxel'
gen = MuseCoEmbeddingGenerator(ppi_embeddingdir=ppi_embeddingdir,
                               image_embeddingdir=image_embeddingdir,
                               outdir=os.path.abspath(outdir))

x = CellmapsCoEmbedder(outdir=outdir,
                      inputdirs=[ppi_embeddingdir, image_embeddingdir],
                      embedding_generator=gen)
x.run()

Saving embedding: 0it [00:00, ?it/s]

Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.04577040672302246 seconds
Jaccard graph constructed in 1.2841176986694336 seconds
Wrote graph to binary file in 0.0017898082733154297 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.664183
After 3 runs, maximum modularity is Q = 0.66654
After 8 runs, maximum modularity is Q = 0.667759
Louvain completed 28 runs in 0.11507487297058105 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 2.7138445377349854 seconds
Finding 10 nearest neighbors using cosine metric and 'brute' algorithm
Neighbors computed in 0.016773223876953125 seconds
Jaccard graph constructed in 1.263922929763794 seconds
Wrote graph to binary file in 0.0016999244689941406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.661529
After 2 runs, maximum modularity is Q = 0.663141
After 5 runs, maximum modularity is Q = 0.664297


Saving embedding: 97it [00:31,  3.07it/s]


0

In [57]:
import csv
import numpy as np
from collections import defaultdict
input_file = "embedding_old_green/paclitaxel/image_emd.tsv"
# Store embeddings per gene
data = defaultdict(list)
# Read the original file
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    header = next(reader) # store header
    for row in reader:
        gene_id = row[0]
        embedding = list(map(float, row[1:]))
        data[gene_id].append(embedding)
# Compute mean and overwrite the same file
with open(input_file, "w", newline='') as out:
    writer = csv.writer(out, delimiter='\t')
    writer.writerow(["id"] + header[1:]) # write header back
    for gene_id, vectors in data.items():
        arr = np.array(vectors)
        mean_vector = np.mean(arr, axis=0)
        writer.writerow([gene_id] + mean_vector.tolist())

In [58]:
import csv
input_file = "embedding_old_green/paclitaxel/image_emd.tsv"
unique_genes = set()
with open(input_file, "r") as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader) # skip header
    for row in reader:
        gene_id = row[0]
        unique_genes.add(gene_id)
print(f"Number of unique genes: {len(unique_genes)}")

Number of unique genes: 461


In [60]:
from cellmaps_generate_hierarchy.ppi import CosineSimilarityPPIGenerator
from cellmaps_generate_hierarchy.hierarchy import CDAPSHiDeFHierarchyGenerator
from cellmaps_generate_hierarchy.maturehierarchy import HiDeFHierarchyRefiner
from cellmaps_generate_hierarchy.hcx import HCXFromCDAPSCXHierarchy
from cellmaps_generate_hierarchy.runner import CellmapsGenerateHierarchy

inputdir = '3_old_green.coembedding_paclitaxel'
outdir = '5.2_old_green_hierarchy_paclitaxel'
ppigen = CosineSimilarityPPIGenerator(embeddingdirs=[inputdir])

refiner = HiDeFHierarchyRefiner()

converter = HCXFromCDAPSCXHierarchy()

hiergen = CDAPSHiDeFHierarchyGenerator(refiner=refiner,
                                       hcxconverter=converter)

x = CellmapsGenerateHierarchy(outdir=outdir,
                              inputdirs=inputdir,
                              ppigen=ppigen,
                              hiergen=hiergen)
x.run()

Generating hierarchy: 15it [00:00, 77.56it/s] 


Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX
Generating CX




0

In [61]:
import os
import ndex2
from ndex2.cx2 import RawCX2NetworkFactory
from cellmaps_generate_hierarchy.ndexupload import NDExHierarchyUploader

#Specify NDEx server
ndexserver = 'public.ndexbio.org'
ndexuser = ''
ndexpassword = ''
    
# Specify paths to hierarchy and its parent (you can find example files in examples directory in cellmaps_generate_hierarchy_repo)
hierarchy_path = './5.2_old_hierarchy_paclitaxel/hierarchy.cx2'
parent_network_path = './5.2_old_hierarchy_paclitaxel/hierarchy_parent.cx2'

# Load the hierarchy and parent network CX2 files into network objects
factory = RawCX2NetworkFactory()
hierarchy_network = factory.get_cx2network(hierarchy_path)
parent_network = factory.get_cx2network(parent_network_path)

# Initialize NDExHierarchyUploader with the specified NDEx server and credentials
uploader = NDExHierarchyUploader(ndexserver, ndexuser, ndexpassword, visibility=True)

# Upload the hierarchy and parent network to NDEx
parent_uuid, parenturl, hierarchy_uuid, hierarchyurl = uploader.save_hierarchy_and_parent_network(hierarchy_network, parent_network)

print(f"Parent network UUID is {parent_uuid} and its URL in NDEx is {parenturl}")
print(f"Hierarchy network UUID is {hierarchy_uuid} and its URL in NDEx is {hierarchyurl}")

# # Another option is to just specify the directory where the files are placed
# _, _, _, hierarchyurl = uploader.upload_hierary_and_parent_network_from_files('./examples/')
# print(f'Hierarchy uploaded. To view the hierarchy, paste this URL in your browser: {hierarchyurl}')

Parent network UUID is 1e60ab92-9123-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/1e60ab92-9123-11f0-a218-005056ae3c32
Hierarchy network UUID is 1e7c9804-9123-11f0-a218-005056ae3c32 and its URL in NDEx is https://www.ndexbio.org/viewer/networks/1e7c9804-9123-11f0-a218-005056ae3c32
