In [None]:
import pandas as pd 
import os
from cellmaps_utils.provenance import ProvenanceUtil

In [None]:
# Create the embedding RO-Crate used by cellmaps_generate_hierarchy and save the embedding table.
# cellmaps_generate_hierarchy looks for specific file names inside an embedding RO-Crate,
# so the embedding table is stored as coembedding_emd.tsv.

embedding_rocrate_path = '/kaggle/working/embedding'
# exist_ok=True makes this cell safe to re-run and avoids the check-then-create race;
# makedirs also creates any missing parent directory.
os.makedirs(embedding_rocrate_path, exist_ok=True)

prov_utils = ProvenanceUtil(fairscape_binary='/usr/local/lib/python3.11/dist-packages/fairscape_cli/', raise_on_error=True)

# Derive the file path from the crate directory so the two cannot drift apart
embedding_file_path = os.path.join(embedding_rocrate_path, 'coembedding_emd.tsv')
if not os.path.exists(embedding_file_path):
    # Replace NaN values in embedding table with zeros
    df = pd.read_table(
        '/kaggle/input/cm4ai-community-detection-protein-co-elution/repl1_repl2_combined.tsv',
        index_col=0,
    ).fillna(0)
    df.to_csv(embedding_file_path, sep='\t')

# Registration is guarded separately from the table export: if a previous run wrote
# the table but failed while registering, re-running this cell retries the registration.
# NOTE(review): assumes register_rocrate writes the standard RO-Crate descriptor
# 'ro-crate-metadata.json' into the crate directory -- confirm against cellmaps_utils.
rocrate_metadata_path = os.path.join(embedding_rocrate_path, 'ro-crate-metadata.json')
if not os.path.exists(rocrate_metadata_path):
    # Generate provenance for Embedding File
    prov_utils.register_rocrate(embedding_rocrate_path,
                                name='Kaggle Challenge Embedding Data',
                                organization_name='Ideker Lab',
                                project_name='Kaggle Challenge',
                                description='Kaggle Challenge Embedding Data',
                                keywords=['kaggle', 'challenge', 'embedding'])

In [None]:
# Generate hierarchy with cellmaps_generate_hierarchy
from cellmaps_generate_hierarchy.ppi import CosineSimilarityPPIGenerator
from cellmaps_generate_hierarchy.hierarchy import CDAPSHiDeFHierarchyGenerator
from cellmaps_generate_hierarchy.maturehierarchy import HiDeFHierarchyRefiner
from cellmaps_generate_hierarchy.hcx import HCXFromCDAPSCXHierarchy
from cellmaps_generate_hierarchy.runner import CellmapsGenerateHierarchy

# The embedding RO-Crate is the input; the hierarchy is written to its own directory
inputdir = embedding_rocrate_path
outdir = '/kaggle/working/hierarchydir'

# Cutoffs used to generate the edge lists fed to HiDeF.
# The defaults take a long time to run, so a shortened list is used here to run quickly.
PPI_CUTOFFS = [0.001, 0.002, 0.003]

# Edge list generator: produces the edge lists used as input for HiDeF
ppigen = CosineSimilarityPPIGenerator(
    embeddingdirs=[inputdir],
    cutoffs=PPI_CUTOFFS,
)

# Converter: turns the CDAPS CX hierarchy into HCX format
converter = HCXFromCDAPSCXHierarchy()

# Refiner: performs some cleanup of the hierarchy
refiner = HiDeFHierarchyRefiner(provenance_utils=prov_utils)

# Hierarchy generator, wired to the refiner and converter created above
hiergen = CDAPSHiDeFHierarchyGenerator(
    hidef_cmd='/usr/local/lib/python3.11/dist-packages/hidef/hidef_finder.py',
    refiner=refiner,
    hcxconverter=converter,
    provenance_utils=prov_utils,
)

# Runner object that combines all of the pieces above to make a hierarchy
x = CellmapsGenerateHierarchy(
    outdir=outdir,
    inputdirs=inputdir,
    ppigen=ppigen,
    hiergen=hiergen,
    provenance_utils=prov_utils,
)

# Run the hierarchy generation
x.run()

In [None]:
# Generate submission result file: one row per (gene, hierarchy node) membership

from ndex2.cx2 import RawCX2NetworkFactory

factory = RawCX2NetworkFactory()

net = factory.get_cx2network(f'{outdir}/hierarchy.cx2')

# Each node's 'CD_MemberList' attribute is split on whitespace into its member genes.
# A node with no attribute dict or no member list contributes no rows instead of
# aborting the whole export with a KeyError.
data = [
    {'xxx': gene, 'prediction': node_id}
    for node_id, node_obj in net.get_nodes().items()
    for gene in ((node_obj.get('v') or {}).get('CD_MemberList') or '').split()
]

# Passing columns= fixes the column order even when no rows were collected,
# so no further re-selection of the columns is needed.
df = pd.DataFrame(data, columns=['xxx', 'prediction'])
# The default integer index is written out as the "ID" column
df.to_csv('/kaggle/working/submission.csv', index=True, index_label="ID")