In [None]:
import json
import subprocess
import sys

import pandas as pd

## Model Selection

In [None]:
# Select the cancer models listed in the disease model stats table.
stats = pd.read_csv("../data/disease_model_stats.csv")

# na=False: a missing name counts as "not a cancer". Without it a NaN in
# 'name' makes the mask non-boolean and the row selection below raises.
# The match is a plain, case-sensitive substring test.
is_cancer = stats['name'].str.contains("cancer", regex=False, na=False)
cancer_stats = stats[is_cancer]

# Rows without an ID are skipped; ':' is replaced by '_' in the remaining IDs.
# `cancers` ends up as a plain list of ID strings.
cancers = cancer_stats['ID'].dropna().str.replace(":", "_", regex=False).tolist()

## Preparation

In [None]:
# 1. Keep only the mouse rows of GREIN_data
grein_df = pd.read_csv("../demo/GREIN_data.csv")
mouse_df = grein_df[grein_df['Species'] == "Mus musculus"]

# 2. Load the mouse metadata (looked up by GSE ID below)
with open("../demo/metadata_mouse.json", encoding="utf-8") as f:
    metadata = json.load(f)

# 3. Keep only the GSE IDs that have a metadata entry, so that row i of the
#    ID file and row i of the title file always describe the same series.
#    (Writing every mouse ID while skipping titles for IDs without metadata
#    would silently shift the two files out of alignment.)
mouse_ids = mouse_df['GEO accession']
mouse_ids = mouse_ids[mouse_ids.isin(set(metadata))]
mouse_titles = [metadata[gse]["Title"] for gse in mouse_ids]
assert len(mouse_ids) == len(mouse_titles), "ID and title lists must stay aligned"

# 4. Write both files: one value per line, no header, no index
mouse_ids.to_csv("../demo/metadata_mouse_ID.tsv", sep="\t", index=False, header=False)
pd.DataFrame(mouse_titles).to_csv("../demo/metadata_mouse_title.tsv", sep="\t", index=False, header=False)

## Prediction

In [None]:
# Run the helper scripts with the interpreter that runs this kernel, so they
# see the same installed packages; fall back to "python" on PATH if the
# interpreter path is unavailable.
python_exe = sys.executable or "python"

# Preprocess
# check=True: stop here if the step fails instead of letting the following
# steps run on a missing or stale output file.
subprocess.run([
    python_exe, "../src/preprocess.py",
    "-input", "../demo/metadata_mouse_title.tsv",
    "-out", "../demo/metadata_mouse_title_processed.tsv"
], check=True)

# Generate embeddings
subprocess.run([
    python_exe, "../src/embedding_lookup_table.py",
    "-input", "../demo/metadata_mouse_title_processed.tsv",
    "-out", "../demo/metadata_mouse_title_embedding.npz"
], check=True)

# Predict
# One predict.py run per cancer model. A failing model does not abort the
# remaining ones; failures are collected and reported after the loop so they
# are not silently lost.
failed_models = []
for cancer_id in cancers:
    result = subprocess.run([
        python_exe, "../src/predict.py",
        "-input", "../demo/metadata_mouse_title_processed.tsv",
        "-id", "../demo/metadata_mouse_ID.tsv",
        "-input_embed", "../demo/metadata_mouse_title_embedding.npz",
        "-train_embed", "../data/disease_desc_embedding.npz",
        "-model", f"../bins/{cancer_id}__model.pkl",
        "-out", "../results/cancer_mouse/"
    ])
    if result.returncode != 0:
        failed_models.append(cancer_id)

if failed_models:
    print(f"predict.py failed for {len(failed_models)} of {len(cancers)} models: {failed_models}")