In [None]:
import json
import subprocess
import sys

import pandas as pd

## Preparation

In [None]:
# 1. Filter GREIN_data for human GSE IDs
grein_df = pd.read_csv("../demo/GREIN_data.csv")
human_df = grein_df[grein_df['Species'] == "Homo sapiens"]
human_ids = human_df['GEO accession']

# 2. Extract titles from metadata_human
with open("../demo/metadata_human.json") as f:
    metadata = json.load(f)

# Keep only the IDs that have a metadata entry. The ID file and the title file
# are both handed to predict.py later, so they must have the same number of
# rows in the same order; writing every ID while skipping titles for missing
# entries would leave the two files misaligned.
has_metadata = human_ids.isin(set(metadata))
matched_ids = human_ids[has_metadata]
n_missing = int((~has_metadata).sum())
if n_missing:
    print(f"Skipping {n_missing} human GSE ID(s) with no entry in metadata_human.json")

human_titles = [metadata[gse]["Title"] for gse in matched_ids]
assert len(human_titles) == len(matched_ids), "ID and title files would be misaligned"

# 3. Write the row-aligned ID and title files (one record per line, no header)
matched_ids.to_csv("../demo/metadata_human_ID.tsv", sep="\t", index=False, header=False)
pd.DataFrame(human_titles).to_csv("../demo/metadata_human_title.tsv", sep="\t", index=False, header=False)

## Prediction

In [None]:
# Preprocess
# sys.executable runs the script with the same interpreter as this kernel
# (a bare "python" may resolve to a different environment on PATH).
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# step stops the notebook instead of being silently ignored.
subprocess.run([
    sys.executable, "../src/preprocess.py",
    "-input", "../demo/metadata_human_title.tsv",
    "-out", "../demo/metadata_human_title_processed.tsv"
], check=True)

# Generate embeddings
# Reads the processed titles and writes an .npz file.
# sys.executable keeps the script in the kernel's Python environment;
# check=True raises CalledProcessError if the script exits with an error, so
# later steps never run against a missing or stale embedding file.
subprocess.run([
    sys.executable, "../src/embedding_lookup_table.py",
    "-input", "../demo/metadata_human_title_processed.tsv",
    "-out", "../demo/metadata_human_title_embedding.npz"
], check=True)

# Predict for one human model (e.g. MONDO_0000315)
# The processed titles (-input) and the ID file (-id) are passed together;
# presumably predict.py pairs them row by row, so they must stay aligned — TODO confirm.
# sys.executable keeps the script in the kernel's Python environment;
# check=True raises CalledProcessError if the script exits with an error.
subprocess.run([
    sys.executable, "../src/predict.py",
    "-input", "../demo/metadata_human_title_processed.tsv",
    "-id", "../demo/metadata_human_ID.tsv",
    "-input_embed", "../demo/metadata_human_title_embedding.npz",
    "-train_embed", "../data/disease_desc_embedding.npz",
    "-model", "../bins/MONDO_0000315__model.pkl",
    "-out", "../results/"
], check=True)