In [None]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import plotly.graph_objects as go

## Concatenation

In [None]:
# Gather every per-model prediction CSV in the results directory into one table,
# tagging each row with the file it came from, and save the combined table.
results_dir = "results/cancer_mouse"
combined_name = "all_cancer_predictions.csv"

# The combined output is written into the same directory and also ends in ".csv",
# so it must be excluded here; otherwise a re-run reads it back in as an input and
# duplicates every row. Sorting makes the row order independent of the arbitrary
# order in which os.listdir returns entries.
files = sorted(
    f for f in os.listdir(results_dir)
    if f.endswith(".csv") and f != combined_name
)
if not files:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError(f"No prediction CSV files found in {results_dir!r}")

dfs = []
for f in files:
    df = pd.read_csv(os.path.join(results_dir, f))
    # 'source' is the file name with the "__preds.csv" suffix removed.
    df['source'] = f.replace("__preds.csv", "")
    dfs.append(df)

all_preds = pd.concat(dfs, ignore_index=True)
all_preds.to_csv(os.path.join(results_dir, combined_name), index=False)
print("Saved all_cancer_predictions.csv")

## Analysis

In [None]:
# --- Load inputs -----------------------------------------------------------
preds = pd.read_csv("results/cancer_mouse/all_cancer_predictions.csv")
stats = pd.read_csv("data/disease_model_stats.csv")
with open("data/cancer_children.json") as f:
    children = json.load(f)

# --- Attach a disease name to every prediction ------------------------------
# Keep only predictions that have a non-null 'related_words' value.
preds = preds.dropna(subset=['related_words'])
# 'source' values are joined against stats IDs with ":" swapped for "_".
stats['ID_'] = stats['ID'].str.replace(":", "_", regex=False)
id_to_name = stats[['ID_', 'name']]
merged = preds.merge(
    id_to_name, left_on='source', right_on='ID_', how='left'
).drop(columns=['ID_'])

# --- Restrict to MONDO_0004992 and its 'cancer'-labelled children -----------
parent = "MONDO_0004992"
terms = children['_embedded']['terms']
# Case-sensitive substring match on the term label.
child_ids = [term['short_form'] for term in terms if 'cancer' in term['label']]
models = child_ids + [parent]
sub = merged[merged['source'].isin(models)]

# --- Number of prediction rows per disease name, largest first --------------
counts = (
    sub.groupby('name')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)


### Pie chart

In [None]:
# Donut-style chart: a pie of the per-name counts with a white disc of radius 0.7
# drawn over the centre.
pie_fig, pie_ax = plt.subplots()
pie_ax.pie(counts['count'], labels=counts['name'])
circle = plt.Circle((0, 0), 0.7, color='white')
pie_ax.add_artist(circle)
pie_ax.set_title("Cancer subtype distribution")
plt.show()

### Sankey Diagram

In [None]:
# One root node ("cancer", index 0) linked to one node per row of `counts`;
# each link's width is that row's count.
subtype_names = counts['name'].tolist()
n_subtypes = len(counts)

labels = ["cancer", *subtype_names]
source_idxs = [0 for _ in range(n_subtypes)]          # every link starts at the root
target_idxs = [i + 1 for i in range(n_subtypes)]      # node i+1 is the i-th subtype
values = counts['count'].tolist()

node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=labels,
)
link_spec = dict(source=source_idxs, target=target_idxs, value=values)

fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
fig.update_layout(title_text="GSEs Predicted as Cancer Subtypes", font_size=12)
fig.show()