In [37]:
import os, glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import openslide
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split

# Merge RNA-seq data

In [None]:
# Directory that is scanned for per-sample RNA-seq ".tsv" files.
input_dir = "/gpfs/gibbs/project/cpsc452/cpsc452_am4289/rna_seq_open_tsv"

# Accumulators for the per-file loading step:
#   expression_dfs - list of per-sample expression DataFrames
#   gene_name_map  - gene_id -> gene_name lookup
expression_dfs = []
gene_name_map = {}

# `glob` is imported as a module (`import os, glob`), so the function has to be
# called as `glob.glob(...)`; calling the module object itself raises
# "TypeError: 'module' object is not callable".
# Sorting keeps the file (and therefore sample/column) order deterministic.
filepaths = sorted(glob.glob(os.path.join(input_dir, "*.tsv")))
print(f"Found {len(filepaths)} files.")

In [None]:
# For every TSV: keep protein-coding genes, compute log2(TPM + 1), remember the
# gene_id -> gene_name mapping, and append a one-column frame (indexed by
# gene_id, column named after the sample) to `expression_dfs`.
for filepath in filepaths:
    # The file name with ".tsv" removed is used as the sample/column label.
    sample_id = os.path.basename(filepath).replace(".tsv", "")

    try:
        df = pd.read_csv(filepath, sep='\t', comment="#")
        df = df[df["gene_type"] == "protein_coding"]
        df = df[["gene_id", "gene_name", "tpm_unstranded"]].dropna()

        # Keep the first gene_name seen for each gene_id. Iterating the two
        # columns directly gives the same mapping as the previous
        # `iterrows()` loop without building a Series object per row.
        for gid, gname in zip(df["gene_id"], df["gene_name"]):
            gene_name_map.setdefault(gid, gname)

        # `assign` returns a new frame, so nothing is written into the
        # filtered slice of the raw table.
        sample_expr = (
            df.assign(tpm_log=np.log2(df["tpm_unstranded"] + 1))
            .set_index("gene_id")[["tpm_log"]]
            .rename(columns={"tpm_log": sample_id})
        )

        expression_dfs.append(sample_expr)

    except Exception as e:
        # Best-effort loading: a file that cannot be processed is reported and
        # skipped, so it contributes no column to the merged matrix.
        print(f"Error processing {sample_id}: {e}")

In [None]:
print("Merging all subjects...")

# Outer-join the per-sample columns on their gene_id index; a gene that is
# absent from a sample ends up as 0 in that sample's column.
merged_df = pd.concat(expression_dfs, axis=1, join="outer").fillna(0)

# Insert the gene symbol as the first column. The Series is keyed by gene_id,
# so pandas aligns it with the matrix index on insertion.
gene_names = pd.Series(gene_name_map).rename("gene_name")
merged_df.insert(loc=0, column="gene_name", value=gene_names)

In [None]:
# Write the merged matrix (index plus all columns) to disk and report its shape.
output_csv = "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/merged_rnaseq_logTPM_matrix.csv"
merged_df.to_csv(path_or_buf=output_csv)

print(f"Final matrix saved to '{output_csv}' with shape {merged_df.shape}")

In [None]:
# Reload the merged CSV, drop the gene_name column, and store the transposed
# numeric values (one row per original column) as a .npy file.
# NOTE(review): the row order of X follows the column order of the CSV; nothing
# here checks that it matches the order of rna_ids.npy - confirm.
df = pd.read_csv(
    "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/merged_rnaseq_logTPM_matrix.csv",
    index_col=0,
)
X = df.drop(columns=["gene_name"]).to_numpy().T

np.save("/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/rnaseq_data.npy", X)

# Convert RNA IDs from CSV to npy

In [3]:
# Extract the submitter_id column of the cases table and store it as a .npy array.
df = pd.read_csv(
    "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/tcga_gbm_rna_seq_cases.csv"
)

rna_ids = df['submitter_id'].to_numpy()
np.save("/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/rna_ids.npy", rna_ids)

# Clean clinical data, match to RNA matrix and RNA IDs, and save filtered ones

In [4]:
# Locations of the RNA matrix, the RNA ID list and the clinical table.
RNASEQ_PATH = "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/rnaseq_data.npy"
RNA_IDS = "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/rna_ids.npy"

CLIN_CSV = "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/clinical.csv"

# Directories searched (non-recursively) for "*.svs" slide files.
WSI_DIRS = [
    "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/image",
    "/gpfs/gibbs/project/cpsc452/cpsc452_am4289/image",
    "/gpfs/gibbs/project/cpsc452/cpsc452_yg427/data/image_svs_manual",
    "/gpfs/gibbs/project/cpsc452/cpsc452_yy743/image_svs_manual",
]

# Group slide paths by the part of the file name before its first ".".
# NOTE(review): the message below calls these keys "patients", but the key is
# just the file-name prefix; if files are named with a full slide barcode this
# is a slide ID rather than a case ID - confirm against the actual file names.
wsi_map = {}
for wsi_dir in WSI_DIRS:
    svs_pattern = os.path.join(wsi_dir, "*.svs")
    for svs_path in glob.glob(svs_pattern):
        prefix, _, _ = os.path.basename(svs_path).partition(".")
        if prefix not in wsi_map:
            wsi_map[prefix] = []
        wsi_map[prefix].append(svs_path)
print(f"Found WSIs for {len(wsi_map)} patients")

Found WSIs for 1074 patients


In [15]:
# Load the clinical table and preview its first five rows.
clin = pd.read_csv(CLIN_CSV)
clin.head(5)

Unnamed: 0,case_submitter_id,days_to_death,days_to_birth,gender,ethnicity,race,vital_status
0,TCGA-14-1034,485,22029,1,not reported,not reported,Dead
1,TCGA-06-0140,6,31566,0,not hispanic or latino,white,Dead
2,TCGA-06-0171,399,24085,0,not hispanic or latino,white,Dead
3,TCGA-12-0819,754,18160,1,not hispanic or latino,black or african american,Dead
4,TCGA-12-0619,1062,21920,0,not hispanic or latino,white,Dead


In [16]:
# Non-numeric days_to_death entries become NaN.
clin["days_to_death"] = pd.to_numeric(clin["days_to_death"], errors="coerce")

# event = 1 when vital_status is "Dead", otherwise 0.
is_dead = clin["vital_status"].eq("Dead")
clin["event"] = is_dead.astype(int)

# Dead rows keep days_to_death; every other row is assigned a fixed 5000.
# NOTE(review): 5000 looks like a placeholder censoring time rather than an
# observed follow-up time - confirm this is intended before using it in a
# survival model. Dead rows with missing days_to_death stay NaN.
clin["survival_time"] = clin["days_to_death"].where(is_dead, 5000)

In [18]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files this project wrote itself.
rna_ids = np.load(RNA_IDS, allow_pickle=True)

# Keep only clinical rows whose case_submitter_id appears in the RNA ID list.
# `.copy()` makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
clin = clin[clin["case_submitter_id"].isin(rna_ids)].copy()
clin["race"] = clin["race"].replace("not reported", "unknown")

# One-hot encode race into "race_<value>" columns and drop the original column.
# Re-running this cell on its own fails because "race" no longer exists;
# re-run from the cell that loads `clin`.
race_dummies = pd.get_dummies(clin["race"], prefix="race")
clin = pd.concat([clin.drop("race", axis=1), race_dummies], axis=1)

In [20]:
# Shorten two column names, then write the cleaned table without the index.
column_renames = {
    "case_submitter_id": "submitter_id",
    "race_black or african american": "race_black",
}
clin = clin.rename(columns=column_renames)
clin.to_csv(
    "/gpfs/gibbs/project/cpsc452/cpsc452_jl4286/cleaned_clinical.csv",
    index=False,
)

In [30]:
# Load the saved RNA matrix and build an ID -> position lookup over rna_ids
# (for a repeated ID the last position wins).
# NOTE(review): presumably row i of rna_mat belongs to rna_ids[i]; nothing here
# verifies that - confirm.
rna_mat = np.load(RNASEQ_PATH)
pid2idx = dict(zip(rna_ids, range(len(rna_ids))))

print("RNA matrix:", rna_mat.shape)

RNA matrix: (293, 19962)


In [32]:
# `rna_ids_set` and `clin_ids` were used here without being defined in any
# visible cell, so a fresh-kernel run failed with NameError.
# NOTE(review): defined here on the assumption that they are the sets of IDs in
# `rna_ids` and in the clinical table's "submitter_id" column - confirm.
rna_ids_set = set(rna_ids)
clin_ids = set(clin["submitter_id"])

# IDs present in both sources (list order is arbitrary, as it comes from a set).
matched_ids = list(rna_ids_set & clin_ids)

# Positions in rna_ids whose ID is matched, in rna_ids order. Membership is
# tested against a set rather than the list to avoid a linear scan per ID.
matched_lookup = set(matched_ids)
matched_idx = [i for i, pid in enumerate(rna_ids) if pid in matched_lookup]

# Select the corresponding matrix rows and IDs with the same index list so the
# two outputs stay in the same order.
rna_mat_filtered = rna_mat[matched_idx]
rna_ids_filtered = [rna_ids[i] for i in matched_idx]

# Written relative to the current working directory.
np.save("filtered_rnaseq_data.npy", rna_mat_filtered)
np.save("filtered_rna_ids.npy", rna_ids_filtered)

print(f"Filtered RNA matrix shape: {rna_mat_filtered.shape}")

Filtered RNA matrix shape: (286, 19962)
