In [5]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import scale

In [2]:
# Base table; its "EGFRI" column is discarded after the merge below.
df1 = pd.read_csv("/cluster/projects/radiomics/RADCURE-challenge/clinical_cancer_death.csv")
# Two further tables; only their "Study ID" and "ECOG" columns are used.
df2 = pd.read_csv("/cluster/projects/radiomics/RADCURE-challenge/data/clinical_test.csv")
df3 = pd.read_csv("/cluster/projects/radiomics/RADCURE-challenge/data/clinical.csv")

# Stack the ECOG values from df2 and df3, outer-join them onto df1 by
# "Study ID", then remove the "EGFRI" column.
df = (
    df1
    .merge(
        pd.concat([frame[["Study ID", "ECOG"]] for frame in (df2, df3)]),
        how="outer",
        on="Study ID",
    )
    .drop(columns="EGFRI")
)

In [3]:
def make_data(path, split="training"):
    """Load the clinical table and preprocess one split of it.

    Parameters
    ----------
    path : str, path-like, file-like or pandas.DataFrame
        Anything ``pd.read_csv`` accepts, or an already-loaded DataFrame,
        which is used directly instead of being read from disk.
    split : str, default "training"
        Rows whose ``split`` column equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        The selected rows, without the ``split`` column, where

        * ``death`` / ``survival_time`` are renamed to ``event`` / ``time``
          and ``time`` is multiplied by 12 (presumably years -> months;
          confirm the unit of the source column),
        * ``age at dx`` and ``Dose`` are standardized with ``scale``,
        * ``T Stage``, ``Stage`` and ``ECOG`` are each collapsed to two
          groups and ``N Stage`` is truncated to its first two characters,
        * ``Sex``, ``N Stage``, ``Disease Site`` (first level dropped) and
          ``HPV Combined``, ``T Stage``, ``Stage``, ``ECOG`` (all levels
          kept) are one-hot encoded.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raises when ``path`` is not a DataFrame
        and cannot be read (e.g. ``FileNotFoundError``).

    Notes
    -----
    Standardization and one-hot encoding are computed from the selected
    split only, so different splits are scaled with different statistics
    and may end up with different dummy columns.
    """
    # Decide explicitly between "already a frame" and "something to read".
    # A bare ``except`` around the read would hide real errors (bad path,
    # malformed file) and surface later as an unrelated AttributeError.
    if isinstance(path, pd.DataFrame):
        df = path
    else:
        df = pd.read_csv(path)

    # ``rename`` hands back a new frame, so the column assignments below do
    # not touch the caller's DataFrame.
    clinical_data = (df
                     .query("split == @split")
                     .drop(["split"], axis=1, errors="ignore")
                     .rename(columns={"death": "event", "survival_time": "time"}))

    # Convert time to months
    clinical_data["time"] *= 12

    # Zero-mean / unit-variance scaling, using this split's own statistics.
    clinical_data["age at dx"] = scale(clinical_data["age at dx"])
    clinical_data["Dose"] = scale(clinical_data["Dose"])

    # Binarize T stage: the listed labels become "T1/2"; every other
    # non-missing label becomes "T3/4". Missing values stay missing.
    early_t_labels = ("T1", "T1a", "T1b", "T2")
    clinical_data["T Stage"] = clinical_data["T Stage"].map(
        lambda x: "T1/2" if x in early_t_labels else "T3/4", na_action="ignore")

    # Use more fine-grained grouping for N stage: keep only the first two
    # characters of the label (e.g. "N2b" -> "N2").
    clinical_data["N Stage"] = clinical_data["N Stage"].str.slice(0, 2)

    # Binarize overall stage the same way: listed labels -> "I/II", every
    # other non-missing label -> "III/IV".
    early_stage_labels = ("I", "II", "IIA")
    clinical_data["Stage"] = clinical_data["Stage"].map(
        lambda x: "I/II" if x in early_stage_labels else "III/IV", na_action="ignore")

    # ECOG: 0 vs. anything greater than 0; missing values stay missing.
    clinical_data["ECOG"] = clinical_data["ECOG"].map(
        lambda x: ">0" if x > 0 else "0", na_action="ignore")

    # First group: drop the first level of each variable.
    clinical_data = pd.get_dummies(clinical_data,
                                   columns=["Sex",
                                            "N Stage",
                                            "Disease Site"],
                                   drop_first=True)
    # Second group: keep every level, so a row with a missing value is all
    # zeros across that variable's indicator columns.
    clinical_data = pd.get_dummies(clinical_data,
                                   columns=["HPV Combined",
                                            "T Stage",
                                            "Stage",
                                            "ECOG"])

    return clinical_data

In [6]:
# Preprocess the rows whose split is "training" and show the resulting column names.
# NOTE(review): the stored output below is a full table rather than a column
# Index, so it looks stale relative to this cell's code - re-run to refresh.
make_data(df, split="training").columns

Unnamed: 0,Study ID,target_binary,time,event,age at dx,Dose,Chemotherapy,cancer_death,Sex_Male,N Stage_N1,...,Disease Site_paranasal sinus,Disease Site_salivary glands,HPV Combined_0.0,HPV Combined_1.0,T Stage_T1/2,T Stage_T3/4,Stage_I/II,Stage_III/IV,ECOG_0,ECOG_>0
2,RADCURE303,0,72.460274,0,0.397671,0.614220,0,0,1,0,...,0,0,0,0,0,1,0,1,1,0
3,RADCURE304,0,69.600000,0,0.541313,0.614220,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,RADCURE305,0,110.071233,0,-1.038746,0.614220,1,0,0,0,...,0,0,0,0,1,0,0,1,0,1
5,RADCURE306,0,65.523288,0,2.239664,0.614220,0,0,1,0,...,0,0,0,0,1,0,0,1,0,1
6,RADCURE307,0,61.906849,0,0.659606,-0.405216,0,0,1,0,...,0,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547,RADCURE4124,0,119.178082,0,0.532863,0.614220,0,0,1,0,...,0,0,0,1,1,0,0,1,0,1
2548,RADCURE4125,0,135.320548,0,-0.683867,-1.084840,0,0,1,0,...,0,0,0,0,1,0,1,0,1,0
2549,RADCURE4126,0,98.268493,1,-0.312088,0.614220,0,0,1,0,...,0,0,0,1,1,0,1,0,1,0
2550,RADCURE4127,0,68.876712,0,-0.810609,0.614220,1,0,0,0,...,0,0,0,1,1,0,0,1,1,0


In [7]:
# Preprocess the rows whose split is "test" and display the resulting frame.
make_data(df, split="test")

Unnamed: 0,Study ID,target_binary,time,event,age at dx,Dose,Chemotherapy,cancer_death,Sex_Male,N Stage_N1,...,Disease Site_paranasal sinus,Disease Site_salivary glands,HPV Combined_0.0,HPV Combined_1.0,T Stage_T1/2,T Stage_T3/4,Stage_I/II,Stage_III/IV,ECOG_0,ECOG_>0
0,RADCURE300,0,56.054795,0,-0.177024,0.418295,0,0,1,0,...,0,0,0,1,1,0,0,1,0,1
1,RADCURE301,0,51.682192,0,-1.293244,0.418295,1,0,1,0,...,0,0,0,1,0,1,0,1,1,0
12,RADCURE315,0,57.731507,0,0.644515,0.418295,0,0,1,1,...,0,0,0,1,0,1,0,1,1,0
16,RADCURE320,0,71.046575,0,-0.936054,0.418295,0,0,1,0,...,0,0,1,0,1,0,1,0,1,0
19,RADCURE323,0,50.169863,0,-1.132509,0.418295,1,0,1,0,...,0,0,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,RADCURE4105,0,26.761644,1,-0.150234,0.418295,1,1,1,0,...,0,0,0,1,0,1,0,1,1,0
2539,RADCURE4108,0,34.980822,0,-2.686288,0.418295,1,0,0,0,...,0,0,1,0,0,1,0,1,1,0
2540,RADCURE4113,0,25.019178,0,-1.436121,0.418295,1,0,1,0,...,0,0,0,1,0,1,0,1,0,1
2541,RADCURE4116,0,64.898630,0,0.117659,-0.423910,0,0,1,0,...,0,0,0,0,1,0,1,0,1,0
