In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Data import

In [2]:
# Columns kept from the raw table; every other column in the CSV is discarded.
keep_columns = [
    'overall_survival_months', 'overall_survival', 'age_at_diagnosis',
    'type_of_breast_surgery', 'cellularity', 'chemotherapy', 'er_status',
    'her2_status_measured_by_snp6', 'her2_status', 'hormone_therapy',
    'inferred_menopausal_state', 'lymph_nodes_examined_positive', 'oncotree_code',
    'pr_status', 'radio_therapy', 'tumor_size', 'death_from_cancer',
]

# Location of the dataset folder (also reused by the save cell at the end).
dataset_name = "Metabric"
data_dir = f"dataset/{dataset_name}"

# Read the CSV, keep only the selected columns (in the order listed above) and
# drop every row that has a missing value in any of them.
data = pd.read_csv(f"{data_dir}/METABRIC_RNA_Mutation.csv")[keep_columns].dropna()

### Preprocessing

In [None]:
# Convert some continuous features to categorical ones.
# Each column is overwritten in place with string labels, so this cell is meant to be
# run once per data load: the numeric comparisons below are written for the raw numbers.

# Lymph-node column -> labels N0..N3 (0, 1-3, 4-9, 10 or more).
conditions = [(data["lymph_nodes_examined_positive"] == 0),
             (data["lymph_nodes_examined_positive"] >=1) & (data["lymph_nodes_examined_positive"] <=3),
             (data["lymph_nodes_examined_positive"] >=4) & (data["lymph_nodes_examined_positive"] <=9),
             (data["lymph_nodes_examined_positive"] >=10)]
values = ["N0", "N1", "N2", "N3"]
# Rows matching none of the conditions fall back to the column's original value.
data["lymph_nodes_examined_positive"] = np.select(conditions, values, default=data["lymph_nodes_examined_positive"])

# Tumour-size column -> labels T1..T3 (below 20, 20 up to but excluding 50, 50 or more).
conditions = [(data["tumor_size"] <20),
             (data["tumor_size"] >=20) & (data["tumor_size"] <50),
             (data["tumor_size"] >=50)]
# The middle label must be plain "T2": a label with a stray leading non-ASCII character
# sorts after "T3", which would hand the intermediate class the last integer code when
# the labels are later encoded in sorted order.
values = ["T1", "T2", "T3"]
data["tumor_size"] = np.select(conditions, values, default=data["tumor_size"])

In [6]:
# Encode the features
# Every column except the three excluded below is replaced by integer codes 0..k-1,
# assigned in the sorted order of the column's distinct values.
# NOTE(review): this applies to *all* remaining columns, so a column with many distinct
# numeric values (presumably age_at_diagnosis) is replaced by its rank as well — confirm
# that this is intended rather than encoding the categorical columns only.
for col in keep_columns:
    if col not in ["overall_survival_months", "overall_survival", "chemotherapy"]:
        # np.unique returns the sorted distinct values and, with return_inverse=True,
        # the position of each row's value in that sorted array. That position is the
        # row's code, obtained in a single pass instead of one Series.replace scan per
        # distinct value, and always with an integer dtype.
        unique_value, codes = np.unique(data[col].values, return_inverse=True)

        # Replace values in the column
        data[col] = codes

### Data types

In [7]:
# Build the type table row by row. The first row is the header; the second is the
# fixed "survcens" entry (type "surv_piecewise", dim 2, no class count).
data_types = [["name", "type", "dim", "nclass"], ["survcens", "surv_piecewise", 2, np.nan]]

for feature in keep_columns:
    # These three columns get no row of their own in the type table.
    if feature in ("overall_survival_months", "overall_survival", "chemotherapy"):
        continue

    column = data[feature]
    cardinality = np.unique(column.values).size

    if cardinality < 20:
        # Fewer than 20 distinct values: categorical, class count stored as text.
        kind, nclass = "cat", str(cardinality)
    elif (column > 0).all():
        # Many distinct values, all strictly positive.
        kind, nclass = "pos", np.nan
    else:
        # Many distinct values, at least one of them zero or negative.
        kind, nclass = "real", np.nan

    data_types.append([feature, kind, 1, nclass])

### Split data into treated and control group

In [8]:
# Give the two survival columns the short names "time" and "censor".
data = data.rename(columns={"overall_survival_months": "time", "overall_survival": "censor"})

# Split on the chemotherapy flag (1 -> treated, 0 -> control); the flag itself is
# dropped from both subsets. Rows with any other flag value land in neither subset.
received_chemo = data["chemotherapy"].eq(1)
no_chemo = data["chemotherapy"].eq(0)
data_treated = data.loc[received_chemo].drop(columns=["chemotherapy"])
data_control = data.loc[no_chemo].drop(columns=["chemotherapy"])

### Save data

In [9]:
import csv

# Write the full table and the two subsets as headerless, index-free CSV files.
for frame, stem in ((data, "data"), (data_treated, "data_treated"), (data_control, "data_control")):
    frame.to_csv(f"{data_dir}/{stem}.csv", header=False, index=False)

# The same type table is written once per subset, under a subset-specific file name.
types_frame = pd.DataFrame(data_types)
for subset in ("treated", "control"):
    types_frame.to_csv(f"{data_dir}/data_types_{subset}.csv", index=False, header=False)