In [1]:
# header files
# please note that in this notebook we are performing survival analysis using 'sksurv' package on the D8 cohort
# D8 cohort - Ovarian cancer, pretreatment scans treated with chemotherapy after surgery
%matplotlib inline
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.svm import HingeLossSurvivalSVM
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [4, 4]
print("Header files loaded!")

Header files loaded!


In [2]:
# training: 95 TCGA ovarian cancer cases treated with chemotherapy 
# this block consists of four variables: train_features, train_y, train_event, train_survival_time loaded from data.csv
# train_features: 34 features used, combination of collagen and til
# train_y: each value in the array is (event, survival_time) where event is 'True' (if death or reccurence occured) or 'False' (no event occured) and survival time is the time from disease being diagnosed to event
# train_event: event is 'True' (if death or reccurence occured) or 'False' (no event occured)
# train_survival_time: survival time is the time from disease being diagnosed to event
train_features = []
train_y = []
train_event = []
train_survival_time = []

flag = -1
with open("../data/data.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if flag == -1:
            flag = 1
            print(row)
        else:
            array = row
            if array[1] == "Train":
                f = []
                for index in range(6, 40):
                    f.append(float(array[index]))
                train_features.append(f)
                
                event = False
                if array[41] == "TRUE" or array[41] == "True" or array[41] == "true":
                    event = True
                train_y.append([event, float(array[40])])
                train_event.append(event)
                train_survival_time.append(float(array[40]))
print(len(train_features))
print(len(train_y))
print(len(train_event))
print(len(train_survival_time))

['Patient ID', 'Data split', 'Organ', 'Site', 'Treatment', 'Vital status', 'Feature 1(TIL)', 'Feature 2(TIL)', 'Feature 3(TIL)', 'Feature 4(TIL)', 'Feature 5(TIL)', 'Feature 6(TIL)', 'Feature 7(TIL)', 'Feature 8(Collagen)', 'Feature 9(Collagen)', 'Feature 10(Collagen)', 'Feature 11(Collagen)', 'Feature 12(Collagen)', 'Feature 13(Collagen)', 'Feature 14(Collagen)', 'Feature 15(Collagen)', 'Feature 16(Collagen)', 'Feature 17(Collagen)', 'Feature 18(Collagen)', 'Feature 19(Collagen)', 'Feature 20(Collagen)', 'Feature 21(Collagen)', 'Feature 22(Collagen)', 'Feature 23(Collagen)', 'Feature 24(Collagen)', 'Feature 25(Collagen)', 'Feature 26(Collagen)', 'Feature 27(Collagen)', 'Feature 28(Collagen)', 'Feature 29(Collagen)', 'Feature 30(Collagen)', 'Feature 31(Collagen)', 'Feature 32(Collagen)', 'Feature 33(Collagen)', 'Feature 34(Collagen)', 'OS', 'OS_event', 'PFS', 'PFS_event', 'Age', 'Stage', 'Tumor grade', 'Risk score', 'Binary risk score']
95
95
95
95


In [3]:
# this block basically converts the four variables created in previous block to numpy arrays 
# which will be used for training the model
train_features = np.array(train_features)
train_y = np.array(train_y)
train_event = np.array(train_event)
train_survival_time = np.array(train_survival_time)

In [4]:
# validation: 30 MSKCC cases treated with chemotherapy after surgery
# this block consists of four variables: train_features, train_y, train_event, train_survival_time loaded from data.csv
# train_features: 34 features used, combination of collagen and til
# train_y: each value in the array is (event, survival_time) where event is 'True' (if death or reccurence occured) or 'False' (no event occured) and survival time is the time from disease being diagnosed to event
# train_event: event is 'True' (if death or reccurence occured) or 'False' (no event occured)
# train_survival_time: survival time is the time from disease being diagnosed to event
test_features = []
test_y = []
test_event = []
test_survival_time = []
test_clinical_var_age = []
test_clinical_var_stage = []

flag = -1
with open("../data/data.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if flag == -1:
            flag = 1
            print(row)
        else:
            array = row
            if array[1] == "Validation" and array[2] == "Ovary" and array[4] == "Chemotherapy":
                f = []
                for index in range(6, 40):
                    f.append(float(array[index]))
                test_features.append(f)
                
                event = False
                if array[43] == "TRUE" or array[43] == "True" or array[43] == "true":
                    event = True
                test_y.append([event, float(array[42])])
                test_event.append(event)
                test_survival_time.append(float(array[42]))
                test_clinical_var_age.append(float(array[44]))
                test_clinical_var_stage.append(float(array[45]))
print(len(test_features))
print(len(test_y))
print(len(test_event))
print(len(test_survival_time))
print(len(test_clinical_var_age))
print(len(test_clinical_var_stage))

['Patient ID', 'Data split', 'Organ', 'Site', 'Treatment', 'Vital status', 'Feature 1(TIL)', 'Feature 2(TIL)', 'Feature 3(TIL)', 'Feature 4(TIL)', 'Feature 5(TIL)', 'Feature 6(TIL)', 'Feature 7(TIL)', 'Feature 8(Collagen)', 'Feature 9(Collagen)', 'Feature 10(Collagen)', 'Feature 11(Collagen)', 'Feature 12(Collagen)', 'Feature 13(Collagen)', 'Feature 14(Collagen)', 'Feature 15(Collagen)', 'Feature 16(Collagen)', 'Feature 17(Collagen)', 'Feature 18(Collagen)', 'Feature 19(Collagen)', 'Feature 20(Collagen)', 'Feature 21(Collagen)', 'Feature 22(Collagen)', 'Feature 23(Collagen)', 'Feature 24(Collagen)', 'Feature 25(Collagen)', 'Feature 26(Collagen)', 'Feature 27(Collagen)', 'Feature 28(Collagen)', 'Feature 29(Collagen)', 'Feature 30(Collagen)', 'Feature 31(Collagen)', 'Feature 32(Collagen)', 'Feature 33(Collagen)', 'Feature 34(Collagen)', 'OS', 'OS_event', 'PFS', 'PFS_event', 'Age', 'Stage', 'Tumor grade', 'Risk score', 'Binary risk score']


ValueError: could not convert string to float: '-'

In [5]:
# running survival model using the train and validation dataset defined above
# this block has four major variables: train_group, train_risk_scores, group, test_risk_scores
# train_group: binary risk score 1 or 0 for train dataset. 1: high risk group and 0: low risk group
# test_group: binary risk score 1 or 0 for test dataset. 1: high risk group and 0: low risk group
# train_risk_scores: risk scores for train dataset
# test_risk_scores: risk scores for test dataset
group = []
train_group = []
features_train = train_features
features_test = test_features
y_train = train_y
event_train, survival_time_train = train_event, train_survival_time
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)
        

estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.1)
#estimator = CoxPHSurvivalAnalysis()
estimator.fit(features_train_df, y_train)

score, _, _, _, _ = concordance_index_censored(test_event, test_survival_time, estimator.predict(features_test_df))
print("Test: " + str(score))
score, _, _, _, _ = concordance_index_censored(train_event, train_survival_time, estimator.predict(features_train_df))
print("Train: " + str(score))

# get risk scores
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

median = np.mean(train_risk_scores)
count_low = 0
count_high = 0
for index in range(0, len(train_risk_scores)):
    if train_risk_scores[index] > median:
        count_high += 1
        train_group.append(1)
    else:
        count_low += 1
        train_group.append(0)

count_low = 0
count_high = 0
for index in range(0, len(test_risk_scores)):
    if test_risk_scores[index] > median:
        count_high += 1
        group.append(1)
    else:
        count_low += 1
        group.append(0)

ValueError: Found input variables with inconsistent numbers of samples: [0, 0, 1]

In [None]:
# this block prints values for variables 'test_event', 'test_survival_time' and 'group' defined above
# these values are used in the 'univariate.m' script to find the corresponding HR, p-values and 95% CI
a = []
for index in range(0, len(test_event)):
    if test_event[index] == False:
        a.append(0)
    else:
        a.append(1)
print(*a, sep="; ")

print(*test_survival_time, sep="; ")

age = []
for index in range(0, len(test_clinical_var_age)):
    if test_clinical_var_age[index] <=60:
        age.append(0)
    else:
        age.append(1)
        
g = []
for index in range(0, len(test_clinical_var_age)):
    g.append([age[index], np.abs(test_clinical_var_stage[index]), group[index]])
print(*group, sep="; ")