In [1]:
# header files
%matplotlib inline
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
# Default size applied to every matplotlib figure created later in the notebook.
# NOTE(review): matplotlib interprets figsize in inches, so [100, 100] is
# unusually large — confirm this value is intentional.
plt.rcParams['figure.figsize'] = [100, 100]
# Marker printed once the whole import cell has executed.
print("Header files loaded!")

Header files loaded!


In [2]:
# hyper-parameters
# Cohort switches (1 = enabled, 0 = disabled). Later cells guard their work
# with `if is_ovarian_cancer:` / `if is_cervix_cancer:`.
is_ovarian_cancer = 1
# A later cell reassigns this to 1 right before the cervix files are listed.
is_cervix_cancer = 0
# Not read by any cell visible in this notebook.
is_endometrial_cancer = 0

In [3]:
# list the ovarian-cancer collagen feature CSVs; the window_1 directory
# determines which slides the following cells process
if is_ovarian_cancer:
    oc_window_1_pattern = "../results/oc_collagen_features/window_1/*"
    oc_files = glob.glob(oc_window_1_pattern)
    print(len(oc_files))

95


In [4]:
# collect features
# For every slide listed in oc_files, concatenate the values stored in its
# window_1 ... window_9 CSVs (in that order) into one flat feature vector.
if is_ovarian_cancer:
    # window_2 ... window_9 hold one CSV per slide under the same file name as
    # the window_1 file returned by glob.
    oc_other_window_dirs = [
        "../results/oc_collagen_features/window_%d/" % window
        for window in range(2, 10)
    ]
    collagen_features = []
    for file in oc_files:
        filename = file.split("/")[-1]
        file_features = []
        # window_1 is read through the globbed path itself, the rest by name.
        window_paths = [file] + [window_dir + filename for window_dir in oc_other_window_dirs]
        for window_path in window_paths:
            with open(window_path, newline='') as csvfile:
                for row in csv.reader(csvfile):
                    # Every row is consumed and its last column is skipped.
                    # NOTE(review): presumably the last column is an empty cell
                    # left by a trailing comma — confirm against the CSV writer.
                    file_features.extend(float(cell) for cell in row[:-1])
        collagen_features.append(file_features)

In [5]:
# read TIL features and survival labels from DATA_OC.csv, then line them up
# with the slides in oc_files
if is_ovarian_cancer:
    filenames, til_features, censor, days = [], [], [], []
    with open("../results/DATA_OC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        reader = csv.reader(csvfile)
        # the first row is the column header: show it, do not parse it
        header = next(reader, None)
        if header is not None:
            print(header)
        for record in reader:
            filenames.append(record[0])
            # columns 1..7 are SF1..SF7 in the printed header
            til_features.append([float(record[column]) for column in range(1, 8)])
            # column 20 is 'censor', column 23 is 'OS_days' in the printed header
            # NOTE(review): 'censor' is used directly as the event indicator
            # (True = event observed) — confirm how the column is coded.
            censor.append(bool(int(record[20])))
            days.append(int(record[23]))

    # CSV row positions for each patient name (a name may occur more than once)
    rows_by_name = {}
    for position, name in enumerate(filenames):
        rows_by_name.setdefault(name, []).append(position)

    final_til_features, y, event, survival_time = [], [], [], []
    for file in oc_files:
        # slide name = file name without its 4-character extension
        slide_name = file.split("/")[-1][:-4]
        for position in rows_by_name.get(slide_name, ()):
            final_til_features.append(til_features[position])
            y.append([censor[position], days[position]])
            event.append(censor[position])
            survival_time.append(days[position])
    for collected in (final_til_features, y, event, survival_time):
        print(len(collected))

['patient_name', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7', 'cont_risk_score', 'binary_risk_score', 'WSI_Width', 'WSI_Height', 'year_of_birth', 'race', 'year_of_death', 'vital_status', 'Organ', 'treatment_type', 'Age', 'TTE', 'censor', 'Site', 'stage', 'OS_days', 'Vital', 'stage_numeric']
95
95
95
95


In [6]:
# build one training row per slide: the TIL features followed by the collagen
# features (use only one of the two lists here to train on a single family)
features = [
    final_til_features[row] + collagen_features[row]
    for row in range(len(oc_files))
]
print(len(features))
print(len(features[0]))

95
34


In [7]:
# turn the aligned training lists into numpy arrays for model fitting
features, y, event, survival_time = [
    np.array(values) for values in (features, y, event, survival_time)
]

In [8]:
# list the cervix-cancer collagen feature CSVs (scale 200 directory); these
# slides form the test set evaluated further down
# NOTE: this reassignment overrides the is_cervix_cancer value set in the
# hyper-parameter cell.
is_cervix_cancer = 1
if is_cervix_cancer:
    cc_scale_200_pattern = "../../tcga_cervix_cancer/collagen_feature_maps_200_final/*"
    test_cc_files = glob.glob(cc_scale_200_pattern)
    print(len(test_cc_files))

262


In [9]:
# collect test features
# For every slide listed in test_cc_files, concatenate the values stored in its
# collagen_feature_maps_200 ... _600 CSVs (in that order) into one flat vector.
if is_cervix_cancer:
    # Scales 250, 300, ..., 600 hold one CSV per slide under the same file name
    # as the scale-200 file returned by glob.
    cc_other_scale_dirs = [
        "../../tcga_cervix_cancer/collagen_feature_maps_%d_final/" % scale
        for scale in range(250, 601, 50)
    ]
    test_collagen_features = []
    for file in test_cc_files:
        filename = file.split("/")[-1]
        file_features = []
        # Scale 200 is read through the globbed path itself, the rest by name.
        scale_paths = [file] + [scale_dir + filename for scale_dir in cc_other_scale_dirs]
        for scale_path in scale_paths:
            with open(scale_path, newline='') as csvfile:
                for row in csv.reader(csvfile):
                    # Every row is consumed and its last column is skipped.
                    # NOTE(review): presumably the last column is an empty cell
                    # left by a trailing comma — confirm against the CSV writer.
                    file_features.extend(float(cell) for cell in row[:-1])
        test_collagen_features.append(file_features)

In [10]:
# read TIL features and survival labels for cervix patients from DATA.csv,
# split into a chemotherapy and a radiation-therapy cohort
if is_cervix_cancer:
    til_features_chemo, censor_chemo, days_chemo, filenames_chemo = [], [], [], []
    til_features_radio, censor_radio, days_radio, filenames_radio = [], [], [], []
    # treatment_type value -> the four parallel lists filled for that cohort
    cohort_lists = {
        "Chemotherapy": (filenames_chemo, til_features_chemo, censor_chemo, days_chemo),
        "Radiation Therapy": (filenames_radio, til_features_radio, censor_radio, days_radio),
    }
    with open("../../DATA.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        reader = csv.reader(csvfile)
        # the first row is the column header: show it, do not parse it
        header = next(reader, None)
        if header is not None:
            print(header)
        for record in reader:
            # column 17 is 'Organ', column 18 is 'treatment_type' in the printed header
            if record[17] != "Cervix":
                continue
            targets = cohort_lists.get(record[18])
            if targets is None:
                continue
            names, til_rows, censor_flags, durations = targets
            names.append(record[0])
            til_rows.append([float(record[column]) for column in range(1, 8)])
            # column 24 is 'censor' and column 23 is 'TTE' in the printed header
            # NOTE(review): the ovarian cell reads 'OS_days' at index 23 of its
            # own file, while index 23 here is 'TTE' — confirm this is intended.
            censor_flags.append(bool(int(record[24])))
            durations.append(int(record[23]))
    print(len(filenames_chemo))
    print(len(filenames_radio))

['patient_name', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7', 'cont_risk_score', 'binary_risk_score', 'file_name', 'WSI_Width', 'WSI_Height', 'year_of_birth', 'race', 'year_of_death', 'vital_status', 'Organ', 'treatment_type', 'Age', 'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_t', 'TTE', 'censor', 'thres', 'Site', 'stage', 'OS_days', 'Vital', 'Grade', 'BMI', 'plannedRegimen', 'stage_numeric', 'Cohort']
140
129


In [11]:
# align the chemotherapy cohort with the collagen features of its slides
# CSV row positions for each chemotherapy patient name (a name may repeat)
chemo_rows_by_name = {}
for position, name in enumerate(filenames_chemo):
    chemo_rows_by_name.setdefault(name, []).append(position)

final_filenames_chemo, final_til_features_chemo, final_collagen_features_chemo = [], [], []
y_chemo, event_chemo, survival_time_chemo = [], [], []
for slide_index, file in enumerate(test_cc_files):
    # slide name = file name without its 4-character extension
    slide_name = file.split("/")[-1][:-4]
    for position in chemo_rows_by_name.get(slide_name, ()):
        final_til_features_chemo.append(til_features_chemo[position])
        y_chemo.append([censor_chemo[position], days_chemo[position]])
        event_chemo.append(censor_chemo[position])
        survival_time_chemo.append(days_chemo[position])
        final_collagen_features_chemo.append(test_collagen_features[slide_index])
        final_filenames_chemo.append(file)
for collected in (
    final_filenames_chemo,
    final_collagen_features_chemo,
    final_til_features_chemo,
    y_chemo,
    event_chemo,
    survival_time_chemo,
):
    print(len(collected))

134
134
134
134
134
134


In [12]:
# align the radiation-therapy cohort with the collagen features of its slides
# CSV row positions for each radiation-therapy patient name (a name may repeat)
radio_rows_by_name = {}
for position, name in enumerate(filenames_radio):
    radio_rows_by_name.setdefault(name, []).append(position)

final_filenames_radio, final_til_features_radio, final_collagen_features_radio = [], [], []
y_radio, event_radio, survival_time_radio = [], [], []
for slide_index, file in enumerate(test_cc_files):
    # slide name = file name without its 4-character extension
    slide_name = file.split("/")[-1][:-4]
    for position in radio_rows_by_name.get(slide_name, ()):
        final_til_features_radio.append(til_features_radio[position])
        y_radio.append([censor_radio[position], days_radio[position]])
        event_radio.append(censor_radio[position])
        survival_time_radio.append(days_radio[position])
        final_collagen_features_radio.append(test_collagen_features[slide_index])
        final_filenames_radio.append(file)
for collected in (
    final_filenames_radio,
    final_til_features_radio,
    y_radio,
    event_radio,
    survival_time_radio,
):
    print(len(collected))

128
128
128
128
128


In [13]:
# build the test matrix from the radiation-therapy cohort: one row per slide,
# TIL features followed by collagen features (same layout as the training rows)
test_features = [
    final_til_features_radio[row] + final_collagen_features_radio[row]
    for row in range(len(final_filenames_radio))
]
print(len(test_features))
print(len(test_features[0]))

128
34


In [14]:
# run on test set
# Fit a Coxnet model on the ovarian-cancer training rows, score it on the
# cervix radiation-therapy rows, and split the test patients into two risk groups.
features_train = features
features_test = test_features
event_train, survival_time_train = event, survival_time

# Structured (status, time) array built from the [censor, days] rows of y.
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y], dtype=dt)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.1)
estimator.fit(features_train_df, y_train)

# get risk scores (predicted once, reused for the scores and the grouping)
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

# concordance index is the first element of the returned tuple
score = concordance_index_censored(event_radio, survival_time_radio, test_risk_scores)[0]
print("Test: " + str(score))
score = concordance_index_censored(event, survival_time, train_risk_scores)[0]
print("Train: " + str(score))

# Test patients whose risk exceeds the 66th percentile of the TRAINING risk
# scores are labelled high risk (1); everyone else is low risk (0).
RISK_PERCENTILE = 66
risk_threshold = np.percentile(train_risk_scores, RISK_PERCENTILE)
median = risk_threshold  # previous name of the threshold (it is not a median); kept for compatibility
group = [1 if risk > risk_threshold else 0 for risk in test_risk_scores]
count_high = sum(group)
count_low = len(group) - count_high

Test: 0.6703703703703704
Train: 0.7264375637972099


In [15]:
# print the radiation-therapy event indicators as a "; "-separated list of 0/1
a = [0 if status == False else 1 for status in event_radio]
print(*a, sep="; ")

1; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 1; 0; 0; 1; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 0; 1; 1; 1; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 1; 1; 1; 0; 1; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 1; 0


In [16]:
# Print the radiation-therapy survival times as one "; "-separated line; the
# list is filled in step with event_radio, so positions refer to the same patient.
print(*survival_time_radio, sep="; ")

350; 1118; 634; 463; 540; 573; 52; 227; 1386; 444; 1066; 1372; 1667; 636; 1394; 1735; 2044; 829; 442; 1505; 582; 3609; 1415; 854; 1778; 25; 1345; 9; 305; 2056; 1053; 4385; 4694; 447; 414; 2949; 5385; 0; 474; 1112; 4086; 0; 699; 3097; 533; 426; 0; 0; 2052; 4172; 602; 4482; 0; 2520; 791; 100; 955; 2032; 266; 3935; 1409; 730; 166; 747; 43; 485; 791; 380; 376; 406; 473; 321; 34; 720; 499; 834; 5; 795; 1065; 355; 287; 659; 632; 1078; 829; 1106; 453; 474; 638; 428; 6408; 2394; 725; 3874; 241; 2493; 0; 570; 604; 1083; 908; 83; 685; 3988; 619; 890; 761; 633; 1015; 1136; 1992; 506; 104; 348; 149; 144; 793; 346; 1561; 986; 1630; 642; 1013; 1444; 3046; 954; 978; 548


In [17]:
# Print each test patient's risk group as one "; "-separated line
# (1 = predicted risk above the training-set threshold, 0 = otherwise).
print(*group, sep="; ")

0; 1; 0; 0; 1; 1; 1; 1; 1; 1; 0; 1; 0; 1; 0; 1; 1; 1; 1; 1; 1; 1; 1; 0; 1; 0; 1; 1; 1; 1; 0; 1; 0; 0; 1; 0; 1; 1; 1; 1; 0; 1; 0; 1; 0; 0; 1; 1; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 1; 0; 1; 1; 0; 0; 0; 1; 0; 1; 1; 1; 1; 1; 1; 0; 0; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 1; 0; 1; 1; 1; 0; 1; 0; 0; 1; 0; 0; 1; 0; 1; 1


In [22]:
# list the high-risk test patients with an observed event and summarise their
# survival times
# group must come from the same run as the radiation-therapy lists, otherwise
# positions would refer to different patients.
assert len(group) == len(final_filenames_radio), "group is out of sync with the radiation cohort"

high_risk_event_days = []
for slide_path, risk_group, had_event, duration in zip(
    final_filenames_radio, group, event_radio, survival_time_radio
):
    if risk_group == 1 and had_event == 1:
        print(slide_path)
        high_risk_event_days.append(duration)

if high_risk_event_days:
    print(np.median(high_risk_event_days))
    print(np.min(high_risk_event_days))
    print(np.max(high_risk_event_days))
else:
    # np.min / np.max raise ValueError on an empty selection
    print("No high-risk patients with an observed event.")
print(len(high_risk_event_days))

../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-VS-A8QC.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A8XH.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A7CK.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-JW-A5VH.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A1M6.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-IR-A3LB.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-DS-A1O9.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A1BF.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A1BQ.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-VS-A94X.csv
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-C5-A0TN.csv
570.0
100
4086
11


In [None]:
# find prognostic features from model trained above
# A feature (one row of estimator.coef_) is reported when at least one of its
# coefficients is positive.
# NOTE(review): features whose coefficients are only negative are not counted —
# confirm that `> 0` rather than `!= 0` is the intended selection rule.
count = 0
for feature_index, feature_coefficients in enumerate(estimator.coef_):
    if any(value > 0 for value in feature_coefficients):
        print(feature_index)
        count += 1
print()
print("Prognostic features count = " + str(count))