In [1]:
# header files
%matplotlib inline
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [100, 100]
print("Header files loaded!")

Header files loaded!


In [2]:
# hyper-parameters
is_ovarian_cancer = 0
is_cervix_cancer = 1
is_endometrial_cancer = 0

In [3]:
# load ovarian cancer files
if is_cervix_cancer:
    cc_files = (glob.glob("../../tcga_cervix_cancer/collagen_feature_maps_200_final/*"))
    print(len(cc_files))

262


In [4]:
# collect features
if is_cervix_cancer:
    collagen_features = []
    for file in cc_files:
        filename = file.split("/")[-1]
        flag = -1
        file_features = []
        with open(file, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_250_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_300_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_350_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_400_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_450_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_500_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_550_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../../tcga_cervix_cancer/collagen_feature_maps_600_final/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
        collagen_features.append(file_features)

In [5]:
# create output survival information for training model and get til features
if is_cervix_cancer:
    til_features_chemo = []
    censor_chemo = []
    days_chemo = []
    filenames_chemo = []
    til_features_radio = []
    censor_radio = []
    days_radio = []
    filenames_radio = []
    flag = -1
    with open("../../DATA.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                if array[17] == "Cervix" and array[18] == "Chemotherapy":
                    filenames_chemo.append(array[0])
                    f_features = [float(array[1]), float(array[2]), float(array[3]), float(array[4]), float(array[5]), float(array[6]), float(array[7])]
                    til_features_chemo.append(f_features)
                    censor_chemo.append(bool(int(array[24])))
                    days_chemo.append(int(array[28]))
                    
                if array[17] == "Cervix" and array[18] == "Radiation Therapy":
                    filenames_radio.append(array[0])
                    f_features = [float(array[1]), float(array[2]), float(array[3]), float(array[4]), float(array[5]), float(array[6]), float(array[7])]
                    til_features_radio.append(f_features)
                    censor_radio.append(bool(int(array[24])))
                    days_radio.append(int(array[28]))
    print(len(filenames_chemo))
    print(len(filenames_radio))

['patient_name', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7', 'cont_risk_score', 'binary_risk_score', 'file_name', 'WSI_Width', 'WSI_Height', 'year_of_birth', 'race', 'year_of_death', 'vital_status', 'Organ', 'treatment_type', 'Age', 'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_t', 'TTE', 'censor', 'thres', 'Site', 'stage', 'OS_days', 'Vital', 'Grade', 'BMI', 'plannedRegimen', 'stage_numeric', 'Cohort']
140
129


In [6]:
final_filenames_chemo = []
final_til_features_chemo = []
final_collagen_features_chemo = []
y_chemo = []
event_chemo = []
survival_time_chemo = []
index = 0
for file in cc_files:
    count = 0
    filename1 = file.split("/")[-1][:-4]
    flag = -1
    for filename in filenames_chemo:
        filename2 = filename
        if filename1 == filename2:
            final_til_features_chemo.append(til_features_chemo[count])
            y_chemo.append([censor_chemo[count], days_chemo[count]])
            event_chemo.append(censor_chemo[count])
            survival_time_chemo.append(days_chemo[count])
            final_collagen_features_chemo.append(collagen_features[index])
            final_filenames_chemo.append(file)
            flag = 1
        count += 1
    index += 1
print(len(final_filenames_chemo))
print(len(final_collagen_features_chemo))
print(len(final_til_features_chemo))
print(len(y_chemo))
print(len(event_chemo))
print(len(survival_time_chemo))

134
134
134
134
134
134


In [7]:
final_filenames_radio = []
final_til_features_radio = []
final_collagen_features_radio = []
y_radio = []
event_radio = []
survival_time_radio = []
index = 0
for file in cc_files:
    count = 0
    filename1 = file.split("/")[-1][:-4]
    flag = -1
    for filename in filenames_radio:
        filename2 = filename
        if filename1 == filename2:
            final_til_features_radio.append(til_features_radio[count])
            y_radio.append([censor_radio[count], days_radio[count]])
            event_radio.append(censor_radio[count])
            survival_time_radio.append(days_radio[count])
            final_collagen_features_radio.append(collagen_features[index])
            final_filenames_radio.append(file)
            flag = 1
        count += 1
    index += 1
print(len(final_filenames_radio))
print(len(final_til_features_radio))
print(len(y_radio))
print(len(event_radio))
print(len(survival_time_radio))

128
128
128
128
128


In [8]:
print(final_filenames_chemo[0] + " " + str(event_chemo[0]) + " " + str(survival_time_chemo[0]))
print(final_filenames_chemo[50] + " " + str(event_chemo[50]) + " " + str(survival_time_chemo[50]))

../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-VS-A957.csv False 9
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-IR-A3L7.csv False 119


In [9]:
print(final_filenames_radio[0] + " " + str(event_radio[0]) + " " + str(survival_time_radio[0]))
print(final_filenames_radio[50] + " " + str(event_radio[50]) + " " + str(survival_time_radio[50]))

../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-VS-A8QC.csv True 2949
../../tcga_cervix_cancer/collagen_feature_maps_200_final/TCGA-EX-A69L.csv False 1320


In [39]:
# generate training set for training model
features = []
for index in range(0, len(final_filenames_radio)):
    features.append(final_collagen_features_radio[index] + final_til_features_radio[index])
    #features.append(final_collagen_features_radio[index])
    #features.append(final_til_features[index])
print(len(features))
print(len(features[0]))

128
34


In [40]:
# final training information to be used for training model
features = np.array(features)
y = np.array(y_radio)
event = np.array(event_radio)
survival_time = np.array(survival_time_radio)

In [44]:
# main code for training
iter_scores = []
max_score = -1
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
for iter in range(100):
    model_score = []
    kf = KFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(features):
        # get the training and validation data
        features_train, features_test = features[train_index], features[test_index]
        y_train, y_test = y[train_index], y[test_index]
        event_train, survival_time_train = event[train_index], survival_time[train_index]
        event_test, survival_time_test = event[test_index], survival_time[test_index]
        y_train = np.array([tuple(row) for row in y_train], dtype=dt)
        y_test = np.array([tuple(row) for row in y_test], dtype=dt)
        
        # feature selection
        scaler = MinMaxScaler()
        features_train = scaler.fit_transform(features_train)
        features_test = scaler.transform(features_test)
        select = SelectKBest(score_func=chi2, k=len(features[0])-20)
        features_train_selected = select.fit_transform(features_train, survival_time_train)
        features_test_selected = select.transform(features_test)
        features_train_df = pd.DataFrame(features_train)
        features_test_df = pd.DataFrame(features_test)
        
        # fit model
        estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.005)
        estimator.fit(features_train_df, y_train)
        
        # score on validation set
        score, _, _, _, _ = concordance_index_censored(event_test, survival_time_test, estimator.predict(features_test_df))
        model_score.append(score)
        if score > max_score:
            max_score = score
    
    if len(model_score) > 0:
        iter_scores.append(np.mean(model_score))
        max_score = max(max(model_score), max_score)
print(np.mean(iter_scores), np.std(iter_scores))
print(max_score)

0.6659333252914236 0.0449889677995397
1.0


In [46]:
# model to be used for external validation
features_train = features
y_train = y
event_train, survival_time_train = event, survival_time
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
        
# feature selection
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
select = SelectKBest(score_func=chi2, k=len(features[0])-20)
features_train_selected = select.fit_transform(features_train, survival_time_train)
features_train_df = pd.DataFrame(features_train)
        
# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.005)
estimator.fit(features_train_df, y_train)

CoxnetSurvivalAnalysis(alpha_min_ratio=0.005, l1_ratio=0.9)

In [47]:
# find prognostic features from model trained above
count = 0
for index1 in range(0, len(estimator.coef_)):
    flag = -1
    for index2 in range(0, len(estimator.coef_[index1])):
        if estimator.coef_[index1][index2] > 0:
            flag = 1
            print(index1)
            break
    if flag == 1:
        count += 1
print()
print("Prognostic features count = " + str(count))

0
1
3
6
9
11
15
19
20
21
23
25
29
32

Prognostic features count = 14


In [None]:
# work with test data
test_oc_files = (glob.glob("../results/oc_collagen_features/test_window_1/*"))
print(len(test_oc_files))

In [None]:
# collect test features
is_ovarian_cancer = 1
if is_ovarian_cancer:
    test_collagen_features = []
    for file in test_oc_files:
        filename = file.split("/")[-1]
        flag = -1
        file_features = []
        with open(file, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_2/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_3/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_4/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_5/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_6/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_7/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_8/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_9/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
        test_collagen_features.append(file_features)

In [None]:
test_til_features = []
test_filenames = []
flag = -1
with open("../results/DATA_UPMC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if flag == -1:
            flag = 1
            print(row)
        else:
            array = row
            test_filenames.append(array[0])
            f_features = [float(array[1]), float(array[2]), float(array[3]), float(array[4]), float(array[5]), float(array[6]), float(array[7])]
            test_til_features.append(f_features)

final_test_til_features = []
for file in test_oc_files:
    count = 0
    filename1 = file.split("/")[-1][:-7]
    for filename in test_filenames:
        filename2 = filename
        if filename1 == filename2:
            final_test_til_features.append(test_til_features[count])
        count += 1
print(len(final_test_til_features))

In [None]:
# generate training set for testing model
test_features = []
for index in range(0, len(test_oc_files)):
    test_features.append(final_test_til_features[index] + test_collagen_features[index])
    #test_features.append(test_collagen_features[index])
    #test_features.append(final_test_til_features[index])
print(len(test_features))
print(len(test_features[0]))

In [None]:
# run on test set
features_train = features
features_test = test_features
y_train = y
event_train, survival_time_train = event, survival_time
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
select = SelectKBest(score_func=chi2, k=len(features[0])-4)
features_train_selected = select.fit_transform(features_train, survival_time_train)
features_test_selected = select.transform(features_test)
features_train_df = pd.DataFrame(features_train_selected)
features_test_df = pd.DataFrame(features_test_selected)
        
# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.001)
estimator.fit(features_train_df, y_train)

# get risk scores
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

median = np.median(train_risk_scores)
count_low = 0
count_high = 0
for index in range(0, len(test_risk_scores)):
    if test_risk_scores[index] > median:
        count_high += 1
    else:
        count_low += 1
print(count_low)
print(count_high)

In [None]:
print(count_high)
for index in range(0, len(test_oc_files)):
    if test_risk_scores[index] > median:
        print(test_oc_files[index])

row_1 = []
row_2 = []
row_3 = []
row_4 = []
for index in range(0, len(test_oc_files)):
    filename = test_oc_files[index].split("/")[-1][:-8]
    flag = 0
    flag_category = "low"
    if test_risk_scores[index] > median:
        flag = 1
        flag_category = "high"
    row_1.append(str(filename))
    row_2.append(str(test_risk_scores[index]))
    row_3.append(str(flag))
    row_4.append(str(flag_category))
    
with open("../results/upmc_oc_collagen.csv", 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(["Patient Id", "Risk Score", "Risk (Numerical)", "Risk (Category)"])
    for index in range(0, len(test_oc_files)):    
        spamwriter.writerow([row_1[index], row_2[index], row_3[index], row_4[index]])

files = glob.glob("../results/upmc_oc_collagen_features/*")
print(len(files))

count_high = 0
count_low = 0
for file in files:
    flag = -1
    with open(file, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = row
                flag = 1
            else:
                array = row
                if int(array[1]) == 0:
                    count_low += 1
                else:
                    count_high += 1
print(count_low)
print(count_high)

In [None]:
f = []
for feature in features:
    f.append(feature)
for test_feature in test_features:
    f.append(test_feature)
print(len(f))

In [None]:
features_train_df = pd.DataFrame(f)

In [None]:
features_train_df

In [None]:
corr = features_train_df.corr().abs()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr,cmap='coolwarm', vmin=0, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,len(features_train_df.columns),1)
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(features_train_df.columns)
ax.set_yticklabels(features_train_df.columns)
plt.show()

In [None]:
corr

In [None]:
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
new_data = features_train_df.drop(features_train_df[to_drop], axis=1)

In [None]:
print(features_train_df.shape)
new_data.shape