In [1]:
# header files
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
print("Header files loaded!")

Header files loaded!


In [2]:
# hyper-parameters
is_ovarian_cancer = 1
is_cervix_cancer = 0
is_endometrial_cancer = 0

In [3]:
# load ovarian cancer files
if is_ovarian_cancer:
    oc_files = (glob.glob("../results/oc_collagen_features/window_1/*"))
    print(len(oc_files))

95


In [4]:
# collect features
if is_ovarian_cancer:
    collagen_features = []
    for file in oc_files:
        filename = file.split("/")[-1]
        flag = -1
        file_features = []
        with open(file, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_2/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_3/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_4/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_5/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_6/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_7/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_8/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/window_9/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
        collagen_features.append(file_features)

In [6]:
# create output survival information for training model and get til features
if is_ovarian_cancer:
    til_features = []
    censor = []
    days = []
    filenames = []
    flag = -1
    with open("../results/DATA_OC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                filenames.append(array[0])
                f_features = [float(array[1]), float(array[2]), float(array[3]), float(array[4]), float(array[5]), float(array[6]), float(array[7])]
                til_features.append(f_features)
                censor.append(bool(int(array[20])))
                days.append(int(array[23]))
                
    final_til_features = []
    y = []
    event = []
    survival_time = []
    final_oc_files = []
    for file in oc_files:
        count = 0
        filename1 = file.split("/")[-1][:-4]
        for filename in filenames:
            filename2 = filename
            if filename1 == filename2:
                if days[count] < 200 and censor[count] == 1:
                    final_oc_files.append(filename1)
                    final_til_features.append(til_features[count])
                    y.append([censor[count], days[count]])
                    event.append(censor[count])
                    survival_time.append(days[count])
                
                if days[count] > 300:
                    final_oc_files.append(filename1)
                    final_til_features.append(til_features[count])
                    y.append([censor[count], days[count]])
                    event.append(censor[count])
                    survival_time.append(days[count])
            count += 1
    print(len(final_oc_files))
    print(len(final_til_features))
    print(len(y))
    print(len(event))
    print(len(survival_time))

['patient_name', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7', 'cont_risk_score', 'binary_risk_score', 'WSI_Width', 'WSI_Height', 'year_of_birth', 'race', 'year_of_death', 'vital_status', 'Organ', 'treatment_type', 'Age', 'TTE', 'censor', 'Site', 'stage', 'OS_days', 'Vital', 'stage_numeric']
85
85
85
85
85


In [7]:
# generate training set for training model
features = []
for index in range(0, len(final_oc_files)):
    #features.append(final_til_features[index]+collagen_features[index])
    features.append(collagen_features[index])
    #features.append(final_til_features[index])
print(len(features))
print(len(features[0]))

85
27


In [8]:
# final training information to be used for training model
features = np.array(features)
y = np.array(y)
event = np.array(event)
survival_time = np.array(survival_time)

In [10]:
# main code for training
iter_scores = []
max_score = -1
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
for iter in range(100):
    model_score = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(features):
        # get the training and validation data
        features_train, features_test = features[train_index], features[test_index]
        y_train, y_test = y[train_index], y[test_index]
        event_train, survival_time_train = event[train_index], survival_time[train_index]
        event_test, survival_time_test = event[test_index], survival_time[test_index]
        y_train = np.array([tuple(row) for row in y_train], dtype=dt)
        y_test = np.array([tuple(row) for row in y_test], dtype=dt)
        
        # feature selection
        scaler = MinMaxScaler()
        features_train = scaler.fit_transform(features_train)
        features_test = scaler.transform(features_test)
        select = SelectKBest(score_func=chi2, k=len(features[0])-4)
        features_train_selected = select.fit_transform(features_train, survival_time_train)
        features_test_selected = select.transform(features_test)
        features_train_df = pd.DataFrame(features_train_selected)
        features_test_df = pd.DataFrame(features_test_selected)
        
        # fit model
        estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.001)
        estimator.fit(features_train_df, y_train)
        
        # score on validation set
        score, _, _, _, _ = concordance_index_censored(event_test, survival_time_test, estimator.predict(features_test_df))
        model_score.append(score)
        if score > max_score:
            max_score = score
    
    if len(model_score) > 0:
        iter_scores.append(np.mean(model_score))
        max_score = max(max(model_score), max_score)
print(np.mean(iter_scores), np.std(iter_scores))
print(max_score)

0.5342332306564795 0.033626665305022174
1.0


In [None]:
# model to be used for external validation
features_train = features
y_train = y
event_train, survival_time_train = event, survival_time
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
        
# feature selection
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
select = SelectKBest(score_func=chi2, k=len(features[0])-4)
features_train_selected = select.fit_transform(features_train, survival_time_train)
features_train_df = pd.DataFrame(features_train_selected)
        
# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.001)
estimator.fit(features_train_df, y_train)

In [None]:
# find prognostic features from model trained above
count = 0
for index1 in range(0, len(estimator.coef_)):
    flag = -1
    for index2 in range(0, len(estimator.coef_[index1])):
        if estimator.coef_[index1][index2] > 0:
            flag = 1
            print(index1)
            break
    if flag == 1:
        count += 1
print()
print("Prognostic features count = " + str(count))

In [None]:
# work with test data
test_oc_files = (glob.glob("../results/oc_collagen_features/test_window_1/*"))
print(len(test_oc_files))

In [None]:
# collect test features
is_ovarian_cancer = 1
if is_ovarian_cancer:
    test_collagen_features = []
    for file in test_oc_files:
        filename = file.split("/")[-1]
        flag = -1
        file_features = []
        with open(file, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_2/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_3/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_4/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_5/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_6/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_7/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_8/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
                    
        with open("../results/oc_collagen_features/test_window_9/" + filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = row
                    for index in range(0, len(array)-1):
                        file_features.append(float(array[index]))
        test_collagen_features.append(file_features)

In [None]:
test_til_features = []
test_filenames = []
flag = -1
with open("../results/DATA_UPMC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if flag == -1:
            flag = 1
            print(row)
        else:
            array = row
            test_filenames.append(array[0])
            f_features = [float(array[1]), float(array[2]), float(array[3]), float(array[4]), float(array[5]), float(array[6]), float(array[7])]
            test_til_features.append(f_features)

final_test_til_features = []
for file in test_oc_files:
    count = 0
    filename1 = file.split("/")[-1][:-7]
    for filename in test_filenames:
        filename2 = filename
        if filename1 == filename2:
            final_test_til_features.append(test_til_features[count])
        count += 1
print(len(final_test_til_features))

In [None]:
# generate training set for testing model
test_features = []
for index in range(0, len(test_oc_files)):
    test_features.append(final_test_til_features[index] + test_collagen_features[index])
    #test_features.append(test_collagen_features[index])
    #test_features.append(final_test_til_features[index])
print(len(test_features))
print(len(test_features[0]))

In [None]:
# run on test set
features_train = features
features_test = test_features
y_train = y
event_train, survival_time_train = event, survival_time
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
select = SelectKBest(score_func=chi2, k=len(features[0])-4)
features_train_selected = select.fit_transform(features_train, survival_time_train)
features_test_selected = select.transform(features_test)
features_train_df = pd.DataFrame(features_train_selected)
features_test_df = pd.DataFrame(features_test_selected)
        
# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.001)
estimator.fit(features_train_df, y_train)

# get risk scores
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

median = np.median(train_risk_scores)
count_low = 0
count_high = 0
for index in range(0, len(test_risk_scores)):
    if test_risk_scores[index] > median:
        count_high += 1
    else:
        count_low += 1
print(count_low)
print(count_high)

In [None]:
print(count_high)
for index in range(0, len(test_oc_files)):
    if test_risk_scores[index] > median:
        print(test_oc_files[index])

row_1 = []
row_2 = []
row_3 = []
row_4 = []
for index in range(0, len(test_oc_files)):
    filename = test_oc_files[index].split("/")[-1][:-8]
    flag = 0
    flag_category = "low"
    if test_risk_scores[index] > median:
        flag = 1
        flag_category = "high"
    row_1.append(str(filename))
    row_2.append(str(test_risk_scores[index]))
    row_3.append(str(flag))
    row_4.append(str(flag_category))
    
with open("../results/upmc_oc_collagen.csv", 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(["Patient Id", "Risk Score", "Risk (Numerical)", "Risk (Category)"])
    for index in range(0, len(test_oc_files)):    
        spamwriter.writerow([row_1[index], row_2[index], row_3[index], row_4[index]])

files = glob.glob("../results/upmc_oc_collagen_features/*")
print(len(files))

count_high = 0
count_low = 0
for file in files:
    flag = -1
    with open(file, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = row
                flag = 1
            else:
                array = row
                if int(array[1]) == 0:
                    count_low += 1
                else:
                    count_high += 1
print(count_low)
print(count_high)