In [1]:
# header files
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
from datetime import datetime
# NOTE(review): this opens a new 10x10-inch figure right after the imports; the
# cell output shows it is rendered empty ("0 Axes"). Presumably the intent was
# to set a default figure size for later plots - confirm.
plt.figure(figsize=(10,10))
# simple banner confirming the import cell ran to completion
print("Header files loaded!")

Header files loaded!


<Figure size 720x720 with 0 Axes>

In [2]:
# hyper-parameters
# Cohort switches (1 = enabled, 0 = disabled); later cells test them with `if`.
is_ovarian_cancer, is_cervix_cancer, is_endometrial_cancer = 1, 0, 0

In [3]:
# load ovarian cancer files
# The window_1 directory serves as the index of available feature files; the
# other windows are looked up by file name later on.
if is_ovarian_cancer:
    oc_pattern = "../results/oc_collagen_features/window_1/*"
    oc_files = glob.glob(oc_pattern)
    print(len(oc_files))

95


In [4]:
# collect features
# For every ovarian-cancer file, concatenate the values stored for window_1 ...
# window_9 into a single flat feature vector (one entry of collagen_features
# per file, in the same order as oc_files).
if is_ovarian_cancer:
    collagen_features = []
    for file in oc_files:
        filename = file.split("/")[-1]
        # window_1 is read through the globbed path itself; windows 2-9 hold a
        # CSV with the same file name in sibling directories.
        window_paths = [file] + [
            "../results/oc_collagen_features/window_" + str(window) + "/" + filename
            for window in range(2, 10)
        ]
        file_features = []
        for window_path in window_paths:
            with open(window_path, newline='') as csvfile:
                for row in csv.reader(csvfile):
                    # every row contributes all of its columns except the last one
                    # (presumably an empty trailing field - TODO confirm)
                    file_features.extend(float(value) for value in row[:-1])
        collagen_features.append(file_features)

In [5]:
# create output survival information for training model and get til features
if is_ovarian_cancer:
    # Column positions in DATA_OC.csv (they match the header row printed below).
    TIL_COLUMNS = range(1, 8)   # 'SF1' ... 'SF7'
    CENSOR_COLUMN = 20          # 'censor'
    DAYS_COLUMN = 23            # 'OS_days'

    til_features = []
    censor = []
    days = []
    filenames = []
    with open("../results/DATA_OC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        header = next(spamreader, None)
        if header is not None:
            print(header)
        for array in spamreader:
            filenames.append(array[0])
            til_features.append([float(array[column]) for column in TIL_COLUMNS])
            # NOTE(review): this value is used downstream as the event indicator
            # ('Status'); confirm that 1 in the 'censor' column means "event observed".
            censor.append(bool(int(array[CENSOR_COLUMN])))
            days.append(int(array[DAYS_COLUMN]))

    # patient name -> positions of its rows in the CSV
    rows_by_name = {}
    for position, name in enumerate(filenames):
        rows_by_name.setdefault(name, []).append(position)

    final_til_features = []
    y = []
    event = []
    survival_time = []
    for file in oc_files:
        # file name without its 4-character extension
        patient = file.split("/")[-1][:-4]
        matches = rows_by_name.get(patient, [])
        # Exactly one clinical row per feature file is required: anything else
        # would silently shift y/event/survival_time against collagen_features,
        # which are later paired purely by position.
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one row in DATA_OC.csv for '" + patient
                + "', found " + str(len(matches))
            )
        position = matches[0]
        final_til_features.append(til_features[position])
        y.append([censor[position], days[position]])
        event.append(censor[position])
        survival_time.append(days[position])
    print(len(final_til_features))
    print(len(y))
    print(len(event))
    print(len(survival_time))

['patient_name', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7', 'cont_risk_score', 'binary_risk_score', 'WSI_Width', 'WSI_Height', 'year_of_birth', 'race', 'year_of_death', 'vital_status', 'Organ', 'treatment_type', 'Age', 'TTE', 'censor', 'Site', 'stage', 'OS_days', 'Vital', 'stage_numeric']
95
95
95
95


In [6]:
# generate training set for training model
# One row per ovarian-cancer file. Only the collagen features are used here;
# the TIL features (final_til_features) are not part of the matrix.
features = [collagen_features[position] for position in range(len(oc_files))]
print(len(features))
print(len(features[0]))

95
27


In [7]:
# final training information to be used for training model
# convert the python lists built above into numpy arrays
features, y, event, survival_time = map(np.array, (features, y, event, survival_time))

In [8]:
# model to be used for external validation
features_train = features
y_train = y
event_train, survival_time_train = event, survival_time
# structured dtype (event indicator, time) for the survival target
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)

# feature scaling: rescale every feature to [0, 1] on the training data
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_train_df = pd.DataFrame(features_train)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.1)
estimator.fit(features_train_df, y_train)

CoxnetSurvivalAnalysis(alpha_min_ratio=0.1, l1_ratio=0.9)

In [9]:
# find prognostic features from model trained above
# A row of estimator.coef_ is reported when at least one of its entries is
# strictly positive (rows are presumably features and columns the alphas of the
# regularisation path - TODO confirm).
# NOTE(review): rows that only ever have negative coefficients are not counted.
has_positive_coef = (np.asarray(estimator.coef_) > 0).any(axis=1)
for feature_index in np.flatnonzero(has_positive_coef):
    print(feature_index)
count = int(has_positive_coef.sum())
print()
print("Prognostic features count = " + str(count))

1
11
12
16
17
19
21
23
25

Prognostic features count = 9


In [10]:
# hyper-parameters
# strptime pattern used for the clinical date columns (month/day/4-digit year)
date_format = "%m/%d/%Y"
# cohort switches (1 = enabled, 0 = disabled)
is_ovarian_cancer, is_cervix_cancer, is_endometrial_cancer = 1, 0, 0

def mean(a):
    """Return the arithmetic mean of the sized collection `a`.

    Raises ZeroDivisionError when `a` is empty.
    """
    total = sum(a)
    size = len(a)
    return total / size

In [12]:
# Chemotherapy-only cohort from uh_ec.csv: CTx is "1" or "?" and RTx is "0" or empty.
# For each selected patient keep the ID (column 0), the event flag and the
# follow-up time in days counted from the diagnosis date (column 1).
test_filenames_chemo, test_censor_chemo, test_days_chemo = [], [], []
with open("../../../Desktop/uh_ec.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    for row_number, array in enumerate(csv.reader(csvfile)):
        if row_number == 0:
            # header row: show it and move on
            print(array)
            continue

        # rows without a diagnosis date cannot yield a follow-up time
        if array[1] == "":
            continue

        if array[7] in ("1", "?") and array[8] in ("0", ""):
            test_filenames_chemo.append(array[0])
            # 'Date of death' == "0" -> no death recorded, time runs to 'Last follow up'
            died = array[4] != "0"
            test_censor_chemo.append(died)
            end_column = 4 if died else 2
            first_date = datetime.strptime(str(array[1]), date_format)
            last_date = datetime.strptime(str(array[end_column]), date_format)
            test_days_chemo.append((last_date - first_date).days)
print(len(test_filenames_chemo))
print(len(test_censor_chemo))
print(len(test_days_chemo))

['Copath', 'Diagnosis date', 'Last follow up', 'Date recurred ', 'Date of death', 'BMI', 'BMI_binary [1=obese]', 'CTx', 'RTx [1=VB,2=EBRT,3=palliative]', 'Comorb_HTN', 'Comorb_DM', 'Comorb_Hyperlip ']
9
9
9


In [13]:
# Radiotherapy-only cohort from uh_ec.csv: CTx is "0" and RTx is anything but "0".
# NOTE(review): an empty RTx value passes this filter, whereas the chemo-only
# cell treats an empty RTx as "no radiotherapy" - confirm this is intended.
test_filenames_radio, test_censor_radio, test_days_radio = [], [], []
with open("../../../Desktop/uh_ec.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    for row_number, array in enumerate(csv.reader(csvfile)):
        if row_number == 0:
            # header row: show it and move on
            print(array)
            continue

        # rows without a diagnosis date cannot yield a follow-up time
        if array[1] == "":
            continue

        if array[7] == "0" and array[8] != "0":
            test_filenames_radio.append(array[0])
            # 'Date of death' == "0" -> no death recorded, time runs to 'Last follow up'
            died = array[4] != "0"
            test_censor_radio.append(died)
            end_column = 4 if died else 2
            first_date = datetime.strptime(str(array[1]), date_format)
            last_date = datetime.strptime(str(array[end_column]), date_format)
            test_days_radio.append((last_date - first_date).days)
print(len(test_filenames_radio))
print(len(test_censor_radio))
print(len(test_days_radio))

['Copath', 'Diagnosis date', 'Last follow up', 'Date recurred ', 'Date of death', 'BMI', 'BMI_binary [1=obese]', 'CTx', 'RTx [1=VB,2=EBRT,3=palliative]', 'Comorb_HTN', 'Comorb_DM', 'Comorb_Hyperlip ']
16
16
16


In [24]:
# Chemo + radiotherapy cohort from uh_ec.csv: both CTx and RTx are neither "0" nor empty.
# For each selected patient keep the ID (column 0), the event flag and the
# follow-up time in days counted from the diagnosis date (column 1).
test_filenames_chemo_radio, test_censor_chemo_radio, test_days_chemo_radio = [], [], []
with open("../../../Desktop/uh_ec.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    for row_number, array in enumerate(csv.reader(csvfile)):
        if row_number == 0:
            # header row: show it and move on
            print(array)
            continue

        # rows without a diagnosis date cannot yield a follow-up time
        if array[1] == "":
            continue

        neither_is_zero = array[7] != "0" and array[8] != "0"
        if neither_is_zero and (array[7] != "" and array[8] != ""):
            test_filenames_chemo_radio.append(array[0])
            # 'Date of death' == "0" -> no death recorded, time runs to 'Last follow up'
            died = array[4] != "0"
            test_censor_chemo_radio.append(died)
            end_column = 4 if died else 2
            first_date = datetime.strptime(str(array[1]), date_format)
            last_date = datetime.strptime(str(array[end_column]), date_format)
            test_days_chemo_radio.append((last_date - first_date).days)
print(len(test_filenames_chemo_radio))
print(len(test_censor_chemo_radio))
print(len(test_days_chemo_radio))

['Copath', 'Diagnosis date', 'Last follow up', 'Date recurred ', 'Date of death', 'BMI', 'BMI_binary [1=obese]', 'CTx', 'RTx [1=VB,2=EBRT,3=palliative]', 'Comorb_HTN', 'Comorb_DM', 'Comorb_Hyperlip ']
38
38
38


In [25]:
# spot-check one chemo-only patient: ID, follow-up days, event flag
for series in (test_filenames_chemo, test_days_chemo, test_censor_chemo):
    print(series[5])

S12-12393
2198
False


In [26]:
# spot-check one radio-only patient: ID, follow-up days, event flag
for series in (test_filenames_radio, test_days_radio, test_censor_radio):
    print(series[5])

S07-9737
846
False


In [28]:
# spot-check one chemo+radio patient: ID, follow-up days, event flag
for series in (test_filenames_chemo_radio, test_days_chemo_radio, test_censor_chemo_radio):
    print(series[10])

S07-26423
2107
False


In [29]:
# list the endometrial-cancer feature files of the 200 folder; the other
# folders are addressed by file name when the features are collected
ec_pattern = "../../uh_endometrial_cancer/collagen_feature_maps_200_final/*"
test_ec_files = glob.glob(ec_pattern)
print(len(test_ec_files))

225


In [30]:
# Keep only the chemo+radio patients for which at least one feature file exists.
# NOTE(review): matching is a plain substring test of the patient ID against
# the file path, so an ID that is a prefix of another ID would also match -
# verify against the actual file naming.
test_final_filenames, test_final_censor, test_final_days = [], [], []
cohort = zip(test_filenames_chemo_radio, test_censor_chemo_radio, test_days_chemo_radio)
for patient_id, patient_event, patient_days in cohort:
    if any(patient_id in path for path in test_ec_files):
        test_final_filenames.append(patient_id)
        test_final_censor.append(patient_event)
        test_final_days.append(patient_days)
print(len(test_final_filenames))
print(len(test_final_censor))
print(len(test_final_days))

27
27
27


In [32]:
# collect features
# For every selected patient: build one feature vector per matching slide file
# by concatenating the values of the 200, 250, ..., 600 folders, then average
# the slide vectors column-wise into a single patient-level vector.
test_collagen_features = []
for file in test_final_filenames:
    file_features = []

    for file_1 in test_ec_files:
        # every file whose path contains the patient ID counts as one of its slides
        if file in file_1:
            filename = file_1.split("/")[-1]
            # the 200 folder is read through the globbed path itself; the other
            # folders hold a CSV with the same file name
            window_paths = [file_1] + [
                "../../uh_endometrial_cancer/collagen_feature_maps_" + str(size) + "_final/" + filename
                for size in range(250, 601, 50)
            ]
            slide_features = []
            for window_path in window_paths:
                with open(window_path, newline='') as csvfile:
                    for row in csv.reader(csvfile):
                        # every row contributes all of its columns except the last one
                        # (presumably an empty trailing field - TODO confirm)
                        slide_features.extend(float(value) for value in row[:-1])
            file_features.append(slide_features)

    # column-wise mean over all slides of this patient
    patient_features = [sum(col) / float(len(col)) for col in zip(*file_features)]
    test_collagen_features.append(patient_features)
print(len(test_collagen_features))
print(len(test_collagen_features[0]))

27
27


In [33]:
# survival targets of the test cohort: event flags, follow-up days and the
# paired [event, days] rows
test_event = list(test_final_censor)
test_survival_time = list(test_final_days)
test_y = [[status, n_days] for status, n_days in zip(test_event, test_survival_time)]
print(len(test_y))

27


In [34]:
# generate the feature matrix of the test cohort: one row per selected
# patient, collagen features only
test_features = [
    test_collagen_features[position] for position in range(len(test_final_filenames))
]
print(len(test_features))
print(len(test_features[0]))
print(test_features[0])

27
27
[1.9329666666666665, 0.2464666666666667, 2.2958666666666665, 1.9857666666666667, 0.3131933333333334, 2.2970666666666664, 2.021733333333333, 0.3066466666666667, 2.2968333333333333, 2.0467999999999997, 0.3766966666666667, 2.2959666666666667, 2.0687333333333333, 0.44312666666666667, 2.2950999999999997, 2.0825666666666667, 0.36338, 2.2917666666666663, 2.0981, 0.42194000000000004, 2.2913, 2.1118666666666663, 0.4701633333333333, 2.2914333333333334, 2.1183666666666667, 0.43571999999999994, 2.290366666666667]


In [36]:
# final test-cohort information: convert the python lists into numpy arrays
test_features, test_y, test_event, test_survival_time = map(
    np.array, (test_features, test_y, test_event, test_survival_time)
)

In [47]:
# run on test set
features_train = features
features_test = test_features
y_train = y
event_train, survival_time_train = event, survival_time
# structured dtype (event indicator, time) for the survival target
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)

# scale with statistics of the training data only, then apply to the test data
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.1)
estimator.fit(features_train_df, y_train)

# get risk scores (predicted once and reused for the metric and the grouping)
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

# concordance index on the test cohort (first element of the returned tuple)
score = concordance_index_censored(test_event, test_survival_time, test_risk_scores)[0]
print(score)

# split the test patients at the median TRAINING risk score:
# 1 = high risk (score above the median), 0 = low risk
median = np.median(train_risk_scores)
is_high_risk = np.asarray(test_risk_scores) > median
group = is_high_risk.astype(int).tolist()
count_high = int(is_high_risk.sum())
count_low = len(group) - count_high
print(count_low)
print(count_high)
print(len(group))

0.4364406779661017
14
13
27


In [48]:
# event indicator of the test cohort as 0/1, printed as a "; "-separated line
a = [0 if status == False else 1 for status in test_event]
print(*a, sep="; ")

1; 0; 0; 1; 1; 0; 0; 1; 1; 0; 0; 1; 0; 1; 1; 0; 1; 0; 1; 0; 1; 0; 1; 1; 0; 0; 1


In [49]:
# follow-up days of the test cohort as a "; "-separated line
print("; ".join(str(n_days) for n_days in test_survival_time))

381; 4356; 2107; 839; 872; 148; 3716; 2753; 1931; 2972; 2454; 577; 3004; 1476; 1391; 1751; 1804; 197; 904; 2178; 1679; 2218; 1486; 553; 2310; 2211; 639


In [50]:
# risk group (1 = high, 0 = low) of every test patient as a "; "-separated line
print("; ".join(str(label) for label in group))

0; 0; 0; 0; 0; 1; 1; 1; 1; 1; 0; 0; 1; 0; 1; 1; 0; 0; 1; 0; 0; 0; 1; 1; 1; 1; 0


In [None]:
# List the test patients classified as high risk.
# test_risk_scores is aligned with test_final_filenames (the features were
# collected in that order), so the two are paired by position.
print(count_high)
for patient_id, risk_score in zip(test_final_filenames, test_risk_scores):
    if risk_score > median:
        print(patient_id)

In [None]:
import umap

In [None]:
# Stack training and test feature rows for a joint embedding and colour them
# by origin: red (1, 0, 0) = training rows, green (0, 1, 0) = test rows.
f = list(features) + list(test_features)
c = [(1, 0, 0)] * len(features) + [(0, 1, 0)] * len(test_features)
# optional extra points to project onto the fitted embedding (left empty here)
f_1 = []
c_1 = []

In [None]:
# fit a UMAP embedding on the stacked feature rows (fixed seed for repeatability)
trans = umap.UMAP(n_neighbors=5, random_state=42)
trans.fit(f)

In [None]:
# Project the optional extra points onto the fitted embedding. f_1 is empty
# unless it is filled in above, and transforming an empty list raises, so an
# empty (0, n_components) array is used in that case.
if len(f_1) > 0:
    f_1_transform = trans.transform(f_1)
else:
    f_1_transform = np.empty((0, trans.embedding_.shape[1]))

In [None]:
# Scatter plot of the joint embedding. `c` already holds explicit RGB tuples
# (red = training rows, green = test rows), so no colormap is passed: it would
# be ignored and only trigger a warning.
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=5, c=c)
#plt.scatter(f_1_transform[:, 0], f_1_transform[:, 1], s= 5, c=c_1)
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.title("UMAP Embedding");

In [None]:
# dump the training feature matrix (one row per ovarian-cancer file) for inspection
print(features)

In [None]:
# dump the test feature matrix (one row per selected test patient) for inspection
print(test_features)