In [1]:
# header files
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
from datetime import datetime
print("Header files loaded!")

Header files loaded!


In [2]:
# hyper-parameters
# NOTE(review): none of the names below are read by any cell visible in this
# notebook export - confirm whether they are still needed.
date_format = "%m/%d/%Y"  # strptime-style month/day/year pattern
is_ovarian_cancer = 1
is_cervix_cancer = 0
is_endometrial_cancer = 0

def mean(a):
    """Return the arithmetic mean of the numbers in ``a``.

    ``a`` must support both ``sum`` and ``len``; an empty sequence raises
    ``ZeroDivisionError``.
    """
    total = sum(a)
    count = len(a)
    return total / count

In [3]:
# Parse the clinical table into three parallel lists, one entry per kept row:
#   filenames - column 0 with its first and last character removed
#   censor    - False when column 26 ('OS_bin' in the printed header) is "1",
#               True otherwise
#   days      - column 22 ('OS_d' in the printed header) parsed as an integer
# NOTE(review): `censor` is later used as the event/Status indicator of the
# survival target - confirm that OS_bin == "1" really means "no event".
censor = []
days = []
filenames = []
with open("../../../Desktop/Yale_YTMA79_HEIF_clinical_info_v2.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    reader = csv.reader(csvfile)

    # The first row is the header: show it, but do not parse it as data.
    header = next(reader, None)
    if header is not None:
        print(header)

    for record in reader:
        # Rows with an empty column 1 or an "NA" in column 22 are dropped.
        if record[1] == "" or record[22] == "NA":
            continue

        filenames.append(str(record[0][1:-1]))
        censor.append(record[26] != "1")
        days.append(int(record[22]))
print(len(filenames))
print(len(censor))
print(len(days))

['TMA_label', 'Gender', 'Race', 'Diagnosis', 'DiagnosisDate', 'AgeatDiagnosis', 'AliveDeadCensor', 'AliveDeadStatus', 'DateLastSeen', 'DODCensor', 'DODStatus', 'FollowupDays', 'FollowupMonths', 'PrimaryStatus', 'Core', 'Stage', 'histology_subtype', 'smoking_status', 'CD4', 'CD20', 'CD8', 'OS_m', 'OS_d', 'DSS_censor', 'DSS_censor_bin', 'OS_Censor', 'OS_bin']
116
116
116


In [4]:
# Spot-check the first parsed clinical record: sample name, censor flag, days.
print(filenames[0])
print(censor[0])
print(days[0])

YTMA79-10_2_383_254
False
1376


In [5]:
# Spot-check the parsed clinical record at index 50: sample name, censor flag, days.
print(filenames[50])
print(censor[50])
print(days[50])

YTMA79-10_120_309_943
True
701


In [6]:
# Paths of every entry directly under macrophage_output_1; a later cell uses
# them to decide which clinical records are kept.
ec_files = (glob.glob("../../yale_lung_cancer/macrophage_output_1/*"))
print(len(ec_files))

90


In [7]:
# Paths of every entry directly under macrophage_output; a later cell reads
# the per-sample feature values from these files.
# NOTE(review): the cohort is filtered with macrophage_output_1 but features
# are read from macrophage_output - confirm the two directories are meant to
# be used together.
ec_files_1 = (glob.glob("../../yale_lung_cancer/macrophage_output/*"))
print(len(ec_files_1))

119


In [8]:
# Keep only the clinical records whose sample name occurs as a substring of at
# least one path in ec_files; the three output lists stay parallel.
final_filenames = []
final_censor = []
final_days = []
for position, sample_name in enumerate(filenames):
    if any(sample_name in path for path in ec_files):
        final_filenames.append(filenames[position])
        final_censor.append(censor[position])
        final_days.append(days[position])
print(len(final_filenames))
print(len(final_censor))
print(len(final_days))

87
87
87


In [9]:
# Dump the complete list of retained sample names.
print(final_filenames)

['YTMA79-10_2_383_254', 'YTMA79-10_3_486_245', 'YTMA79-10_4_614_221', 'YTMA79-10_5_715_219', 'YTMA79-10_7_914_216', 'YTMA79-10_8_1020_205', 'YTMA79-10_9_1122_196', 'YTMA79-10_12_1437_184', 'YTMA79-10_13_1550_179', 'YTMA79-10_20_1340_285', 'YTMA79-10_22_1126_303', 'YTMA79-10_31_289_460', 'YTMA79-10_35_715_422', 'YTMA79-10_36_819_423', 'YTMA79-10_38_1020_394', 'YTMA79-10_39_1132_402', 'YTMA79-10_43_1552_377', 'YTMA79-10_47_1667_470', 'YTMA79-10_50_1351_489', 'YTMA79-10_53_1025_500', 'YTMA79-10_56_711_529', 'YTMA79-10_57_603_534', 'YTMA79-10_58_506_527', 'YTMA79-10_59_402_547', 'YTMA79-10_65_715_625', 'YTMA79-10_68_1041_596', 'YTMA79-10_79_1460_680', 'YTMA79-10_86_720_722', 'YTMA79-10_87_612_729', 'YTMA79-10_89_407_738', 'YTMA79-10_92_403_848', 'YTMA79-10_95_720_820', 'YTMA79-10_97_937_801', 'YTMA79-10_110_1366_876', 'YTMA79-10_112_1147_888', 'YTMA79-10_114_927_904', 'YTMA79-10_120_309_943', 'YTMA79-10_121_307_1047', 'YTMA79-10_123_513_1031', 'YTMA79-10_125_735_1011', 'YTMA79-10_127_938_9

In [10]:
# collect features
# For every retained sample, read each file in ec_files_1 whose path contains
# the sample name, flatten ALL of its CSV rows into one float vector, and
# average those vectors column-wise across the matching files.
# Caveats visible in the code: zip(*...) truncates to the shortest vector, and
# a sample with no matching file contributes an empty feature list.
collagen_features = []
for sample_name in final_filenames:
    per_file_vectors = []

    for path in ec_files_1:
        if sample_name not in path:
            continue
        with open(path, newline='') as csvfile:
            values = [float(cell) for record in csv.reader(csvfile) for cell in record]
        per_file_vectors.append(values)

    averaged = [sum(column) / float(len(column)) for column in zip(*per_file_vectors)]
    collagen_features.append(averaged)
print(len(collagen_features))
print(len(collagen_features[0]))

87
3


In [11]:
# Dump every per-sample feature vector.
print(collagen_features)

[[0.176930606285435, 0.10047847889952152, 0.16838488972508592], [0.14777778777777778, 0.13757397449704142, 0.1489796018367347], [0.1623931723931624, 0.2276923176923077, 0.20246914580246914], [0.07744566217391304, 0.09430439842203547, 0.09079755601226994], [0.18166940443535187, 0.1848184918481848, 0.18634260259259258], [0.12477232329690345, 0.15418503202643172, 0.130879355603272], [0.16926771708283314, 0.07764706882352941, 0.15306123448979592], [0.22448980591836734, 0.22344323344322345, 0.23796792443850268], [0.12831859407079646, 0.07582939388625592, 0.11276949590381426], [0.07533113582781456, 0.14960630921259843, 0.07984497124031008], [0.20934580439252337, 0.1705989210707804, 0.1986970784039088], [0.22678572428571428, 0.13725491196078432, 0.1952941276470588], [0.22281168108753316, 0.14285715285714284, 0.20781894004115226], [0.21413722413721414, 0.13281251, 0.2022867294371152], [0.22774870109947642, 0.07980457026058631, 0.11356046424181696], [0.18487395957983194, 0.07352942176470588, 0.

In [12]:
# Build the survival targets from the filtered clinical lists:
#   event         - copy of final_censor
#   survival_time - the matching entries of final_days
#   y             - [event, survival_time] pairs, one per case
n_cases = len(final_censor)
event = [final_censor[case_idx] for case_idx in range(n_cases)]
survival_time = [final_days[case_idx] for case_idx in range(n_cases)]
y = [[event[case_idx], survival_time[case_idx]] for case_idx in range(n_cases)]
print(len(y))

87


In [13]:
# generate training set for training model
# Only the collagen_features vectors are used, one per retained sample. The
# original also carried disabled variants that mixed in `final_til_features`,
# a name that is not defined in any visible cell.
features = [collagen_features[case_idx] for case_idx in range(len(final_filenames))]
print(len(features))
print(len(features[0]))

87
3


In [14]:
# final training information to be used for training model
# Convert the assembled Python lists to NumPy arrays so the cross-validation
# loop can select rows with index arrays.
features, y, event, survival_time = [
    np.array(values) for values in (features, y, event, survival_time)
]

In [28]:
# main code for training
# Repeated 5-fold cross-validation of a Coxnet model on `features`.
# For each of the 100 repetitions the folds are reshuffled, the model is fitted
# on min-max-scaled training folds, and the concordance index is computed on
# the held-out fold. Reported: mean/std of the per-repetition mean score, and
# the best single-fold score seen (an optimistic figure, shown for reference).
iter_scores = []
max_score = -1
# Structured dtype expected for the survival target: (event flag, time).
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
for repeat in range(100):
    model_score = []
    # Seed each repetition with its own index: the splits still differ between
    # repetitions, but a Restart & Run All reproduces the same numbers.
    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
    for train_index, test_index in kf.split(features):
        # get the training and validation data
        features_train, features_test = features[train_index], features[test_index]
        event_test, survival_time_test = event[test_index], survival_time[test_index]
        y_train = np.array([tuple(row) for row in y[train_index]], dtype=dt)

        # scale features; the scaler is fitted on the training fold only
        scaler = MinMaxScaler()
        features_train_df = pd.DataFrame(scaler.fit_transform(features_train))
        features_test_df = pd.DataFrame(scaler.transform(features_test))

        # fit model
        estimator = CoxnetSurvivalAnalysis(l1_ratio=0.99, alpha_min_ratio=0.05)
        estimator.fit(features_train_df, y_train)

        # score on validation set (first element of the result is the c-index);
        # indexing instead of unpacking avoids rebinding `_` in the notebook
        score = concordance_index_censored(
            event_test, survival_time_test, estimator.predict(features_test_df)
        )[0]
        model_score.append(score)

    iter_scores.append(np.mean(model_score))
    max_score = max(max_score, max(model_score))
print(np.mean(iter_scores), np.std(iter_scores))
print(max_score)

0.5861737833246604 0.02790885308882465
0.8586956521739131


In [None]:
# model to be used for external validation
# Fit the Coxnet model on the complete `features` / `y` cohort (no hold-out).
features_train = features
event_train, survival_time_train = event, survival_time
# Structured dtype expected for the survival target: (event flag, time).
# Written as a plain assignment so no stray `dtype` global is created.
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y], dtype=dt)

# feature scaling (min-max), fitted on the full cohort
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_train_df = pd.DataFrame(features_train)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.99, alpha_min_ratio=0.05)
estimator.fit(features_train_df, y_train)

In [None]:
# find prognostic features from model trained above
# For each row of estimator.coef_, report the row index and the first strictly
# positive entry in that row, and count how many rows have one.
# NOTE(review): rows whose entries are all negative or zero are not counted -
# confirm that ignoring negative coefficients is intended.
count = 0
for feature_idx, coef_row in enumerate(estimator.coef_):
    first_positive = next((value for value in coef_row if value > 0), None)
    if first_positive is None:
        continue
    print(feature_idx)
    print(first_positive)
    count += 1
print()
print("Prognostic features count = " + str(count))

In [None]:
# Paths of every entry directly under macrophage_tcga_final; later cells read
# feature values from these files and match their names to clinical records.
oc_files = (glob.glob("../../tam_results/macrophage_tcga_final/*"))
print(len(oc_files))

In [None]:
# Read one feature vector per file in oc_files: every cell of every CSV row is
# parsed as a float and appended, so multi-row files are flattened in order.
test_tam_features = []
for path in oc_files:
    with open(path, newline='') as csvfile:
        vector = [float(cell) for record in csv.reader(csvfile) for cell in record]
    test_tam_features.append(vector)
print(test_tam_features)

In [None]:
# Sepideh OC spatial features: clinical table for the external cohort.
# Builds three parallel lists, one entry per data row:
#   test_filenames - column 0 as-is
#   test_censor    - column 20 parsed as int, then cast to bool
#   test_days      - column 23 parsed as int
# Disabled experiment kept from the original: zero column 20 for rows whose
# column 23 exceeds 1800.
test_censor = []
test_days = []
test_filenames = []
with open("../../DATA_OC.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    reader = csv.reader(csvfile)

    # The first row is the header: show it, but do not parse it as data.
    header = next(reader, None)
    if header is not None:
        print(header)

    for record in reader:
        test_filenames.append(record[0])
        test_censor.append(bool(int(record[20])))
        test_days.append(int(record[23]))

print(len(test_filenames))
print(len(test_censor))
print(len(test_days))

In [None]:
# Build the external-cohort targets in oc_files order. A file is matched to a
# clinical row when its base name minus the last four characters equals the
# row's name; every matching row contributes one entry.
test_y = []
test_event = []
test_survival_time = []
for path in oc_files:
    stem = path.split("/")[-1][:-4]
    for row_idx, clinical_name in enumerate(test_filenames):
        if clinical_name != stem:
            continue
        test_y.append([test_censor[row_idx], test_days[row_idx]])
        test_event.append(test_censor[row_idx])
        test_survival_time.append(test_days[row_idx])
print(len(test_y))
print(len(test_event))
print(len(test_survival_time))

In [None]:
# Assemble the external-cohort feature matrix, row-aligned with test_y.
# The labels (test_y / test_event / test_survival_time) contain one entry per
# (file, matching clinical row) pair, in oc_files order. Taking one feature row
# per file regardless of matches would shift features against labels as soon
# as a file has no clinical row or more than one. Emit each file's vector once
# per matching clinical row so both sides line up by construction.
test_features = []
for path, tam_vector in zip(oc_files, test_tam_features):
    stem = path.split("/")[-1][:-4]
    test_features.extend([tam_vector] * test_filenames.count(stem))

# Guard against silent misalignment between features and labels.
assert len(test_features) == len(test_y), "feature/label row count mismatch"
print(len(test_features))
print(len(test_features[0]))

In [None]:
# Convert the external-cohort lists to NumPy arrays for model fitting.
test_features, test_y, test_event, test_survival_time = [
    np.array(values)
    for values in (test_features, test_y, test_event, test_survival_time)
]

In [None]:
# Cross-cohort evaluation: fit on the external cohort (test_features / test_y)
# and score on the primary cohort (features / event / survival_time).
# NOTE(review): the original heading read "run on test set", yet the test_*
# arrays are what the model is fitted on - confirm this direction is intended.
features_train = test_features
features_test = features
event_train, survival_time_train = test_event, test_survival_time
# Structured dtype expected for the survival target: (event flag, time).
# Written as a plain assignment so no stray `dtype` global is created.
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in test_y], dtype=dt)

# Min-max scaling is fitted on the fitting cohort and reused on the other one.
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.99, alpha_min_ratio=0.05)
estimator.fit(features_train_df, y_train)

# get risk scores (computed once and reused for scoring and grouping)
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

# Concordance index on the evaluation cohort; indexing instead of unpacking
# avoids rebinding `_` in the notebook namespace.
score = concordance_index_censored(event, survival_time, test_risk_scores)[0]
print(score)

# Split the evaluation cohort at the median risk of the fitting cohort:
# 1 = strictly above the median (high risk), 0 = otherwise (low risk).
median = np.median(train_risk_scores)
group = [1 if risk > median else 0 for risk in test_risk_scores]
count_high = sum(group)
count_low = len(group) - count_high
print(count_low)
print(count_high)

In [None]:
# Print the event flags as 0/1 integers, separated by "; ".
a = [int(bool(status)) for status in event]
print(*a, sep="; ")

In [None]:
# Print the survival times on one line, separated by "; ".
print(*survival_time, sep="; ")

In [None]:
# Print the 0/1 risk-group labels on one line, separated by "; ".
print(*group, sep="; ")