In [1]:
# header files
%matplotlib inline
import glob
import csv
import cv2
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.svm import HingeLossSurvivalSVM
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
from datetime import datetime
date_format = "%m/%d/%Y"
plt.rcParams['figure.figsize'] = [4, 4]
print("Header files loaded!")

Header files loaded!


In [2]:
# load cervix cancer files
filenames = (glob.glob("../results/emory/predicted_features/*"))
print(len(filenames))

50


In [3]:
features = []
for file in filenames:
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("../results/emory/predicted_features/"+filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                file_features.append(float(array[0][1:len(array[0])-1]))
        features.append(file_features)
print(features)

[[0.0704916448811392], [0.245768002298372], [0.05948350225443637], [0.0638924931235889], [0.233745331867386], [0.1776222883728405], [0.238301218974691], [0.2072093317950069], [0.0749134213859705], [0.1881482016421886], [0.06206962327134], [0.2131763158173928], [0.2461792233956068], [0.2064917238432327], [0.2311045577005599], [0.1100206961743309], [0.0533581893214465], [0.1753827065720689], [0.113962039941698], [0.2108966338082036], [0.1187966796079487], [0.219796399395218], [0.0810295842208371], [0.1336852281901268], [0.167121085897264], [0.070416276384865], [0.1260489997805625], [0.1403205021397684], [0.137276532339119], [0.1745912623705549], [0.0869146270568658], [0.1204366583182438], [0.04946949029065239], [0.2202022522798837], [0.249281655283618], [0.335217222125273], [0.098096921855186], [0.25090921294698], [0.099786539002644], [0.1186853903538891], [0.165403355771731], [0.26849193351056], [0.1755319628093149], [0.1979304347395555], [0.1543520216015321], [0.1186857385806106], [0.1

In [4]:
# create output survival information for training model and get til features
if True:
    censor_clinical = []
    days_clinical = []
    filenames_clinical = []
    flag = -1
    c = 0
    prev = ""
    with open("../data/emory.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
            elif flag == 1:
                print(row)
                flag = 2
            else:
                array = row
                if array[0] != prev:
                    prev = array[0]
                    filenames_clinical.append(array[0][6:])
                    start_date = -1
                    end_date = -1
                    
                    if array[len(array)-8] == "NA" or array[len(array)-8] == "Unknown *":
                        end_date = array[len(array)-6]
                    else:
                        end_date = array[len(array)-8]
                    
                    if array[len(array)-12] == "Y":
                        start_date = array[len(array)-11]
                    elif array[len(array)-10] == "Y":
                        start_date = array[len(array)-9]
                    else:
                        start_date = array[len(array)-13]    
                    first_date = datetime.strptime(str(start_date), date_format)
                    last_date = datetime.strptime(str(end_date), date_format)
                    delta = last_date - first_date
                    days_clinical.append(np.abs(float(delta.days)))
                    
                    if array[len(array)-8] == "NA":
                        censor_clinical.append(False)
                    else:
                        censor_clinical.append(True)
    print(len(filenames_clinical))
    print(len(days_clinical))
    print(len(censor_clinical))

['Patient Project ID (PPID)', 'Gender', '', 'Ethnicity', 'Age at Surgery', 'Neoadjuvant Treatment Received Prior to Surgical Accession Date? (Y/N)', 'HPV Status (+/-)', 'Path Status', 'Anatomic Site', 'Laterality', 'Synoptic Histology', 'TNM Descriptor(s)', 'Primary Tumor (pT)', 'Regional Lymph Nodes (pN)', 'Distant Metastasis (pM)', 'Primary Tumor (cT)', 'Regional Lymph Nodes (cN)', 'Distant Metastasis (cM)', 'Surgery (Y/N)', 'Date of Surgery', 'Radiation (Y/N)', 'Radiation Start Date (use to calculate PFS/OS)', 'Chemotherapy (Y/N)', 'Chemotherapy Start Date (use to calculate PFS/OS)', 'Progression of Disease Date (use to calculate PFS/OS)', 'Death Date (use to calculate PFS/OS)', 'Last Follow up Date (use to calculate PFS/OS)', 'Metastasis (Y/N)', 'TMA', 'Row', 'Column', 'Notes']
112
112
112


In [5]:
final_filenames = []
final_features = []
final_days = []
final_censor = []

for index in range(0, len(filenames)):
    filename = filenames[index].split("/")[-1][:-4]
    
    count = 0
    for file in filenames_clinical:
        if filename == file:
            final_filenames.append(filename)
            final_features.append(features[index])
            final_days.append(np.abs(days_clinical[count]))
            final_censor.append(censor_clinical[count])
        count += 1
print(len(final_filenames))
print(len(final_features))
print(len(final_days))
print(len(final_censor))

50
50
50
50


In [6]:
final_features = np.array(final_features)
final_censor = np.array(final_censor)
final_days = np.array(final_days)

In [7]:
print(*(final_days), sep="; ")

978.0; 575.0; 344.0; 1121.0; 551.0; 701.0; 1183.0; 62.0; 947.0; 1022.0; 1064.0; 626.0; 455.0; 1100.0; 1616.0; 1525.0; 619.0; 1016.0; 332.0; 1128.0; 461.0; 791.0; 329.0; 315.0; 367.0; 1526.0; 1620.0; 322.0; 1050.0; 183.0; 305.0; 777.0; 665.0; 840.0; 1233.0; 1186.0; 495.0; 740.0; 584.0; 1150.0; 161.0; 1093.0; 7.0; 1149.0; 216.0; 758.0; 309.0; 782.0; 227.0; 154.0


In [8]:
a = []
for index in range(0, len(final_censor)):
    if final_censor[index] == False:
        a.append(0)
    else:
        a.append(1)
# a = [0; 1; 0; 0; 1; 1; 1; 1; 0; 1; 0; 1; 1; 1; 1; 1; 0; 1; 1; 1; 1; 1; 0; 1; 1; 0; 0; 0; 0; 1; 0; 0; 0; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 1]
print(*a, sep="; ")

0; 0; 0; 1; 1; 0; 1; 1; 1; 0; 0; 1; 0; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 1


In [9]:
group = []
for index in range(0, len(final_features)):
    if final_features[index, 0] >= 0.13:
        group.append(1)
    else:
        group.append(0)
print(*group, sep="; ")

0; 1; 0; 0; 1; 1; 1; 1; 0; 1; 0; 1; 1; 1; 1; 0; 0; 1; 0; 1; 0; 1; 0; 1; 1; 0; 0; 1; 1; 1; 0; 0; 0; 1; 1; 1; 0; 1; 0; 0; 1; 1; 1; 1; 1; 0; 1; 1; 0; 0
