In [1]:
# header files
%matplotlib inline
import glob
import cv2
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.svm import HingeLossSurvivalSVM
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
from datetime import datetime
date_format = "%m/%d/%y"
date_format_1 = "%m/%d/%Y"
plt.rcParams['figure.figsize'] = [4, 4]
print("Header files loaded!")

Header files loaded!


In [2]:
# load cervix cancer files
filenames = (glob.glob("../results/vanderbilt/predicted_features/*"))
print(len(filenames))

102


In [3]:
features = []
for file in filenames:
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("../results/vanderbilt/predicted_features/"+filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                file_features.append(float(array[0][1:len(array[0])-1]))
        features.append(file_features)
print(features)

[[0.0606729284998744], [0.1138422736652199], [0.1239245120249116], [0.1061073482705876], [0.06001054506628327], [0.190415573974464], [0.1431565519729632], [0.0652088635295261], [0.100165219451615], [0.0703204857048407], [0.0680334599459368], [0.0494236262920304], [0.1498280601899735], [0.1185624158135363], [0.1182900946189712], [0.1413033840697502], [0.0702000137782166], [0.0532305305742529], [0.1895549653613], [0.1068607453376163], [0.0692533568331758], [0.0399093810842701], [0.0695154373559627], [0.1952404266144438], [0.0344048472991731], [0.0212479494042479], [0.02224628984573954], [0.0855406077015651], [0.2263544041899789], [0.2276102004506198], [0.1251670657001812], [0.1309282917659454], [0.1035199803808187], [0.1226783223207302], [0.0642181936303003], [0.0644033458183203], [0.0393761673149047], [0.120574882661454], [0.1572327096109015], [0.128563945577692], [0.0231809850609083], [0.1138755613921677], [0.05306265460560019], [0.0931967265781526], [0.1101972205192613], [0.1223255642

In [4]:
# create output survival information for training model and get til features
if True:
    censor_clinical = []
    days_clinical = []
    filenames_clinical = []
    flag = -1
    c = 0
    with open("../data/vanderbilt.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                if len(array[12]) > 0 and array[12] != "#VALUE!":
                    filenames_clinical.append(array[0])
                    #days_clinical.append(float(array[12]))
                    
                    start_date = array[9]
                    
                    if array[14] == "":
                        end_date = array[11]
                    else:
                        end_date = array[14]
                       
                    d = date_format
                    if len(start_date.split("/")[-1]) == 4:
                        d = date_format_1
                        
                    d_1 = date_format
                    if len(end_date.split("/")[-1]) == 4:
                        d_1 = date_format_1
                    first_date = datetime.strptime(str(start_date), d)
                    last_date = datetime.strptime(str(end_date), d_1)
                    delta = last_date - first_date
                    days_clinical.append(float(delta.days))
                    
                    if array[16] == "":
                        array[16] = "1"
                    if int(array[16]) == 1:
                        censor_clinical.append(False)
                    else:
                        censor_clinical.append(True)
    print(len(filenames_clinical))
    print(len(days_clinical))
    print(len(censor_clinical))

['ï»¿Code #', 'Age', 'Sex', 'Race', 'Path Histo #', 'HPV RNA ISH', 'p16 IHC Score', 'p16 Binary 75', 'Treatment', 'Date of Diagnosis ', 'Treatment Start Date', 'Last follow up', 'F/U Length', 'F/U Length Months', 'Last Disease Free FU Date', 'Patient Status', 'Disease status ', 'Local Failure', 'LCRF Date', 'regional fail', 'date reg rec', 'Dist met', 'DM Date', 'Chemo', 'T-Stage', 'N-Stage', 'Overall Stage', 'Margins', 'Smoking?', 'Drinking?', 'ACE-27 Comorbidity Score']
274
274
274


In [6]:
final_filenames = []
final_features = []
final_days = []
final_censor = []

for index in range(0, len(filenames)):
    filename = filenames[index].split("/")[-1][:-4]
    
    count = 0
    for file in filenames_clinical:
        if filename == file:
            final_filenames.append(filename)
            final_features.append(features[index])
            final_days.append(days_clinical[count])
            final_censor.append(censor_clinical[count])
        count += 1
print(len(final_filenames))
print(len(final_features))
print(len(final_days))
print(len(final_censor))

102
102
102
102


In [7]:
final_features = np.array(final_features)
final_censor = np.array(final_censor)
final_days = np.array(final_days)

In [8]:
print(*(final_days), sep="; ")

2544.0; 503.0; 1405.0; 2926.0; 979.0; 2251.0; 156.0; 160.0; 2007.0; 3325.0; 1434.0; 2007.0; 3060.0; 1277.0; 1518.0; 3968.0; 394.0; 610.0; 623.0; 2357.0; 432.0; 1610.0; 96.0; 225.0; 1583.0; 72.0; 2160.0; 699.0; 28.0; 2296.0; 1057.0; 421.0; 756.0; 763.0; 517.0; 1041.0; 1782.0; 836.0; 996.0; 440.0; 673.0; 427.0; 804.0; 803.0; 1671.0; 917.0; 500.0; 775.0; 2274.0; 73.0; 410.0; 1463.0; 1176.0; 1547.0; 2268.0; 1392.0; 2620.0; 393.0; 699.0; 65.0; 1164.0; 70.0; 1840.0; 906.0; 2087.0; 2714.0; 496.0; 1041.0; 377.0; 464.0; 267.0; 666.0; 3545.0; 4280.0; 3120.0; 2647.0; 1722.0; 1581.0; 1368.0; 1563.0; 1762.0; 3039.0; 3180.0; 2624.0; 316.0; 422.0; 2100.0; 246.0; 2883.0; 1774.0; 2088.0; 795.0; 575.0; 2945.0; 2641.0; 1449.0; 1006.0; 208.0; 473.0; 117.0; 385.0; 844.0


In [9]:
a = []
for index in range(0, len(final_censor)):
    if final_censor[index] == False:
        a.append(0)
    else:
        a.append(1)
print(*a, sep="; ")

0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 0; 0; 1; 1; 0; 0; 0; 1; 0; 1; 0; 0; 0


In [11]:
group = []
for index in range(0, len(final_features)):
    if final_features[index, 0] >= 0.13:
        group.append(1)
    else:
        group.append(0)
print(*group, sep="; ")

0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 1; 0; 1; 1; 1; 0; 0; 1; 1; 0; 1; 1; 1; 1; 0; 1; 0


In [None]:
print(np.median(final_features))