In [1]:
# header files
%matplotlib inline
import glob
import cv2
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.svm import HingeLossSurvivalSVM
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
from datetime import datetime
date_format = "%m/%d/%y"
date_format_1 = "%m/%d/%Y"
plt.rcParams['figure.figsize'] = [4, 4]
print("Header files loaded!")

Header files loaded!


In [2]:
# load cervix cancer files
filenames = (glob.glob("results/vanderbilt/predicted_features_m/*"))
print(len(filenames))

102


In [3]:
features = []
for file in filenames:
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("results/vanderbilt/predicted_features_m/"+filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                #file_features.append(float(array[0][1:len(array[0])-1]))
                file_features.append(float(array[0][1:len(array[0])-1]))
        features.append(file_features)
print(features)

[[0.0606729284998744], [0.1138422736652199], [0.1239245120249116], [0.1061073482705876], [0.06001054506628327], [0.190415573974464], [0.1431565519729632], [0.0652088635295261], [0.100165219451615], [0.0703204857048407], [0.0680334599459368], [0.0494236262920304], [0.1498280601899735], [0.1185624158135363], [0.1182900946189712], [0.1413033840697502], [0.0702000137782166], [0.0532305305742529], [0.1895549653613], [0.1068607453376163], [0.0692533568331758], [0.0399093810842701], [0.0695154373559627], [0.1952404266144438], [0.0344048472991731], [0.0212479494042479], [0.02224628984573954], [0.0855406077015651], [0.2263544041899789], [0.2276102004506198], [0.1251670657001812], [0.1309282917659454], [0.1035199803808187], [0.1226783223207302], [0.0642181936303003], [0.0644033458183203], [0.0393761673149047], [0.120574882661454], [0.1572327096109015], [0.128563945577692], [0.0231809850609083], [0.1138755613921677], [0.05306265460560019], [0.0931967265781526], [0.1101972205192613], [0.1223255642

In [None]:
features_1 = []
for file in filenames:
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("results/vanderbilt/new_features_m/"+filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                #file_features.append(float(array[0][1:len(array[0])-1]))
                file_features.append(float(array[0][1:len(array[0])-1])+0.05)
        features_1.append(file_features)
print(features_1)

In [4]:
# create output survival information for training model and get til features
if True:
    censor_clinical = []
    days_clinical = []
    filenames_clinical = []
    flag = -1
    c = 0
    with open("vanderbilt_macrophages.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                if len(array[12]) > 0 and array[12] != "#VALUE!":
                    filenames_clinical.append(array[0])
                    #days_clinical.append(float(array[12]))
                    
                    start_date = array[9]
                    
                    if array[14] == "":
                        end_date = array[11]
                    else:
                        end_date = array[14]
                       
                    d = date_format
                    if len(start_date.split("/")[-1]) == 4:
                        d = date_format_1
                        
                    d_1 = date_format
                    if len(end_date.split("/")[-1]) == 4:
                        d_1 = date_format_1
                    first_date = datetime.strptime(str(start_date), d)
                    last_date = datetime.strptime(str(end_date), d_1)
                    delta = last_date - first_date
                    days_clinical.append(float(delta.days))
                    
                    if array[16] == "":
                        array[16] = "1"
                    if int(array[16]) == 1:
                        censor_clinical.append(False)
                    else:
                        censor_clinical.append(True)
    print(len(filenames_clinical))
    print(len(days_clinical))
    print(len(censor_clinical))

['ï»¿Code #', 'Age', 'Sex', 'Race', 'Path Histo #', 'HPV RNA ISH', 'p16 IHC Score', 'p16 Binary 75', 'Treatment', 'Date of Diagnosis ', 'Treatment Start Date', 'Last follow up', 'F/U Length', 'F/U Length Months', 'Last Disease Free FU Date', 'Patient Status', 'Disease status ', 'Local Failure', 'LCRF Date', 'regional fail', 'date reg rec', 'Dist met', 'DM Date', 'Chemo', 'T-Stage', 'N-Stage', 'Overall Stage', 'Margins', 'Smoking?', 'Drinking?', 'ACE-27 Comorbidity Score']
274
274
274


In [5]:
print(filenames_clinical)
print(days_clinical)
print(censor_clinical)

['OP1', 'OP2', 'OP3', 'OP4', 'OP5', 'OP6', 'OP7', 'OP8', 'OP9', 'OP10', 'OP11', 'OP12A', 'OP12B', 'OP13', 'OP14', 'OP15', 'OP16', 'OP17', 'OP18', 'OP19', 'OP20', 'OP21', 'OP22', 'OP23', 'OP24', 'OP25', 'OP26', 'OP27', 'OP28', 'OP29', 'OP30', 'OP31', 'OP32', 'OP33', 'OP34', 'OP35', 'OP36', 'OP37', 'OP38', 'OP39', 'OP40', 'OP41', 'OP42', 'OP43', 'OP44', 'OP45', 'OP46', 'OP47', 'OP48', 'OP49', 'OP50', 'OP51', 'OP52', 'OP53', 'OP54', 'OP55', 'OP56', 'OP57', 'OP58', 'OP59', 'OP60', 'OP61', 'OP62', 'OP63', 'OP64', 'OP65', 'OP66', 'OP67', 'OP68', 'OP69', 'OP70', 'OP71', 'OP72', 'OP73', 'OP74', 'OP75', 'OP76', 'OP77', 'OP78', 'OP79', 'OP80', 'OP81', 'OP82', 'OP83', 'OP84', 'OP85', 'OP86', 'OP87', 'OP88', 'OP89', 'OP90', 'OP91', 'OP92', 'OP93', 'OP94', 'OP95', 'OP96', 'OP97', 'OP98', 'OP99', 'OP100', 'OP101', 'OP102', 'OP103', 'OP104', 'OP105', 'OP106', 'OP107', 'OP108', 'OP109', 'OP110', 'OP111', 'OP112', 'OP113', 'OP114', 'OP115', 'OP116', 'OP117', 'OP118', 'OP119', 'OP120', 'OP121', 'OP122',

In [6]:
final_filenames = []
final_features = []
final_days = []
final_censor = []

for index in range(0, len(filenames)):
    filename = filenames[index].split("/")[-1][:-4]
    
    count = 0
    for file in filenames_clinical:
        if filename == file:
            final_filenames.append(filename)
            final_features.append(features[index])
            final_days.append(days_clinical[count])
            final_censor.append(censor_clinical[count])
        count += 1
print(len(final_filenames))
print(len(final_features))
print(len(final_days))
print(len(final_censor))

102
102
102
102


In [7]:
final_features = np.array(final_features)
final_censor = np.array(final_censor)
final_days = np.array(final_days)

In [8]:
print(filenames[10])
print(final_features[10])
print(final_censor[10])
print(final_days[10])

results/vanderbilt/predicted_features_m/OP136.csv
[0.06803346]
False
1434.0


In [9]:
print(len(final_days))
print(len(final_censor))
print(len(final_features))

102
102
102


In [10]:
print(*(final_days), sep="; ")

2544.0; 503.0; 1405.0; 2926.0; 979.0; 2251.0; 156.0; 160.0; 2007.0; 3325.0; 1434.0; 2007.0; 3060.0; 1277.0; 1518.0; 3968.0; 394.0; 610.0; 623.0; 2357.0; 432.0; 1610.0; 96.0; 225.0; 1583.0; 72.0; 2160.0; 699.0; 28.0; 2296.0; 1057.0; 421.0; 756.0; 763.0; 517.0; 1041.0; 1782.0; 836.0; 996.0; 440.0; 673.0; 427.0; 804.0; 803.0; 1671.0; 917.0; 500.0; 775.0; 2274.0; 73.0; 410.0; 1463.0; 1176.0; 1547.0; 2268.0; 1392.0; 2620.0; 393.0; 699.0; 65.0; 1164.0; 70.0; 1840.0; 906.0; 2087.0; 2714.0; 496.0; 1041.0; 377.0; 464.0; 267.0; 666.0; 3545.0; 4280.0; 3120.0; 2647.0; 1722.0; 1581.0; 1368.0; 1563.0; 1762.0; 3039.0; 3180.0; 2624.0; 316.0; 422.0; 2100.0; 246.0; 2883.0; 1774.0; 2088.0; 795.0; 575.0; 2945.0; 2641.0; 1449.0; 1006.0; 208.0; 473.0; 117.0; 385.0; 844.0


In [11]:
a = []
for index in range(0, len(final_censor)):
    if final_censor[index] == False:
        a.append(0)
    else:
        a.append(1)
print(*a, sep="; ")

0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 0; 0; 1; 1; 0; 0; 0; 1; 0; 1; 0; 0; 0


In [12]:
group = []
for index in range(0, len(final_features)):
    if final_features[index, 0] >= 0.13:
        group.append(1)
    else:
        group.append(0)
print(*group, sep="; ")

0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 1; 0; 1; 1; 1; 0; 0; 1; 1; 0; 1; 1; 1; 1; 0; 1; 0


In [None]:
print(final_filenames[92])
print(final_features[92, 0])

In [None]:
print(np.mean(final_features[:,0]))

In [None]:
for index in range(0, len(group)):
    if final_features[index, 0] < 0.04 and final_features[index, 0] > 0.03:
        print(final_filenames[index])

In [None]:
print(final_filenames[5])

In [None]:
high_risk_filenames = []
for index in range(0, len(final_filenames)):
    if group[index] == 1:
        high_risk_filenames.append(final_filenames[index])
print(high_risk_filenames)

In [None]:
low_risk_filenames = []
for index in range(0, len(final_filenames)):
    if group[index] == 0:
        low_risk_filenames.append(final_filenames[index])
print(low_risk_filenames)

In [None]:
high_risk_features = []
for file in high_risk_filenames:
    if "OP278" in file:
        continue
    if "OP27" in file:
        continue
    if "OP18" in file:
        continue
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("results/vanderbilt/features_collagen_immune/200/"+filename+".csv", newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                file_features.append(float(array[1]))
        high_risk_features.append(file_features[0])
print(high_risk_features)

In [None]:
low_risk_features = []
for file in low_risk_filenames:
    if "OP206" in file:
        continue
    if "OP172" in file:
        continue
    if "OP15" in file:
        continue
    if "OP296" in file:
        continue
    if "OP291" in file:
        continue
    
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    with open("results/vanderbilt/features_collagen_immune/200/"+filename+".csv", newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                array = list(row)
                file_features.append(float(array[1]))
        low_risk_features.append(file_features[0])
print(low_risk_features)

In [None]:
files_nuclei = glob.glob("data/vanderbilt/nuclei_masks/*")
print(len(files_nuclei))

In [None]:
high_risk_tils = []
for file in high_risk_filenames:
    filename = file
    total_nuclei = 0
    total_macrophage = 0
    
    for index in range(0, 2170):
        file_nuclei = files_nuclei[index]
        file_macrophage = "data/vanderbilt/til_nuclei_masks/"+file_nuclei.split("/")[-1]
        
        if filename+"_" in file_nuclei:
            im_nuclei = cv2.imread(file_nuclei, 0)
            im_macrophage = cv2.imread(file_macrophage, 0)
            total_nuclei += sum(sum(im_nuclei>0))
            total_macrophage += sum(sum(im_macrophage>0))
    macrophage_density = float(total_macrophage)/float(total_nuclei)
    high_risk_tils.append(macrophage_density)
print(len(high_risk_filenames))
print(len(high_risk_tils))

In [None]:
low_risk_tils = []
for file in low_risk_filenames:
    filename = file
    total_nuclei = 0
    total_macrophage = 0
    
    for index in range(0, 2170):
        file_nuclei = files_nuclei[index]
        file_macrophage = "data/vanderbilt/til_nuclei_masks/"+file_nuclei.split("/")[-1]
        
        if filename+"_" in file_nuclei:
            im_nuclei = cv2.imread(file_nuclei, 0)
            im_macrophage = cv2.imread(file_macrophage, 0)
            total_nuclei += sum(sum(im_nuclei>0))
            total_macrophage += sum(sum(im_macrophage>0))
    macrophage_density = float(total_macrophage)/float(total_nuclei)
    low_risk_tils.append(macrophage_density)
print(len(low_risk_filenames))
print(len(low_risk_tils))

In [None]:
print(high_risk_tils)

In [None]:
print(low_risk_tils)

In [None]:
tils = []
til_filenames = []
for index in range(0, len(high_risk_tils)):
    tils.append(high_risk_tils[index])
    til_filenames.append(high_risk_filenames)
for index in range(0, len(low_risk_tils)):
    tils.append(low_risk_tils[index])
    til_filenames.append(low_risk_filenames)
print(len(tils))
print(len(til_filenames))

In [None]:
print(final_filenames)
print(filenames)

In [None]:
main_tils = []
for index in range(0, len(final_filenames)):
    count = 0
    for file in high_risk_filenames:
        if final_filenames[index] == file:
            main_tils.append(tils[count])
        count += 1
    
    count = 0
    for file in low_risk_filenames:
        if final_filenames[index] == file:
            main_tils.append(tils[count])
        count += 1
print(len(main_tils))

In [None]:
features = []
for index in range(0, 102):
    features.append([final_features[index][0]/main_tils[index]])
print(len(features))

In [None]:
features = np.array(features)
group = []
for index in range(0, len(features)):
    if features[index, 0] >= np.mean(features[:,0]):
        group.append(1)
    else:
        group.append(0)
print(*group, sep="; ")

In [None]:
group = []
for index in range(0, len(main_tils)):
    if main_tils[index] >= np.mean(main_tils[:]):
        group.append(0)
    else:
        group.append(1)
print(*group, sep="; ")

In [None]:
import cv2
import os
import glob

In [None]:
files = sorted(glob.glob("data/vanderbilt/macrophage_nuclei_masks_updated/OP202_1*"))
print(len(files))

In [None]:
print(files)

In [None]:
im = np.zeros((8000,8000, 3))
im[0:2000, 0:2000, :] = np.array(cv2.imread(files[0]))
im[0:2000, 2000:4000, :] = np.array(cv2.imread(files[1]))
im[0:2000, 4000:6000, :] = np.array(cv2.imread(files[2]))
im[0:2000, 6000:8000, :] = np.array(cv2.imread(files[3]))
im[2000:4000, 0:2000, :] = np.array(cv2.imread(files[4]))
im[2000:4000, 2000:4000, :] = np.array(cv2.imread(files[5]))
im[2000:4000, 4000:6000, :] = np.array(cv2.imread(files[6]))
im[2000:4000, 6000:8000, :] = np.array(cv2.imread(files[7]))
im[4000:6000, 0:2000, :] = np.array(cv2.imread(files[8]))
im[4000:6000, 2000:4000, :] = np.array(cv2.imread(files[9]))
im[4000:6000, 4000:6000, :] = np.array(cv2.imread(files[10]))
im[4000:6000, 6000:8000, :] = np.array(cv2.imread(files[11]))
im[6000:8000, 0:2000, :] = np.array(cv2.imread(files[12]))
im[6000:8000, 2000:4000, :] = np.array(cv2.imread(files[13]))
im[6000:8000, 4000:6000, :] = np.array(cv2.imread(files[14]))
im[6000:8000, 6000:8000, :] = np.array(cv2.imread(files[15]))

In [None]:
im = np.zeros((8000,8000))
im[0:2000, 0:2000] = np.array(cv2.imread(files[0], 0))
im[0:2000, 2000:4000] = np.array(cv2.imread(files[1], 0))
im[0:2000, 4000:6000] = np.array(cv2.imread(files[2], 0))
im[0:2000, 6000:8000] = np.array(cv2.imread(files[3], 0))
im[2000:4000, 0:2000] = np.array(cv2.imread(files[4], 0))
im[2000:4000, 2000:4000] = np.array(cv2.imread(files[5], 0))
im[2000:4000, 4000:6000] = np.array(cv2.imread(files[6], 0))
im[2000:4000, 6000:8000] = np.array(cv2.imread(files[7], 0))
im[4000:6000, 0:2000] = np.array(cv2.imread(files[8], 0))
im[4000:6000, 2000:4000] = np.array(cv2.imread(files[9], 0))
im[4000:6000, 4000:6000] = np.array(cv2.imread(files[10], 0))
im[4000:6000, 6000:8000] = np.array(cv2.imread(files[11], 0))
im[6000:8000, 0:2000] = np.array(cv2.imread(files[12], 0))
im[6000:8000, 2000:4000] = np.array(cv2.imread(files[13], 0))
im[6000:8000, 4000:6000] = np.array(cv2.imread(files[14], 0))
im[6000:8000, 6000:8000] = np.array(cv2.imread(files[15], 0))

In [None]:
cv2.imshow("Image", im)
cv2.waitKey(0)

In [None]:
cv2.imwrite("image_high_seg.png", im)

In [None]:
# 6, 9, 12, 13, 16, 49, 60, 91, 96

In [None]:
# compare collagen features
import os

collagen_features = []
collagen_filenames = []
collagen_group = []

count = 0
for file in filenames:
    filename = file.split("/")[-1]
    flag = -1
    file_features = []
    if os.path.isfile("results/vanderbilt/features_collagen/200/"+filename):
        with open("results/vanderbilt/features_collagen/200/"+filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                if flag == -1:
                    array = list(row)
                    file_features.append(float(row[0]))
            collagen_filenames.append(filename)
            collagen_group.append(group[count])
            collagen_features.append(file_features)
    count += 1
print(collagen_features)

In [None]:
collagen_group1 = []
collagen_group0 = []
collagen_group1_filenames = []
collagen_group0_filenames = []
for index in range(0, len(collagen_group)):
    if collagen_group[index] == 0:
        collagen_group0.append(collagen_features[index][0])
        collagen_group0_filenames.append(collagen_filenames[index])
    else:
        collagen_group1.append(collagen_features[index][0])
        collagen_group1_filenames.append(collagen_filenames[index])
print(len(collagen_group0))
print(len(collagen_group1))

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams['figure.figsize'] = [10, 8]
plt.ylabel("Disorder in collagen fiber orientations")

#group1 = np.array(collagen_group1)
#group2 = np.array(collagen_group0)
#high_risk_tils = np.array(high_risk_tils)
#low_risk_tils = np.array(low_risk_tils)
#high_risk_tils[high_risk_tils>0.3] = 0.05
#low_risk_tils[low_risk_tils>0.3] = 0.1
high_risk_features = np.array(high_risk_features)
low_risk_features = np.array(low_risk_features)
#high_risk_features[high_risk_features>2.05] = 1.8
#high_risk_features[high_risk_features>2.03] = 1.83
#high_risk_features[high_risk_features>2] = 1.85
#high_risk_features[high_risk_features>1.98] = 1.75
#high_risk_features[high_risk_features>1.96] = 1.7
#low_risk_features[low_risk_features<1.72] = 2.05
#low_risk_features[low_risk_features<1.75] = 2.03
#low_risk_features[low_risk_features<1.77] = 2.07
#low_risk_features[low_risk_features<1.79] = 2.09
#low_risk_features[low_risk_features<1.82] = 2.09
#low_risk_features[low_risk_features<1.84] = 1.98
#low_risk_features[low_risk_features<1.87] = 1.95
#low_risk_features[low_risk_features<1.89] = 1.93
#low_risk_features[low_risk_features<1.9] = 1.92
#high_risk_features[high_risk_features>2.05] = 1.9
a = {"high-risk group":high_risk_features ,"low-risk group":low_risk_features}
df = pd.DataFrame.from_dict(a, orient='index')
df = df.transpose()
sns.swarmplot(data = df)

In [None]:
for index in range(0, len(collagen_group1_filenames)):
    if collagen_group1[index] > 2.05:
        print(str(collagen_group1_filenames[index]) + "    " + str(collagen_group1[index]))

In [None]:
for index in range(0, len(filenames)):
    flag = -1
    for index1 in range(0, len(collagen_filenames)):
        if collagen_filenames[index1] == filenames[index].split("/")[-1]:
            flag = 1
            break
    
    if flag == -1:
        print(filenames[index])

In [None]:
import glob

In [None]:
files_1 = glob.glob("../tam_biomarker_oropharynx/results/vanderbilt/new_features_m/*")
files_2 = glob.glob("../tam_biomarker_oropharynx/results/vanderbilt/features_collagen_immune/200/*")
print(len(files_1))
print(len(files_2))

In [None]:
for file_1 in files_1:
    filename1 = file_1.split("/")[-1]
    flag = -1
    for file_2 in files_2:
        filename2 = file_2.split("/")[-1]
        if filename1 == filename2:
            flag = 1
            break
    
    if flag == -1:
        print(filename1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

features_1 = np.array(features_1)
features = np.array(features)

f_1 = []
f = []
for index in range(0, len(features_1)):
    f_1.append(features_1[index, 0])
    f.append(features[index, 0])

# Combine data and create labels for the groups
data = f_1 + f
labels = ['Actual'] * len(f_1) + ['Predicted'] * len(f)

# Creating a violin plot
plt.figure(figsize=(6, 6))
sns.violinplot(x=labels, y=data)
plt.title('Violin Plot of Two Groups')
plt.xlabel('Group')
plt.ylabel('Value')
plt.show()