In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

import glob
import os
import sys

from tqdm.notebook import tqdm
from datetime import datetime

import Cross_Eval

In [3]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer, RobustScaler, MinMaxScaler, StandardScaler, LabelBinarizer
from sklearn.preprocessing import normalize, robust_scale, minmax_scale

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

from sklearn.kernel_approximation import RBFSampler, Nystroem

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, GroupKFold, KFold

from sklearn.metrics import make_scorer, confusion_matrix, roc_auc_score, roc_curve, plot_confusion_matrix, f1_score, recall_score, accuracy_score

from sklearn.multiclass import OneVsRestClassifier

from mlxtend.evaluate import scoring

# Preparing Metadata/Labels

In [4]:
def parse_spectrum_paths(paths):
    """Derive the TMA number and core label from extracted-spectrum file paths.

    The file name (text after the last "/") is sliced at fixed positions:
    character 4 becomes the TMA number and characters 8 up to the last 7
    characters become the core label. This presumably matches names of the form
    ``TMA_<n>.<m>_<core>.pickle`` (7 trailing characters = ".pickle") — confirm
    against the actual files.

    Parameters
    ----------
    paths : pandas.Series of str
        File paths using "/" as the separator.

    Returns
    -------
    pandas.DataFrame
        Columns "TMA" (float) and "Core" (str), indexed like ``paths``.

    Raises
    ------
    ValueError
        If the character at position 4 of a file name is not numeric.
    """
    file_names = paths.str.rsplit("/", n=1).str[-1]
    return pd.DataFrame(
        {
            "TMA": file_names.str[4:5].astype(float),
            "Core": file_names.str[8:-7],
        },
        index=paths.index,
    )


# One row per extracted-spectra pickle found on disk.
all_files = pd.DataFrame(
    glob.glob("/mnt/b/Git_Projects/Biospec_Analysis/Extracted_Spectra/*.pickle"),
    columns=["hdf_loc"],
)
# Vectorised string parsing instead of a row-wise apply: same values, and the
# frame still gets well-typed (empty) TMA/Core columns when no file matches.
all_files = all_files.join(parse_spectrum_paths(all_files["hdf_loc"]))

In [5]:
# Read the "reformatted" sheet of the scoring-grid workbook and inner-join it to
# the spectra files on (TMA, core label), so only cores that have an extracted
# pickle remain.
metadata = pd.merge(
    pd.read_excel(
        "/mnt/b/Google_Drive/PhD/Project with Janet/Scoring grids Advancing Front (TMAs 1-3).xlsx",
        sheet_name="reformatted",
    ),
    all_files,
    left_on=["TMA", "TMA site"],
    right_on=["TMA", "Core"],
)
# Quick sanity check on how many cores survived the join.
print(metadata.shape)

(79, 18)


In [6]:
# Load the patient-level metadata workbook (read_excel reads the first sheet by default).
new_metadata = pd.read_excel("/mnt/b/Google_Drive/PhD/Project with Janet/new_metadata.xlsx")

In [7]:
# Collapse the free-text OUTCOME spellings into two binary labels.
# Any spelling not listed in a mapping becomes NaN.

# Died of disease vs. every other outcome (alive or died of another cause).
new_metadata["DODvsAlive"] = new_metadata["OUTCOME"].map({
    "DOD": "DOD",
    **dict.fromkeys(["alive", "died other", "died", "Died", "Died other"], "Other"),
})

# Died (any cause) vs. alive.
new_metadata["DiedvsAlive"] = new_metadata["OUTCOME"].map({
    "alive": "Alive",
    **dict.fromkeys(["DOD", "died other", "died", "Died", "Died other"], "Died"),
})

# Normalise the ECS flag to upper-case "Y"/"N".
new_metadata["ECS"] = new_metadata["ECS"].map(
    {raw: raw.upper() for raw in ("y", "Y", "N", "n")}
)

In [8]:
# Time from diagnosis to death; NaT where no date of death is recorded.
new_metadata["lifespan"] = new_metadata["Date of death"].sub(new_metadata["date of diag"])

# First-pass flags: death within 730 / 1825 days of diagnosis.
# A missing lifespan compares as False. Both columns are reassigned in the next cell.
new_metadata["2year"] = new_metadata["lifespan"].dt.days.lt(730)
new_metadata["5year"] = new_metadata["lifespan"].dt.days.lt(1825)

In [9]:
# Survival flags measured from the date of diagnosis:
#   2year - died less than 730 days after diagnosis
#   5year - died more than 730 and less than 1825 days after diagnosis
#   Alive - recorded as alive, with last follow-up more than 730 and less than
#           1825 days after diagnosis
# Missing dates give NaN day counts, which compare as False in every test below.
#
# The OUTCOME column spells the status "alive" (lower case), so the previous
# exact comparison with 'Alive' never matched: "Alive" was always False and the
# "not alive" guard excluded nobody. Compare case-insensitively instead.
# NOTE(review): a patient at exactly 730 days falls into neither the 2year nor
# the 5year bucket (both bounds are strict) — confirm this is intended.
outcome_is_alive = (
    new_metadata["OUTCOME"].astype(str).str.strip().str.lower().eq("alive")
)
days_to_death = (
    pd.to_datetime(new_metadata["Date of death"])
    - pd.to_datetime(new_metadata["date of diag"])
).dt.days
days_followed = (
    pd.to_datetime(new_metadata["Last seen/reviewed"])
    - pd.to_datetime(new_metadata["date of diag"])
).dt.days

new_metadata["2year"] = days_to_death.lt(730) & ~outcome_is_alive
new_metadata["5year"] = days_to_death.gt(730) & days_to_death.lt(1825) & ~outcome_is_alive

new_metadata["Alive"] = days_followed.gt(730) & days_followed.lt(1825) & outcome_is_alive

In [10]:
# Preview the first rows to check the derived outcome and survival columns.
new_metadata.head()

Unnamed: 0,Patient_Number,gender,date of diag,age @ diag,site,pT,pN,stage,ECS,OUTCOME,...,site of recurrence,Last seen/reviewed,Last updated,OUTCOME.updated by,DODvsAlive,DiedvsAlive,lifespan,2year,5year,Alive
0,3143,m,2004-03-29,58,floor of mouth,4,0,4,N,DOD,...,local,NaT,2020-05-11,FGD,DOD,Died,3341 days,False,False,False
1,3167,m,2004-09-30,74,floor of mouth,2,2,4,Y,DOD,...,regional only,NaT,2012-10-04,CBarry,DOD,Died,390 days,True,False,False
2,3169,m,2004-10-28,63,floor of mouth,3,1,3,N,died other,...,,NaT,2006-12-01,RJS,Other,Died,652 days,True,False,False
3,3170,m,2004-11-22,59,other,4,2b,4a,Y,Died other,...,,NaT,2020-04-01,FGD,Other,Died,3690 days,False,False,False
4,3230,m,2006-01-26,74,tongue,2,1,3,Y,DOD,...,local and regional,NaT,2011-05-01,AS,DOD,Died,161 days,True,False,False


In [11]:
# Number of metadata rows per value of the `site` column.
new_metadata.site.value_counts()

tongue            38
floor of mouth    34
other             28
oropharynx         7
Name: site, dtype: int64

Import ASMA data

In [12]:
# Read the ASMA sheet ("Sheet3") and attach it to the patient metadata.
# The default inner join keeps only patients whose Patient_Number has a
# matching Case ID in the ASMA sheet.
asma = pd.read_excel(
    "/mnt/b/Google_Drive/PhD/Project with Janet/ASMA.xlsx",
    sheet_name="Sheet3",
)
new_metadata = pd.merge(
    new_metadata,
    asma,
    left_on="Patient_Number",
    right_on="Case ID",
)

In [13]:
# NOTE(review): star import hides which names come from this module. It
# presumably provides `truncate`, used when the spectra are loaded below —
# confirm and import the needed names explicitly.
from Preprocessing_Methods import *

# `sys` is already imported in the first import cell; repeating it is harmless.
import sys

# Add the UNET project folder to the module search path (presumably for
# imports made in later cells — verify).
sys.path.append("/mnt/b/Git_Projects/Biospec_Analysis/UNET")

In [14]:
# Seed for the per-core shuffle, fixed so that Restart & Run All reproduces the
# same row order (the shuffle was previously unseeded).
SHUFFLE_SEED = 0

# Tissue classes to keep from each core.
select = ['Tumour']

# Build one frame of spectra keyed by "<TMA>-<Core>":
#   1. load the core's pickle (pd.read_pickle can execute arbitrary code —
#      only load files produced by this project),
#   2. keep the rows whose Tissue is in `select`,
#   3. shuffle the rows reproducibly,
#   4. pass the result through `truncate` with start=1000, end=1800
#      (presumably a wavenumber window; `truncate` is defined outside this
#      notebook — confirm).
# NOTE(review): two metadata rows with the same TMA and Core would share a dict
# key, and the later one would silently replace the earlier.
total = pd.concat(
    {
        "{}-{}".format(row["TMA"], row["Core"]): truncate(
            pd.read_pickle(row["hdf_loc"])
            .query(f"Tissue in {select}")
            .sample(frac=1, random_state=SHUFFLE_SEED),
            start=1000,
            end=1800,
        )
        for _, row in tqdm(list(metadata.iterrows()))
    },
    names=["ID"],
)

# Remember the spectral columns so they can be told apart from metadata
# columns after later merges.
wn_cols = total.columns

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [15]:
# Turn the index levels back into ordinary columns, then inner-join the spectra
# to the patient metadata on the patient number.
total = pd.merge(
    total.reset_index(),
    new_metadata,
    how="inner",
    left_on="Patient nu ",
    right_on="Patient_Number",
)
# Move every non-spectral column into the index so only spectra remain as
# columns, then drop any remaining column that contains a missing value.
total = (
    total
    .set_index(list(total.columns.difference(wn_cols)))
    .dropna(axis="columns")
)

In [16]:
# Display the merged spectra table (pandas truncates the middle rows and columns).
# NOTE(review): the rendered output contains patient-level clinical data —
# consider showing `total.head()` and clearing the output before sharing.
total

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Unnamed: 44_level_0,Unnamed: 45_level_0,Unnamed: 46_level_0,998.0,1002.0,1006.0,1010.0,1014.0,1018.0,1022.0,1025.0,1029.0,1033.0,...,1762.0,1766.0,1770.0,1774.0,1778.0,1781.0,1785.0,1789.0,1793.0,1797.0
2year,5year,ASMA,Alive,Annotation_loc,Case ID,Core_x,Core_y,Corrected Site AJCC/UICC,DODvsAlive,Date of death,Date of recurrence,Diagnosis Age,DiedvsAlive,ECS_x,ECS_y,ID,Last seen/reviewed,Last updated,N stage,OUTCOME,OUTCOME.updated by,Overall Death,Pathogical Stage,Patient nu,Patient_Number,Recurrence?,T stage,TMA,TMA site,Tissue,advancing front type,age @ diag,date of diag,diff,envi_loc,gender_x,gender_y,lifespan,pN,pT,radiotherapy,recurrence,site,site of recurrence,stage,survival (months),Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
True,False,H,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_1/TMA_1.4_G13.png,3540,G13,G13,other,DOD,2010-03-20,2010-03-02,85.0,Died,yes,Y,1.0-G13,NaT,2020-04-21,2b,DOD,PG,yes,pT4N2bM0,3540,3540,yes,4,1.0,G13,Tumour,non-cohesive,85,2009-10-19,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_1TMA_1.4_G13.HDR,m,m,152 days,2b,4,,yes,other,regional,4a,5.0,0.030717,0.031150,0.032310,0.034103,0.036437,0.039218,0.042154,0.045804,0.049053,0.051707,...,0.030167,0.029776,0.029695,0.029826,0.029923,0.030044,0.030002,0.030099,0.030304,0.030482
True,False,H,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_1/TMA_1.4_G13.png,3540,G13,G13,other,DOD,2010-03-20,2010-03-02,85.0,Died,yes,Y,1.0-G13,NaT,2020-04-21,2b,DOD,PG,yes,pT4N2bM0,3540,3540,yes,4,1.0,G13,Tumour,non-cohesive,85,2009-10-19,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_1TMA_1.4_G13.HDR,m,m,152 days,2b,4,,yes,other,regional,4a,5.0,0.019120,0.024715,0.029592,0.033841,0.037552,0.040817,0.043630,0.046248,0.048567,0.050986,...,0.023235,0.023483,0.024011,0.024740,0.025341,0.025941,0.026410,0.026807,0.027176,0.027542
True,False,H,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_1/TMA_1.4_G13.png,3540,G13,G13,other,DOD,2010-03-20,2010-03-02,85.0,Died,yes,Y,1.0-G13,NaT,2020-04-21,2b,DOD,PG,yes,pT4N2bM0,3540,3540,yes,4,1.0,G13,Tumour,non-cohesive,85,2009-10-19,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_1TMA_1.4_G13.HDR,m,m,152 days,2b,4,,yes,other,regional,4a,5.0,0.019243,0.023217,0.026947,0.030478,0.033855,0.037123,0.040092,0.043323,0.046664,0.050020,...,0.027886,0.027818,0.028136,0.028604,0.028965,0.029484,0.029916,0.030331,0.030834,0.031373
True,False,H,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_1/TMA_1.4_G13.png,3540,G13,G13,other,DOD,2010-03-20,2010-03-02,85.0,Died,yes,Y,1.0-G13,NaT,2020-04-21,2b,DOD,PG,yes,pT4N2bM0,3540,3540,yes,4,1.0,G13,Tumour,non-cohesive,85,2009-10-19,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_1TMA_1.4_G13.HDR,m,m,152 days,2b,4,,yes,other,regional,4a,5.0,0.028798,0.030537,0.032928,0.035788,0.038938,0.042196,0.045017,0.047700,0.049951,0.051928,...,0.033213,0.032901,0.032809,0.032952,0.033092,0.033316,0.033528,0.033878,0.034287,0.034534
True,False,H,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_1/TMA_1.4_G13.png,3540,G13,G13,other,DOD,2010-03-20,2010-03-02,85.0,Died,yes,Y,1.0-G13,NaT,2020-04-21,2b,DOD,PG,yes,pT4N2bM0,3540,3540,yes,4,1.0,G13,Tumour,non-cohesive,85,2009-10-19,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_1TMA_1.4_G13.HDR,m,m,152 days,2b,4,,yes,other,regional,4a,5.0,0.033685,0.036324,0.038799,0.041140,0.043375,0.045536,0.047726,0.050060,0.052016,0.053440,...,0.040492,0.040346,0.040402,0.040558,0.040681,0.040880,0.041095,0.041312,0.041503,0.041736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
False,False,,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_2/TMA_1.1_D4.png,3530,D4,D4,oropharynx,Other,NaT,NaT,63.0,Alive,yes,Y,2.0-D4,2019-03-13,2020-04-01,2b,alive,FGD,yes,pT2N2bMx,3530,3530,no,2,2.0,D4,Tumour,non-cohesive,63,2009-08-06,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_2TMA_1.1_D4.HDR,f,f,NaT,2b,2,yes,,oropharynx,,,10.0,0.013783,0.018662,0.022850,0.026541,0.029925,0.033197,0.036164,0.040011,0.044302,0.048983,...,0.012905,0.012581,0.012519,0.012758,0.013049,0.013636,0.014150,0.014570,0.014829,0.015120
False,False,,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_2/TMA_1.1_D4.png,3530,D4,D4,oropharynx,Other,NaT,NaT,63.0,Alive,yes,Y,2.0-D4,2019-03-13,2020-04-01,2b,alive,FGD,yes,pT2N2bMx,3530,3530,no,2,2.0,D4,Tumour,non-cohesive,63,2009-08-06,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_2TMA_1.1_D4.HDR,f,f,NaT,2b,2,yes,,oropharynx,,,10.0,0.032516,0.034940,0.037238,0.039438,0.041570,0.043665,0.045318,0.047245,0.049725,0.052774,...,0.034916,0.034472,0.034242,0.034169,0.034125,0.034101,0.033884,0.033665,0.033440,0.033198
False,False,,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_2/TMA_1.1_D4.png,3530,D4,D4,oropharynx,Other,NaT,NaT,63.0,Alive,yes,Y,2.0-D4,2019-03-13,2020-04-01,2b,alive,FGD,yes,pT2N2bMx,3530,3530,no,2,2.0,D4,Tumour,non-cohesive,63,2009-08-06,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_2TMA_1.1_D4.HDR,f,f,NaT,2b,2,yes,,oropharynx,,,10.0,0.039226,0.039476,0.040452,0.041995,0.043950,0.046159,0.048065,0.050138,0.052016,0.053951,...,0.052911,0.052412,0.052008,0.051648,0.051355,0.051121,0.050797,0.050329,0.049714,0.049097
False,False,,False,/mnt/b/Google_Drive/PhD/Project with Janet/Label_Images/TMA_2/TMA_1.1_D4.png,3530,D4,D4,oropharynx,Other,NaT,NaT,63.0,Alive,yes,Y,2.0-D4,2019-03-13,2020-04-01,2b,alive,FGD,yes,pT2N2bMx,3530,3530,no,2,2.0,D4,Tumour,non-cohesive,63,2009-08-06,mod,/mnt/b/Google_Drive/PhD/Project with Janet/Kohler_Corrected/TMA_2TMA_1.1_D4.HDR,f,f,NaT,2b,2,yes,,oropharynx,,,10.0,0.031837,0.032311,0.033885,0.036341,0.039464,0.043036,0.046453,0.050348,0.053771,0.056806,...,0.032873,0.032626,0.032497,0.032364,0.032222,0.032262,0.032281,0.032295,0.032313,0.032450
