# Classify 2-D Lung Tumor Middle Slices Based on Persistence Images
This will pick up where '2D_complex_generator.ipynb' left off by representing persistent homology with persistence images (with a range of parameters) and running machine learning classifiers. 

In [1]:
import numpy as np
import matplotlib.pylab as plt
import math
import os
import gudhi as gd
import pandas as pd
import PersistenceImages.persistence_images as pimg

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

## Read in Data and Combine Datasets

In [2]:
#Imaging Data
rad_phom_0s = np.load('./Radiomics_Homology/rad_phom_middle_0s.npy', allow_pickle = True)
rad_phom_1s = np.load('./Radiomics_Homology/rad_phom_middle_1s.npy', allow_pickle = True)

radg_phom_0s = np.load('./Radiogenomics_Homology/radg_phom_middle_0s.npy', allow_pickle = True)
radg_phom_1s = np.load('./Radiogenomics_Homology/radg_phom_middle_1s.npy', allow_pickle = True)

In [3]:
#Clinical Data
#Radiomics clinical data
rad_clinical = pd.read_csv("rad_clinic.csv")
rad_clinical = rad_clinical.drop(rad_clinical.index[127]) #Tumor 128 has no segmentation

rad_histology = rad_clinical.Histology

#Radiogenomics clinical data
radg_clinical = pd.read_csv("radg_clinic.csv", skiprows = range(1,50))
radg_clinical = radg_clinical[0:146]
radg_clinical = radg_clinical.drop(radg_clinical.index[[8, 142]]) #9 and 143 have no segmentation

radg_clinical["Histology"] = radg_clinical['Histology'].str.lower()
radg_histology = radg_clinical.Histology

In [18]:
#Combine Datasets

#Identify indices we want to keep
rad_adeno = rad_histology == 'adenocarcinoma'
rad_squamous = rad_histology == 'squamous cell carcinoma'
radg_adeno = radg_histology == 'adenocarcinoma'
radg_squamous = radg_histology == 'squamous cell carcinoma'

#Select clinical data
rad_histology_adsq = np.array(rad_histology[rad_adeno | rad_squamous])
radg_histology_adsq = np.array(radg_histology[radg_adeno | radg_squamous])

#Select persistent homology data
rad_phom_0s_adsq = rad_phom_0s[rad_adeno | rad_squamous]
rad_phom_1s_adsq = rad_phom_1s[rad_adeno | rad_squamous]

radg_phom_0s_adsq = radg_phom_0s[radg_adeno | radg_squamous]
radg_phom_1s_adsq = radg_phom_1s[radg_adeno | radg_squamous]


histology_adsq = np.array(list(radg_histology_adsq) + list(rad_histology_adsq))
phom_0s_adsq = np.array(list(radg_phom_0s_adsq) + list(rad_phom_0s_adsq))
phom_1s_adsq = np.array(list(radg_phom_1s_adsq) + list(rad_phom_1s_adsq))



In [13]:
len(phom_0s_adsq)

344

## Define Functions to Streamline Parameter Search

In [19]:
def CreateImager(pixel_size, sigma):
    pers_imager = pimg.PersistenceImager()
    pers_imager.pixel_size = pixel_size
    pers_imager.birth_range = (0,1)
    pers_imager.pers_range = (0,1)
    pers_imager.kernel_params['sigma'][0] = [sigma, 0]
    pers_imager.kernel_params['sigma'][1] = [0, sigma]
    return(pers_imager)
    
    
def HomologyToImageVector(phom_0, phom_1, imager):
    
    pers_img_0 = imager.transform(phom_0, skew=True)
    pers_img_1 = imager.transform(phom_1, skew=True)
    
    pers_img_0_resized = np.resize(pers_img_0, (1, len(pers_img_0)**2))
    pers_img_1_resized = np.resize(pers_img_1, (1, len(pers_img_1)**2))
        
    return(pers_img_0_resized, pers_img_1_resized)


def AllPhomsToImages(phom_0s, phom_1s, imager):
    concatenated_images = []
    for i in range(len(phom_0s)):
        
        pimg_0, pimg_1 = HomologyToImageVector(phom_0s[i], phom_1s[i], imager)
        imgs = np.concatenate((pimg_0[0], pimg_1[0]), axis=0)
        concatenated_images.append(imgs)
        
    concatenated_images = np.array(concatenated_images)
    return(concatenated_images)

def ClassifyImages(images, histology):
    #First we need to shuffle this dataset since otherwise k-fold will magnify batch effects.
    images, histology = shuffle(images, histology, random_state = 10)

    scores = []
    y_pred = []

    clf_logreg = LogisticRegression(penalty = 'l1', solver='liblinear')

    cv = KFold(n_splits=5, shuffle=False)
    for train_index, test_index in cv.split(images):

        X_train, X_test = images[train_index], images[test_index]
        y_train, y_test = histology[train_index], histology[test_index]
        clf_logreg.fit(X_train, y_train)
        y_pred.append(list(clf_logreg.predict(X_test)))
        scores.append(clf_logreg.score(X_test, y_test))
    
    return(np.mean(scores))

def ParameterSearchInstance(pixel_size, sigma, phom_0s, phom_1s, histology):
    imager = CreateImager(pixel_size, sigma)
    images = AllPhomsToImages(phom_0s, phom_1s, imager)
    acc = ClassifyImages(images, histology)
    return(acc)

## Do Grid Search of Persistence Imager Parameters

In [20]:
imager = CreateImager(0.1, 0.05)
images = AllPhomsToImages(phom_0s_adsq, phom_1s_adsq, imager)
acc = ClassifyImages(images, histology_adsq)
print(acc)

0.6395993179880648


In [21]:
pixel_sizes = [0.05, 0.1, 0.2, 0.5]
sigmas = [0.01, 0.005, 0.01, 0.05, 0.1]
accs = np.zeros((len(pixel_sizes), len(sigmas)))

for i in range(len(pixel_sizes)):
    for j in range(len(sigmas)):
        accs[i,j] = ParameterSearchInstance(pixel_sizes[i], sigmas[j], phom_0s_adsq, phom_1s_adsq, histology_adsq)

In [22]:
print(accs)
for line in accs:
    print()
    print(*line)

[[0.64249787 0.63964194 0.64249787 0.58734015 0.51453538]
 [0.6658994  0.64556692 0.6658994  0.63959932 0.61641091]
 [0.65144928 0.65720375 0.65144928 0.6658994  0.63964194]
 [0.65703325 0.64254049 0.65703325 0.66875533 0.66875533]]

0.6424978687127024 0.6396419437340153 0.6424978687127024 0.5873401534526853 0.514535379369139

0.6658994032395567 0.6455669224211424 0.6658994032395567 0.6395993179880648 0.6164109121909633

0.6514492753623189 0.6572037510656437 0.6514492753623189 0.6658994032395567 0.6396419437340153

0.6570332480818415 0.642540494458653 0.6570332480818415 0.6687553282182439 0.6687553282182439
