# Introduction
This project aims to build an algorithm that detects lung cancer from CT scans. When cancer is detected, the algorithm also predicts the type of cancer and, if applicable, the size of the tumor.

In [11]:
# Import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import pandas as pd
from tcia_utils import nbia
import os
import sys
import itk
import pydicom as dicom
from pathlib import Path
import time
from zipfile import ZipFile 

# Data importing
In this chapter, the data is imported. I've selected a subset of the Lung-PET-CT-Dx collection, available at this link: https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=70224216#7022421621c64ff049c44f03bb442ec5eb88bdf2


In [2]:
# Set start time so the cost of this (slow) loading step can be reported below
start_time = time.time()

# Get the data that's locally stored
basePath = Path("tciaDownload/manifest-1608669183333/Lung-PET-CT-Dx")

# Get the DICOM files present in the path set before.
# rglob() yields files in filesystem order, which can differ between machines;
# sorting keeps the row order of everything derived below reproducible.
pathFiles = sorted(basePath.rglob('*.dcm'))
if not pathFiles:
    # Report the empty result here instead of letting a later cell fail on an
    # empty list; the usual cause is a missing download folder or a different
    # working directory.
    print(f"No DICOM files found under {basePath.resolve()}")

# Read every DICOM file once; all lists below are index-aligned with pathFiles
images = [dicom.dcmread(x) for x in pathFiles]

# One flat pixel vector per image (flatten() already returns a new 1-D copy,
# so no extra np.array() copy is needed)
pixels = [image.pixel_array.flatten() for image in images]

# Per-image metadata taken from the DICOM header
age = [image.PatientAge for image in images]
sex = [image.PatientSex for image in images]
patient_id = [image.PatientID for image in images]
# The character at index 8 of the patient ID is used as the cancer-type label,
# e.g. "Lung_Dx-A0002" -> "A"
cancer_type = [image.PatientID[8] for image in images]
sop_instance_uid = [image.SOPInstanceUID for image in images]

# Print time execution
print(f"Execution time: {time.time() - start_time} seconds")

Execution time: 160.8373317718506 seconds


In [3]:
# Assemble the per-image lists into one table, one column per list
colNames = ["PatientId", "SOPInstanceId", "ImagePixels", "PatientAge", "PatientSex", "CancerType"]
data = [patient_id, sop_instance_uid, pixels, age, sex, cancer_type]

df = pd.DataFrame()

# Headers and lists are matched by position: the i-th name labels the i-th list
for position, header in enumerate(colNames):
    df[header] = data[position]

# Show the first rows as a quick sanity check of the result
display(df.head())

Unnamed: 0,PatientId,SOPInstanceId,ImagePixels,PatientAge,PatientSex,CancerType
0,Lung_Dx-A0002,1.3.6.1.4.1.14519.5.2.1.6655.2359.295499053390...,"[44, 0, 10, 37, 0, 36, 70, 22, 0, 35, 0, 49, 0...",053Y,F,A
1,Lung_Dx-A0002,1.3.6.1.4.1.14519.5.2.1.6655.2359.314562946465...,"[50, 0, 22, 26, 42, 25, 6, 51, 35, 7, 13, 33, ...",053Y,F,A
2,Lung_Dx-A0002,1.3.6.1.4.1.14519.5.2.1.6655.2359.142392682681...,"[16, 3, 0, 89, 20, 10, 1, 22, 12, 61, 10, 25, ...",053Y,F,A
3,Lung_Dx-A0002,1.3.6.1.4.1.14519.5.2.1.6655.2359.117012811165...,"[58, 0, 25, 71, 40, 48, 0, 0, 32, 49, 4, 46, 1...",053Y,F,A
4,Lung_Dx-A0002,1.3.6.1.4.1.14519.5.2.1.6655.2359.208988389922...,"[13, 0, 27, 53, 18, 41, 26, 0, 37, 30, 68, 0, ...",053Y,F,A


In [10]:
# Preview the fragment of the first patient's ID that is meant to be used for
# retrieving the annotations.
# NOTE(review): the slice stops one character before the end, so an ID such as
# "Lung_Dx-A0002" gives "A000" — confirm whether the full trailing code
# (slice [-5:]) was intended.
patient_id[0][slice(-5, -1)]

'A000'

In [16]:
# Extract the normal lungs images from the zip file
# NOTE(review): this is an absolute path on one specific machine; the notebook
# only runs elsewhere after adjusting it — consider a path relative to the project.
normalLungs = "C:/Users/ardejong/Documents/final-project-elu/final-project-elu/normal_ct_scans.zip"

# Open the zip file in read mode. The handle is deliberately not named `zip`:
# that would rebind the builtin zip() in the notebook namespace and break any
# cell that calls zip(...) when it is run after this one.
with ZipFile(normalLungs, 'r') as normal_lungs_zip:
    # Extract all the files into the current working directory
    normal_lungs_zip.extractall()

# Data cleaning
In this chapter, the data is cleaned. This mostly involves feature engineering: restructuring the images so that they are ready to be used for training the model.

In [12]:
# Show the smallest PatientAge value. The ages are strings such as "053Y",
# so min() compares them as text rather than as numbers.
print(min(df["PatientAge"]))

000Y


# Model building, training and testing
In this chapter, the model will be built and trained, and then evaluated on the test data.


# Results
In this chapter, the results will be shown. For this, we'll look into the following:
- The True Negative Rate for the detection of lung cancer
- The Mean Absolute Error (MAE) for the size of the tumor
- The accuracy for the type of cancer, and the corresponding confidence score

1.2.840.10008.5.1.4.1.1.2
