# Lung Cancer Image Classification

### Overview
In this project, we implement a convolutional neural network to classify a given lung cell image input as one of the following: benign lung tissue, lung squamous cell carcinoma, or lung adenocarcinoma.

### Environment Set-Up

In [156]:
import os
import shutil
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers
from keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
from PIL import Image

%matplotlib inline
SEED = 64

### Data Preparation

In [174]:
# Contains the current data
ORIG_DATA_PATH = "../data/lung_colon_image_set/lung_image_sets"
NEW_DATA_PATH = "lung_model_data"

# Will contain documentation of train, cv, test splits
SPLIT_CSV_PATH = "lung_model_data/lung_model_data.csv"

# Training set details
TRAIN_PROP = 0.8
TRAIN_ID = "train"

# CV set details
CV_PROP = 0.1
CV_ID = "cv"

# Test set details
TEST_PROP = 0.1
TEST_ID = "test"

# Target id's
# aca = 0
# n = 1
# scc = 2
TARGETS = pd.DataFrame({'target_id': [0,1,2], 'target_name': ["aca", "n", "scc"]})

IMG_DIM = (768, 768)

TARGETS

Unnamed: 0,target_id,target_name
0,0,aca
1,1,n
2,2,scc


In [161]:
# Get all image paths
imPaths = []
for root, subdir, files in os.walk(DATA_DIR):
    for file in files:
        imPaths.append((root, os.path.join(root, file), file))
imPaths = imPaths[1:]

random.seed(SEED)
random.shuffle(imPaths)

print(f"There are a total of {len(imPaths)} images, and the file names look like:")
[print(i) for i in imPaths[:5]]

There are a total of 15000 images, and the file names look like:
('../data/lung_colon_image_set/lung_image_sets/lung_aca', '../data/lung_colon_image_set/lung_image_sets/lung_aca/lungaca3865.jpeg', 'lungaca3865.jpeg')
('../data/lung_colon_image_set/lung_image_sets/lung_scc', '../data/lung_colon_image_set/lung_image_sets/lung_scc/lungscc789.jpeg', 'lungscc789.jpeg')
('../data/lung_colon_image_set/lung_image_sets/lung_aca', '../data/lung_colon_image_set/lung_image_sets/lung_aca/lungaca3278.jpeg', 'lungaca3278.jpeg')
('../data/lung_colon_image_set/lung_image_sets/lung_aca', '../data/lung_colon_image_set/lung_image_sets/lung_aca/lungaca1120.jpeg', 'lungaca1120.jpeg')
('../data/lung_colon_image_set/lung_image_sets/lung_n', '../data/lung_colon_image_set/lung_image_sets/lung_n/lungn858.jpeg', 'lungn858.jpeg')


[None, None, None, None, None]

In [162]:
# Split paths into train, CV, test
trainSplitInd = int(TRAIN_PROP*len(imPaths))
cvSplitInd = int((TRAIN_PROP+CV_PROP)*len(imPaths))

trainPaths = imPaths[:trainSplitInd]
cvPaths = imPaths[trainSplitInd:cvSplitInd]
testPaths = imPaths[cvSplitInd:]

print(f"There are a total of {len(trainPaths)} training images, {len(cvPaths)} CV images, and {len(testPaths)} testing images.")
print("These image sets are completely non-intersecting partitions of the full data set.")
print(f"Intersection of training & CV data: {list(set(trainPaths) & set(cvPaths))}")
print(f"Intersection of training & testing data: {list(set(trainPaths) & set(testPaths))}")
print(f"Intersection of testing & CV data: {list(set(testPaths) & set(cvPaths))}")


There are a total of 12000 training images, 1500 CV images, and 1500 testing images.
These image sets are completely non-intersecting partitions of the full data set.
Intersection of training & CV data: []
Intersection of training & testing data: []
Intersection of testing & CV data: []


In [177]:
# Create pd df of image paths, with target and dataset id's, for documentation
ids = [TRAIN_ID]*len(trainPaths) + [CV_ID]*len(cvPaths) + [TEST_ID]*len(testPaths)
df = pd.DataFrame({'dataset_id': ids, 'image_path': imPaths})
df[['root_path', 'image_path', 'file_name']] = pd.DataFrame(df['image_path'].to_list(), index=df.index)
df["target_id"] = df["root_path"].astype("category").cat.codes
df = df.merge(TARGETS, on = 'target_id', how='left')
df

Unnamed: 0,dataset_id,image_path,root_path,file_name,target_id,target_name
0,train,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungaca3865.jpeg,0,aca
1,train,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungscc789.jpeg,2,scc
2,train,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungaca3278.jpeg,0,aca
3,train,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungaca1120.jpeg,0,aca
4,train,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungn858.jpeg,1,n
...,...,...,...,...,...,...
14995,test,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungn119.jpeg,1,n
14996,test,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungscc555.jpeg,2,scc
14997,test,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungscc1843.jpeg,2,scc
14998,test,../data/lung_colon_image_set/lung_image_sets/l...,../data/lung_colon_image_set/lung_image_sets/l...,lungaca4778.jpeg,0,aca


In [178]:
if not os.path.exists(NEW_DATA_PATH):
    os.makedirs(NEW_DATA_PATH)

df.to_csv(SPLIT_CSV_PATH)

### ImageDataGenerator Implementation

In [179]:
idg = ImageDataGenerator(rescale = 1/255.0)

train_df = df.loc[lambda df: df['dataset_id']==TRAIN_ID]
train_idg = idg.flow_from_dataframe(
    dataframe = train_df,
    directory = ".",
    x_col = "image_path",
    y_col = "target_name",
    target_size = IMG_DIM,
    shuffle = False
)

cv_df = df.loc[lambda df: df['dataset_id']==CV_ID]
cv_idg = idg.flow_from_dataframe(
    dataframe = cv_df,
    directory = ".",
    x_col = "image_path",
    y_col = "target_name",
    target_size = IMG_DIM,
    shuffle = False
)

test_df = df.loc[lambda df: df['dataset_id']==TEST_ID]
test_idg = idg.flow_from_dataframe(
    dataframe = test_df,
    directory = ".",
    x_col = "image_path",
    y_col = "target_name",
    target_size = IMG_DIM,
    class_mode = None,
    shuffle = False
)

Found 12000 validated image filenames belonging to 3 classes.
Found 1500 validated image filenames belonging to 3 classes.
Found 1500 validated image filenames.
