Prototype demo:
1. Uses a simple CNN with dropout
2. Built with TensorFlow + Keras
3. Current accuracy score of 0.8985 on the test data (real-world)


In [None]:
#imports

from numpy.random import seed
seed(101)
from tensorflow import set_random_seed
set_random_seed(101)
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import os
import cv2
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from time import gmtime, strftime
# Show the current UTC time, formatted as YYYY-MM-DD HH:MM:SS, as this cell's output.
strftime("%Y-%m-%d %H:%M:%S", gmtime())

In [None]:

# Side length (pixels) and channel count of the images fed to the network.
image_size, channels = 96, 3
# Number of rows drawn from each label class when balancing the training data.
sample_size = 50000


In [None]:
# Load the training labels table; later cells read its 'id' and 'label' columns.
df_train = pd.read_csv('../input/train_labels.csv')
# (rows, columns) of the full, unbalanced label table
print(df_train.shape)

In [None]:
# Row count per label value before any balancing is applied.
df_train['label'].value_counts()

We want an equal number of samples in each class (0 and 1) to prevent class imbalance.

In [None]:
# Draw `sample_size` rows from each class so both labels are equally represented.
df_0 = df_train[df_train['label'] == 0].sample(sample_size, random_state = 101)
df_1 = df_train[df_train['label'] == 1].sample(sample_size, random_state = 101)
df_train = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
# The concatenated frame has all 0-labels followed by all 1-labels, so shuffle it
# before it is split into train and validation sets. The seed is passed explicitly
# so the order is the same every time this cell runs, not only on a fresh kernel.
df_train = shuffle(df_train, random_state=101)

# Both classes should now report `sample_size` rows.
df_train['label'].value_counts()

In [None]:
# Hold out 10% of the balanced data for validation. Stratifying on the label
# keeps the 0/1 ratio the same in both parts.
y = df_train['label']
df_train_train, df_train_val = train_test_split(
    df_train,
    test_size=0.10,
    stratify=y,
    random_state=101,
)

for split_frame in (df_train_train, df_train_val):
    print(split_frame.shape)

Now the training set (`df_train_train`) has 90,000 rows and the validation set (`df_train_val`) has 10,000 rows, with an equal distribution of 0 and 1 labels in both sets.

In [None]:
# Row count per label in the training part of the split.
df_train_train['label'].value_counts()

In [None]:
# Row count per label in the validation part of the split.
df_train_val['label'].value_counts()

Creating a new directory of subdirs to easily feed data into models via generators

In [None]:
# On-disk layout read by the directory-based generators:
#
#   base_dir/
#       train_dir/
#           val0/   images with label 0 (no tumor)
#           val1/   images with label 1 (has tumor)
#       val_dir/
#           val0/   images with label 0 (no tumor)
#           val1/   images with label 1 (has tumor)
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

# makedirs(..., exist_ok=True) creates missing parents and does nothing when the
# folder already exists, so this cell can be re-run without FileExistsError.
for split_dir in (train_dir, val_dir):
    for class_name in ('val0', 'val1'):
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

# Kept for compatibility: the original cell left these two names pointing at the
# validation class folders.
val0 = os.path.join(val_dir, 'val0')
val1 = os.path.join(val_dir, 'val1')



In [None]:
# Index df_train by image id so a label can be looked up with
# df_train.loc[image_id, 'label'].
# The guard makes the cell re-runnable: once 'id' is the index it is no longer a
# column, and calling set_index('id') again would raise KeyError.
if 'id' in df_train.columns:
    df_train = df_train.set_index('id')

# The split frames were created before the index change, so they still carry
# 'id' as an ordinary column.
train_list = df_train_train['id'].tolist()
val_list = df_train_val['id'].tolist()
print(train_list[0:5])

Copy the image files into the class subdirectories created above, using each image id to look up its label.

In [None]:


def _copy_labelled_images(image_ids, dest_root, src_root='../input/train'):
    """Copy each image into the class subfolder of `dest_root` that matches its label.

    Parameters
    ----------
    image_ids : iterable of str
        Image ids (file names without the '.tif' extension).
    dest_root : str
        Folder that contains the 'val0' and 'val1' class subfolders.
    src_root : str
        Folder holding the original '.tif' files.

    Raises
    ------
    ValueError
        If an image has a label other than 0 or 1. The previous code would have
        reused the folder chosen for the preceding image in that case.
    """
    class_subdir = {0: 'val0', 1: 'val1'}
    # df_train is indexed by image id at this point; take the column once
    # instead of on every iteration.
    label_by_id = df_train['label']

    for image in image_ids:
        # .tif is the image format
        fname = image + '.tif'
        target = label_by_id.loc[image]
        if target not in class_subdir:
            raise ValueError('unexpected label %r for image %r' % (target, image))

        src = os.path.join(src_root, fname)
        dst = os.path.join(dest_root, class_subdir[target], fname)
        shutil.copyfile(src, dst)


_copy_labelled_images(train_list, train_dir)
_copy_labelled_images(val_list, val_dir)
    


   

In [None]:
# Number of training images copied into each class folder (val0 first, then val1).
for class_folder in ('base_dir/train_dir/val0', 'base_dir/train_dir/val1'):
    print(len(os.listdir(class_folder)))


In [None]:
# Number of validation images copied into each class folder (val0 first, then val1).
for class_folder in ('base_dir/val_dir/val0', 'base_dir/val_dir/val1'):
    print(len(os.listdir(class_folder)))


feed subdirs into generators

In [None]:
test_path = '../input/test'

train_size = len(df_train_train)
val_size = len(df_train_val)
train_size_batch = 10
val_size_batch = 10

# Batches needed to see every sample once per epoch. np.ceil returns a float,
# but step counts are whole numbers, so cast to int before handing them to Keras.
train_steps = int(np.ceil(train_size / train_size_batch))
val_steps = int(np.ceil(val_size / val_size_batch))
print(train_steps)
print(val_steps)

In [None]:
#adapted from https://keras.io/preprocessing/image/
# Every generator rescales pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1.0/255)

# Settings shared by all three directory generators.
_flow_options = dict(target_size=(image_size, image_size),
                     class_mode='categorical')

# Training batches.
train_gen = datagen.flow_from_directory(train_dir,
                                        batch_size=train_size_batch,
                                        **_flow_options)

# Validation batches used while fitting.
val_gen = datagen.flow_from_directory(val_dir,
                                      batch_size=val_size_batch,
                                      **_flow_options)

# Second pass over the validation folder: one image per batch and no shuffling,
# used later for evaluation and prediction.
test_gen = datagen.flow_from_directory(val_dir,
                                       batch_size=1,
                                       shuffle=False,
                                       **_flow_options)

model

In [None]:
kernel_size = (3,3)
pool_size= (2,2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3


model = Sequential()

# Three convolutional stages. Each stage is three 3x3 ReLU convolutions with the
# same filter count, followed by max pooling and dropout.
# Only the first layer of the network declares the input shape. It is built from
# the configuration constants rather than a hard-coded (96, 96, 3), so the model
# stays in step with the size the generators resize images to.
first_layer_options = {'input_shape': (image_size, image_size, channels)}
for stage_filters in (first_filters, second_filters, third_filters):
    for _ in range(3):
        model.add(Conv2D(stage_filters, kernel_size, activation = 'relu',
                         **first_layer_options))
        first_layer_options = {}
    model.add(MaxPooling2D(pool_size = pool_size))
    model.add(Dropout(dropout_conv))

#fully connected layer
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(dropout_dense))
# Two softmax outputs, one per class folder (val0, val1).
model.add(Dense(2, activation = "softmax"))

model.summary()

training

In [None]:
# The network ends in a 2-unit softmax and the generators yield one-hot
# ('categorical') labels, so categorical cross-entropy is the matching loss.
# It also means 'accuracy' is resolved as categorical accuracy rather than a
# per-output binary accuracy.
model.compile(Adam(lr=0.0001), loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
filepath = "model.h5"

# Save to `filepath` only when val_acc beats the best value seen so far
# (save_best_only=True), so the file always holds the best epoch.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate when val_acc has not improved for 2 epochs,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    mode='max',
    factor=0.5,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

callbacks = [checkpoint, reduce_lr]

history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    validation_data=val_gen,
    validation_steps=val_steps,
    epochs=20,
    verbose=1,
    callbacks=callbacks,
)

validation


In [None]:
# Restore the checkpoint written for the best validation epoch.
model.load_weights('model.h5')

# test_gen walks the validation folder one image at a time without shuffling.
# Rewind it before each pass so the pass starts at the first file.
test_gen.reset()
val_loss, val_acc = \
model.evaluate_generator(test_gen,
                        steps=len(df_train_val))

print('val_loss:', val_loss)
print('val_acc:', val_acc)

# Rewind again: without this, prediction continues from wherever the evaluation
# pass left the iterator, and the rows may not line up with test_gen.filenames.
test_gen.reset()
predictions = model.predict_generator(test_gen, steps=len(df_train_val), verbose=1)
#the validation set can be used to create a confusion matrix for presentation purposes#KIV

testing

In [None]:
# Remove the train/validation image copies. The existence check lets this cell
# be re-run after the folder has already been removed.
if os.path.isdir('base_dir'):
    shutil.rmtree('base_dir')

# Layout for the test generator: test_dir/test_images/<image files>.
# The images sit one level below the folder handed to flow_from_directory.
test_dir = 'test_dir'
test_images = os.path.join(test_dir, 'test_images')
# exist_ok=True makes this a no-op when the folders are already there.
os.makedirs(test_images, exist_ok=True)

In [None]:
# Copy every file from the test input folder into test_dir/test_images.
# The originals are left in place.
test_list = os.listdir('../input/test')

for fname in test_list:
    shutil.copyfile(os.path.join('../input/test', fname),
                    os.path.join(test_images, fname))

In [None]:
test_path ='test_dir'

# One image per batch and no shuffling, so prediction rows follow the order of
# test_gen.filenames. The test images have no labels; only the predictions are
# used downstream.
test_gen = datagen.flow_from_directory(
    test_path,
    target_size=(image_size,image_size),
    batch_size=1,
    class_mode='binary',
    shuffle=False,
)

In [None]:
# Take the image count from the generator rather than hard-coding 57458, so the
# number of prediction rows always equals the number of file names attached later.
num_test_images = len(test_gen.filenames)

# make sure we are using the best epoch
model.load_weights('model.h5')

# Start from the first file so predictions line up with test_gen.filenames.
test_gen.reset()
predictions = model.predict_generator(test_gen, steps=num_test_images, verbose=1)


In [None]:
# Wrap the two softmax outputs in a DataFrame for the submission file.
# NOTE(review): the column order assumes output 0 corresponds to class folder
# 'val0' (no tumor) and output 1 to 'val1' (has tumor) - confirm with
# train_gen.class_indices.
prediction_columns = ['no_tumor_tissue', 'has_tumor_tissue']
df_preds = pd.DataFrame(predictions, columns=prediction_columns)



In [None]:

# File names in the generator's order; the test generator was created with
# shuffle=False, so row i of the predictions is expected to belong to file i.
test_filenames = test_gen.filenames
df_preds['file_names'] = test_filenames

# Preview the first rows.
df_preds.head()

In [None]:

#just want id, not .tif
def remove_id(x):
    """Return the image id contained in a generator file name.

    Parameters
    ----------
    x : str
        A relative file name such as 'test_images/<id>.tif'.

    Returns
    -------
    str
        The part of the file name before its first '.', with any leading
        directory components removed.
    """
    # The previous version took element [1] of x.split('/'), which required
    # exactly one '/' separator. Treat '\\' like '/' and keep only the last path
    # component, so bare file names and Windows-style paths work too.
    file_name = x.replace('\\', '/').rsplit('/', 1)[-1]
    removed_id = file_name.split('.')[0]

    return removed_id

# Derive the 'id' column by applying remove_id to every generator file name.
df_preds['id'] = df_preds['file_names'].apply(remove_id)



In [None]:

# The submitted value is the score in the 'has_tumor_tissue' column.
y_pred = df_preds['has_tumor_tissue']

# Image ids in the same row order as y_pred.
image_id = df_preds['id']

submission to kaggle

In [None]:
# Two-column submission table: 'id' becomes the index, 'label' holds the score.
submission_columns = {'id': image_id, 'label': y_pred}
submission = pd.DataFrame(submission_columns).set_index('id')

# The index ('id') is written as the first CSV column, followed by 'label'.
submission.to_csv('results.csv', columns=['label'])

In [None]:
# Delete the temporary copy of the test images now that predictions are saved.
shutil.rmtree('test_dir')

In [None]:
# gmtime/strftime are already imported in the first code cell; repeated here.
from time import gmtime, strftime
# Show the current UTC time, formatted as YYYY-MM-DD HH:MM:SS, as this cell's output.
strftime("%Y-%m-%d %H:%M:%S", gmtime())