In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import cv2
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Show the entries of the competition's input directory.
print(os.listdir("../input/histopathologic-cancer-detection/"))

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the training labels CSV into a DataFrame (later cells read its `id` and `label` columns).
train_labels = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')


In [4]:
# Number of entries in the training image directory.
len(os.listdir('../input/histopathologic-cancer-detection/train/'))

In [5]:
# Plot the class distribution of the full label table.
# The column is passed explicitly as `x`: seaborn deprecated passing the
# plotted variable positionally (the first positional slot is `data`).
sns.countplot(x=train_labels['label'])
# The classes are not too imbalanced, but we can still subsample to reduce the train size.

In [6]:
# Locations of the raw competition images (train and test folders).
train_path, test_path = (
    '../input/histopathologic-cancer-detection/train/',
    '../input/histopathologic-cancer-detection/test/',
)







In [7]:
# Draw a fixed-size, seeded sample of 8000 rows from each class
# (label 0 -> df_0, label 1 -> df_1) to build a balanced, smaller dataset.
df_0, df_1 = (
    train_labels.loc[train_labels['label'] == class_value].sample(8000, random_state=101)
    for class_value in (0, 1)
)

In [83]:
# Stack the two per-class samples into one frame with a fresh 0..n-1 index.
train_labels_fin = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
# Check the balance of the subsample; the column is passed as `x` because
# seaborn deprecated passing the plotted variable positionally.
sns.countplot(x=train_labels_fin['label'])


In [12]:
from sklearn.utils import shuffle

# Shuffle the rows so the two classes are interleaved. The seed is fixed so
# that the row order (and therefore everything derived from it) is
# reproducible across "Restart & Run All".
train_labels_fin = shuffle(train_labels_fin, random_state=101)

In [13]:
# Rows per class in the subsampled label table.
train_labels_fin['label'].value_counts()

In [14]:
# Stratified 80/20 split of the subsampled labels into train and validation
# frames; `y` holds the label column used for stratification.
y = train_labels_fin['label']
df_train, df_val = train_test_split(
    train_labels_fin,
    test_size=.2,
    random_state=101,
    stratify=y,
)

In [15]:
# (rows, columns) of the training split.
df_train.shape

In [16]:
# (rows, columns) of the validation split.
df_val.shape

In [17]:
# Root working directory for the per-class image folders.
base_dir = 'base_dir'
# exist_ok=True keeps the cell idempotent: re-running it no longer raises
# FileExistsError when the directory is already there.
os.makedirs(base_dir, exist_ok=True)


In [18]:
# Directory that will hold the training images, one sub-folder per class.
train_dir = os.path.join(base_dir, 'train_dir')
# makedirs(..., exist_ok=True) makes the cell safe to re-run.
os.makedirs(train_dir, exist_ok=True)

In [19]:
# Directory that will hold the validation images, one sub-folder per class.
val_dir = os.path.join(base_dir, 'val_dir')
# makedirs(..., exist_ok=True) makes the cell safe to re-run.
os.makedirs(val_dir, exist_ok=True)

In [20]:
# Create the two class folders under the training directory. The 'a_'/'b_'
# prefixes are part of the folder names used by later cells.
# exist_ok=True lets the cell be re-run without FileExistsError.
no_tumor_tissue = os.path.join(train_dir, 'a_no_tumor_tissue')
os.makedirs(no_tumor_tissue, exist_ok=True)
has_tumor_tissue = os.path.join(train_dir, 'b_has_tumor_tissue')
os.makedirs(has_tumor_tissue, exist_ok=True)

In [21]:
# Create the same two class folders under the validation directory.
# Note: this rebinds `no_tumor_tissue` / `has_tumor_tissue` to the validation paths.
# exist_ok=True lets the cell be re-run without FileExistsError.
no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
os.makedirs(no_tumor_tissue, exist_ok=True)
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')
os.makedirs(has_tumor_tissue, exist_ok=True)

In [22]:
# Entries currently present under the training directory.
os.listdir('base_dir/train_dir')

In [23]:
# Entries currently present under the validation directory. The original cell
# repeated the train_dir listing, so the validation folders were never checked.
os.listdir('base_dir/val_dir')

In [24]:
# Inspect the index of the subsampled label table.
train_labels_fin.index

In [26]:
# Index the full label table by image id so labels can be looked up with
# `train_labels.loc[image_id, 'label']`. The guard makes the cell idempotent:
# once 'id' has become the index it is no longer a column, and calling
# set_index('id') a second time would raise KeyError.
if 'id' in train_labels.columns:
    train_labels = train_labels.set_index('id')

In [27]:
# Plain Python lists of the image ids belonging to each split.
train_list = df_train['id'].tolist()
val_list = df_val['id'].tolist()


In [28]:
# List of image ids in the validation split.
# NOTE(review): this repeats the `val_list` assignment from the previous cell and looks redundant.
val_list = list(df_val['id'])

In [29]:
# Peek at the first ten validation ids.
val_list[:10]

In [31]:
# Column names of the subsampled label table.
train_labels_fin.columns

In [32]:
import shutil

# Copy every training image into the class sub-folder that matches its label.
# An explicit mapping replaces the two independent `if` statements: with those,
# an unexpected label value silently reused the previous iteration's folder
# (or raised NameError on the first iteration).
label_to_dir = {0: 'a_no_tumor_tissue', 1: 'b_has_tumor_tissue'}

for image in train_list:
    # The id in the csv file does not have the .tif extension, so we add it here.
    fname = image + '.tif'
    target = train_labels.loc[image, 'label']
    if target not in label_to_dir:
        raise ValueError('Unexpected label {!r} for image {!r}'.format(target, image))
    label = label_to_dir[target]

    src = os.path.join(train_path, fname)
    dst = os.path.join(train_dir, label, fname)
    shutil.copyfile(src, dst)

In [33]:
# Copy every validation image into the class sub-folder that matches its label.
# An explicit mapping replaces the two independent `if` statements so that an
# unexpected label value fails loudly instead of reusing a stale folder name.
label_to_dir = {0: 'a_no_tumor_tissue', 1: 'b_has_tumor_tissue'}

for image in val_list:
    # Ids in the csv have no extension; the image files are .tif.
    fname = image + '.tif'
    target = train_labels.loc[image, 'label']
    if target not in label_to_dir:
        raise ValueError('Unexpected label {!r} for image {!r}'.format(target, image))
    label = label_to_dir[target]

    src = os.path.join(train_path, fname)
    dst = os.path.join(val_dir, label, fname)
    shutil.copyfile(src, dst)

In [34]:
# Print how many files ended up in each training class folder.
for class_folder in ('base_dir/train_dir/a_no_tumor_tissue',
                     'base_dir/train_dir/b_has_tumor_tissue'):
    print(len(os.listdir(class_folder)))

In [35]:
# Print how many files ended up in each validation class folder.
for class_folder in ('base_dir/val_dir/a_no_tumor_tissue',
                     'base_dir/val_dir/b_has_tumor_tissue'):
    print(len(os.listdir(class_folder)))

In [36]:
# From here on the generators read from the per-class working folders.
# Note: `train_path` is rebound here; earlier it pointed at the raw input images.
train_path, val_path = 'base_dir/train_dir/', 'base_dir/val_dir/'
# test_path = '../input/histopathologic-cancer-detection/test/'

In [37]:
# Batch sizes and the number of batches needed to cover each split once.
num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10
# np.ceil returns a float; step counts are cast to int because Keras expects
# integer `steps_per_epoch` / `validation_steps`.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [38]:
# Image generator that only rescales pixel values by 1/255, and a
# directory iterator over the training folders yielding 96x96 images with
# one-hot ('categorical') labels.
train_data = ImageDataGenerator(rescale=1/255)
train_generator = train_data.flow_from_directory(
    directory=train_path,
    target_size=(96, 96),
    batch_size=train_batch_size,
    class_mode='categorical',
)

In [39]:
# Directory iterator over the validation folders, built from the same
# rescale-only generator and with the same image size / label encoding.
val_gen = train_data.flow_from_directory(
    directory=val_path,
    target_size=(96, 96),
    batch_size=val_batch_size,
    class_mode='categorical',
)

In [40]:
# Evaluation iterator: reads the validation folder one image at a time and
# without shuffling, so its order matches `test_generator.classes`.
test_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    directory=val_path,
    target_size=(96, 96),
    batch_size=1,
    class_mode='categorical',
    shuffle=False,
)

In [41]:
# Hyper-parameters of the convolutional network.
kernel_size = (3, 3)
pool_size = (2, 2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# Three convolutional stages (32, 64, 128 filters). Each stage is three 3x3
# ReLU convolutions followed by 2x2 max-pooling and dropout. Only the very
# first layer declares the 96x96x3 input shape.
model = Sequential()
for stage_index, stage_filters in enumerate((first_filters, second_filters, third_filters)):
    for conv_index in range(3):
        if stage_index == 0 and conv_index == 0:
            model.add(Conv2D(stage_filters, kernel_size, activation='relu',
                             input_shape=(96, 96, 3)))
        else:
            model.add(Conv2D(stage_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))

# Classifier head: flatten, one 256-unit ReLU layer with dropout, then a
# 2-unit softmax output.
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(2, activation="softmax"))

In [42]:
# Print the layer-by-layer summary. `summary` must be called: the original
# `print(model.summary)` only printed the bound-method repr.
model.summary()

In [43]:
# Compile with Adam at a small learning rate. The generators emit one-hot
# ('categorical') labels and the network ends in a 2-unit softmax, so
# categorical cross-entropy is the matching loss (the original used
# 'binary_crossentropy', which is meant for a single sigmoid output).
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [45]:
# Train the network, keeping the best weights and halving the learning rate
# when validation accuracy plateaus.
filepath = "model.h5"

# The metric compiled as 'accuracy' is reported as 'val_accuracy' on the
# validation set (the same key read from history.history when plotting).
# Monitoring the non-existent 'val_acc' meant the checkpoint was never saved
# and the learning rate was never reduced.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=2,
                              verbose=1, mode='max', min_lr=0.00001)

callbacks_list = [checkpoint, reduce_lr]

# Model.fit accepts generators directly; fit_generator is deprecated.
# Step counts are cast to int because they may have been computed with np.ceil.
history = model.fit(train_generator,
                    steps_per_epoch=int(train_steps),
                    validation_data=val_gen,
                    validation_steps=int(val_steps),
                    epochs=5,
                    verbose=1,
                    callbacks=callbacks_list)

In [48]:
# Names of the values returned by model evaluation (loss first, then metrics).
model.metrics_names

In [52]:
from tensorflow.keras.models import save_model, load_model

In [53]:
# Persist the trained model to an HDF5 file in the working directory.
model.save("cnnfin.h5")

In [56]:
# Reload the saved weights and score the model on the validation images.
model.load_weights('cnnfin.h5')

# Model.evaluate accepts generators directly; evaluate_generator is deprecated.
# test_generator uses batch_size=1, so len(df_val) steps is one full pass.
val_loss, val_acc = model.evaluate(test_generator, steps=len(df_val))

print('val_loss:', val_loss)
print('val_acc:', val_acc)

In [59]:
# Per-epoch training curves recorded by Keras during fitting.
# Note: `val_loss` / `val_acc` are rebound here from scalars to per-epoch lists.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# Loss curves. plt.legend() is required for the `label=` texts to be shown;
# without it the two series could not be told apart.
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Accuracy curves.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [60]:
# Predict class probabilities for every validation image.
# Rewind the iterator first so the prediction order lines up with
# `test_generator.classes`, even if the generator was consumed earlier.
test_generator.reset()
# Model.predict accepts generators directly; predict_generator is deprecated.
predictions = model.predict(test_generator, steps=len(df_val), verbose=1)

In [61]:
# Display the raw prediction array.
predictions

In [62]:
# Shape of the prediction array.
predictions.shape

In [63]:
# Wrap the two prediction columns in a DataFrame with readable names
# (column order follows the 'a_'/'b_' class folder names).
df_predict = pd.DataFrame(predictions, columns=['no_tumor_tissue', 'has_tumor_tissue'])
# `head` must be called: the bare attribute only displayed the bound method.
df_predict.head()

In [65]:
# Ground-truth class indices from the unshuffled generator, and the predicted
# score of the 'has_tumor_tissue' column used as the positive-class score.
y_true, y_pred = test_generator.classes, df_predict['has_tumor_tissue']

In [77]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the positive-class scores.
roc_auc = roc_auc_score(y_true=y_true, y_score=y_pred)
roc_auc

In [68]:
# Class indices reported by the evaluation generator, used as ground truth for the confusion matrix.
test_labels = test_generator.classes

In [69]:
# Confusion matrix of true labels vs. the arg-max predicted class.
predicted_classes = np.argmax(predictions, axis=1)
cc_test = confusion_matrix(y_true=test_labels, y_pred=predicted_classes)
cc_test

In [72]:
from sklearn.metrics import ConfusionMatrixDisplay

# Render the confusion matrix with readable class names on a blue colour map.
labels = ['no_tumor_tissue', 'has_tumor_tissue']
disp = ConfusionMatrixDisplay(
    confusion_matrix=cc_test,
    display_labels=labels,
)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [75]:

from sklearn.metrics import roc_curve

# False-positive rates, true-positive rates and the thresholds of the ROC curve.
fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred)

In [81]:


# Plot the ROC curve against the chance diagonal.
# The AUC shown in the legend is computed from the plotted points instead of
# being hard-coded, so it cannot drift from the actual result.
roc_auc_value = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, 'b', label='AUC = {:.3f}'.format(roc_auc_value))
plt.plot([0, 1], [0, 1], 'r--')
plt.title('receiver operating characteristics')
# xlabel/ylabel are functions and must be called. The original assigned
# strings to them, which replaced the pyplot functions and set no axis labels
# (restart the kernel if that assignment was already executed).
plt.ylabel('True positive')
plt.xlabel('False positive')
plt.legend(loc='lower right')
plt.show()