In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import keras
import matplotlib.pyplot as plt
from keras.layers import Dense,GlobalAveragePooling2D
from keras.applications import MobileNet
from keras.preprocessing import image
from keras.applications.mobilenet import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.optimizers import Adam
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from sklearn.model_selection import train_test_split
from glob import glob 
from sklearn.utils import shuffle
import shutil

import os
# Show what the input directory contains.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['train', 'test', 'train_labels.csv', 'sample_submission.csv']


In [None]:
# Tunable settings for the run, gathered in one cell.

# Data settings
IMAGE_SIZE = 96
TRAINING_RATIO = 0.9

# Batch sizes
BATCH_SIZE = 192
TESTING_BATCH_SIZE = 5000

# Training loop settings
EPOCHS = 13
VERBOSITY = 1

In [None]:
# Build a table with one row per training tile: file path, image id and label.
base_tile_dir = '../input/train/'
df = pd.DataFrame({'path': glob(os.path.join(base_tile_dir, '*.tif'))})

# The id is the file name without its extension. Deriving it with os.path
# (rather than a fixed index into path.split('/')) keeps this correct if the
# directory depth or the path separator changes.
df['id'] = df['path'].map(lambda p: os.path.splitext(os.path.basename(p))[0])

labels = pd.read_csv("../input/train_labels.csv")
# Default (inner) merge: only tiles that have a row in the labels file are kept.
df_data = df.merge(labels, on="id")

# Tiles excluded from training.
excluded_ids = [
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',  # caused a training error previously
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',  # the image is black
]
df_data = df_data[~df_data['id'].isin(excluded_ids)]
df_data.head(3)

In [None]:
SAMPLE_SIZE = 80000  # number of examples drawn from EACH class

# Draw the same number of random rows from both classes so the data is balanced.
df_0 = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state=101)
df_1 = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state=101)

# Combine the two samples and shuffle the rows. The shuffle is seeded: without
# it the row order changes on every run, so the seeded split below would still
# select different images each time.
df_data = shuffle(pd.concat([df_0, df_1], axis=0).reset_index(drop=True),
                  random_state=101)

# Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions identical in both parts, giving a balanced validation set.
y = df_data['label']
df_train, df_val = train_test_split(df_data, test_size=0.20, random_state=101, stratify=y)

num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 128
val_batch_size = 128

# Number of batches needed to see every sample once. np.ceil returns a float,
# so cast to int to get a proper step count.
# NOTE(review): the generators created further down use batch_size=32, not 128
# -- confirm these step counts match the batch size actually used.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

# Create directories
train_path = 'base_dir/train'
valid_path = 'base_dir/valid'
test_path = '../input/test'

# One sub-folder per class label ("0" / "1") under each split.
# exist_ok=True lets this cell be re-run without failing on existing folders.
for fold in [train_path, valid_path]:
    for subf in ["0", "1"]:
        os.makedirs(os.path.join(fold, subf), exist_ok=True)

In [None]:
# Index the table by image id so a label can be looked up with df_data.loc[id].
# Guarded so that re-running this cell does not raise a KeyError once 'id' has
# already been moved from the columns into the index.
if df_data.index.name != 'id':
    df_data = df_data.set_index('id')
df_data.head()

In [None]:
# Peek at the test folder. Only a count and a small sample are printed:
# dumping every file name floods the notebook output.
test_dir_entries = os.listdir("../input/test")
print(len(test_dir_entries), "files, e.g.", test_dir_entries[:5])

In [None]:
# Copy every training tile into train_path/<label>/ so the files are laid out
# in one folder per class.
# The label is read from df_train itself, so this cell does not depend on
# df_data having been re-indexed by id. The loop variable is deliberately not
# called `image`, which would overwrite the `image` module imported from
# keras.preprocessing.
for image_id, image_label in zip(df_train['id'].values, df_train['label'].values):
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image_id + '.tif'
    label = str(image_label)  # class folder name: "0" or "1"
    src = os.path.join('../input/train', fname)
    dst = os.path.join(train_path, label, fname)
    shutil.copyfile(src, dst)



In [None]:
# Copy every validation tile into valid_path/<label>/ so the files are laid out
# in one folder per class.
# The label is read from df_val itself, so this cell does not depend on
# df_data having been re-indexed by id. The loop variable is deliberately not
# called `image`, which would overwrite the `image` module imported from
# keras.preprocessing.
for image_id, image_label in zip(df_val['id'].values, df_val['label'].values):
    # the id in the csv file has no extension, so add .tif to get the file name
    fname = image_id + '.tif'
    label = str(image_label)  # class folder name: "0" or "1"
    src = os.path.join('../input/train', fname)
    dst = os.path.join(valid_path, label, fname)
    shutil.copyfile(src, dst)

In [None]:
# Image generator that runs every image through MobileNet's preprocess_input.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
)

In [None]:
# Stream shuffled, labelled batches from the class sub-folders of train_path.
train_generator = train_datagen.flow_from_directory(
    train_path,  # root folder holding one sub-folder per class
    shuffle=True,
    class_mode='binary',
    batch_size=32,
    color_mode='rgb',
    target_size=(128, 128),
)

In [None]:
# Stream labelled validation batches from the class sub-folders of valid_path.
val_gen = train_datagen.flow_from_directory(
    valid_path,
    class_mode='binary',
    batch_size=32,
    target_size=(128, 128),
)

In [None]:
# Names of all test files, and a staging folder with a single "pred"
# sub-folder to receive them.
to_copy = os.listdir("../input/test")
test_gen_path = "base_dir/test"
# exist_ok=True lets this cell be re-run without failing on an existing folder.
os.makedirs(os.path.join(test_gen_path, "pred"), exist_ok=True)

In [None]:
# Copy every test file into the "pred" staging sub-folder.
# The loop variable is deliberately not called `image`, which would overwrite
# the `image` module imported from keras.preprocessing.
for test_fname in to_copy:
    src = os.path.join('../input/test', test_fname)
    dst = os.path.join(test_gen_path, "pred", test_fname)
    shutil.copyfile(src, dst)

In [None]:
# Stream the staged test images: no labels (class_mode=None) and no shuffling.
test_gen = train_datagen.flow_from_directory(
    test_gen_path,
    shuffle=False,
    class_mode=None,
    batch_size=128,
    target_size=(128, 128),
)


In [None]:
# Number of batches needed to cover the whole test set once. Taken from the
# generator itself (its length is the batch count) instead of a hard-coded
# file count, so it stays correct if the test set or batch size changes and
# is an int rather than a float.
test_steps = len(test_gen)
test_steps

In [None]:
# MobileNet pre-trained on ImageNet, without its original classification head.
base_model = MobileNet(weights='imagenet', include_top=False,
                       input_shape=(128, 128, 3))

# New classification head on top of the MobileNet features.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)  # dense layer 1
x = Dense(1024, activation='relu')(x)  # dense layer 2
x = Dense(512, activation='relu')(x)   # dense layer 3
# Single-unit output for the binary label. This must be sigmoid: softmax
# normalises across the units of the layer, so with one unit it always
# outputs 1.0 and the model cannot learn.
preds = Dense(1, activation='sigmoid')(x)

In [None]:
# Assemble the network from the MobileNet input through to the prediction layer.
model = Model(
    inputs=base_model.input,
    outputs=preds,
)

In [None]:
# Freeze the first 20 layers of the model; every later layer stays trainable.
for position, layer in enumerate(model.layers):
    layer.trainable = position >= 20


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Train the model.
# The step counts are taken from the generators themselves (their length is
# the number of batches per pass) rather than from step counts computed for a
# different batch size, so that one epoch covers the whole training and
# validation sets exactly once.
# EPOCHS and VERBOSITY come from the configuration cell.
model.fit_generator(train_generator,
                    steps_per_epoch=len(train_generator),
                    validation_data=val_gen,
                    validation_steps=len(val_gen),
                    epochs=EPOCHS,
                    verbose=VERBOSITY)

In [None]:
# Run the model over the test generator for test_steps batches.
prediction = model.predict_generator(
    test_gen,
    steps=test_steps,
)

In [None]:
# Number of predictions that were produced.
n_predictions = len(prediction)
n_predictions

In [None]:
submission = pd.read_csv("../input/sample_submission.csv")

# Predictions come back in the generator's file order (test_gen.filenames),
# which is not necessarily the row order of the sample submission, so each
# score is matched to its row by image id instead of by position.
test_ids = [os.path.splitext(os.path.basename(f))[0] for f in test_gen.filenames]
scores = np.asarray(prediction).ravel()
assert len(scores) >= len(test_ids), "fewer predictions than test images"
pred_by_id = pd.Series(scores[:len(test_ids)], index=test_ids)

# Assign the 'label' COLUMN. (submission.loc['label'] would address a row
# labelled 'label', not the column.)
# NOTE(review): assumes the sample submission has an 'id' column holding the
# file name without extension -- confirm against the CSV.
submission['label'] = submission['id'].map(pred_by_id)
submission.head()

In [None]:
# Delete the folders that were filled with copied images.
for temp_dir in (train_path, valid_path, test_gen_path):
    shutil.rmtree(temp_dir)

In [None]:
# Write the submission file, with a header row and without the row index.
submission.to_csv(
    "submission.csv",
    header=True,
    index=False,
)