In [14]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.layers import Conv2D, BatchNormalization, GlobalAveragePooling2D, Flatten, Dropout, Dense
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

In [15]:
TRAIN_IMAGE_PATH = "jpeg/train/"
TEST_IMAGE_PATH = "jpeg/test/"
IMG_HEIGHT = 500
IMG_WIDTH = 500
BATCH_SIZE = 64
AUTO = tf.data.experimental.AUTOTUNE
REG = 0.0005
EPOCHS=20
METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

In [16]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [17]:
train_df["image_name"] = train_df["image_name"].apply(lambda x: TRAIN_IMAGE_PATH + x + ".jpg")
test_df["image_name"] = test_df["image_name"].apply(lambda x: TEST_IMAGE_PATH + x + ".jpg")

In [18]:
# train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=45, shuffle=True)

In [19]:
# train_df.shape, val_df.shape

In [20]:
def decode_image(filename, label=None, image_size=(IMG_WIDTH, IMG_HEIGHT)):
    bits = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32)/255.0
    image = tf.image.resize(image, size=image_size)
    
    if label is None:
        return image
    else:
        return image, label

In [21]:
def data_augment(image, label=None):
    image = tf.image.random_flip_left_right(image)
    if label is None:
        return image
    else:
        return image, label

In [22]:
def compute_class_weights(labels):
    total_labels = labels.shape[0]
    
    positive_labels = K.sum(labels, axis=0)/total_labels
    negative_labels = 1 - positive_labels
    return {0:positive_labels, 1:negative_labels}

In [23]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
image_names = train_df["image_name"].values
train_df["image_name"] = lb.fit_transform(train_df["image_name"].values)
train_df["target"] = train_df["target"].astype("int")
train_df.head()
map_name_no = dict(zip(train_df["image_name"], image_names))
y_train = train_df["target"]
x_train = train_df[["image_name"]]


over = SMOTE(random_state=45, sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
ppl = Pipeline(steps=steps)

x_train, y_train = ppl.fit_resample(x_train, y_train)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=45)
x_train["image_name"] = x_train["image_name"].apply(lambda x: map_name_no[x])
x_val["image_name"] = x_val["image_name"].apply(lambda x: map_name_no[x])

In [24]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((15619, 1), (15619,), (3905, 1), (3905,))

In [25]:
sum(y_train), sum(y_val)

(5153, 1355)

In [None]:
train_dataset = (tf.data.Dataset
                 .from_tensor_slices((x_train["image_name"].values, y_train))
                 .map(decode_image, num_parallel_calls=AUTO)
                 .map(data_augment, num_parallel_calls=AUTO)
                 .repeat()
                 .shuffle(512)
                 .batch(BATCH_SIZE)
                 .prefetch(AUTO)
                )

In [None]:
val_dataset = (tf.data.Dataset
               .from_tensor_slices((x_val["image_name"].values, y_val))
               .map(decode_image, num_parallel_calls=AUTO)
               .map(data_augment, num_parallel_calls=AUTO)
               .repeat()
               .shuffle(512)
               .batch(BATCH_SIZE)
               .prefetch(AUTO))

In [None]:
test_dataset = (tf.data.Dataset.from_tensor_slices((test_df.image_name))
                .map(decode_image, num_parallel_calls=AUTO)
                .batch(BATCH_SIZE))

In [None]:
lr = ReduceLROnPlateau(
    monitor="val_loss",
    patience=3,
    min_lr=0.000001,
    factor=0.5,
    verbose=1
)

In [None]:
es = EarlyStopping(monitor="val_loss", patience=10)

In [None]:
model_d201 = DenseNet201(include_top=False, weights="imagenet", input_shape=(IMG_WIDTH, IMG_HEIGHT,3))
model_d201.trainable=False

model = Conv2D(filters=32, kernel_size=(3,3), data_format="channels_last", activation="relu", kernel_regularizer=l2(REG))(model_d201.output)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)
model = Conv2D(filters=32, kernel_size=(3,3), activation="relu", kernel_regularizer=l2(REG))(model)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)
model = GlobalAveragePooling2D()(model)
model = Dropout(0.25)(model)

model = Conv2D(filters=64, kernel_size=(3,3), data_format="channels_last", activation="relu", kernel_regularizer=l2(REG))(model_d201.output)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)
model = Conv2D(filters=64, kernel_size=(3,3), activation="relu", kernel_regularizer=l2(REG))(model)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)
model = GlobalAveragePooling2D()(model)
model = Dropout(0.25)(model)


model = Flatten()(model)
model = Dense(256, activation="relu")(model)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)
model = Dropout(0.5)(model)
model = Dense(64, activation="relu")(model)
model = BatchNormalization(axis=-1, center=True, scale=False)(model)

output = Dense(1, activation="sigmoid")(model)
model_d201 = Model(inputs=model_d201.input, outputs=output)
model_d201.summary()
model_d201.compile(optimizer="adam", loss="binary_crossentropy", metrics=METRICS)

In [None]:
compute_class_weights(y_train)

In [None]:
x_val.shape

In [None]:
history = model_d201.fit(train_dataset, epochs=EPOCHS, callbacks=[lr, es],
                        steps_per_epoch=x_train.shape[0]//BATCH_SIZE, validation_data=val_dataset,
                        validation_steps=x_val.shape[0]//BATCH_SIZE,
                        class_weights = compute_class_weights())

In [None]:
probs = model_d201.predict(test_dataset, verbose = 1)

In [None]:
sam = pd.read_csv("sample_submission.csv")

In [None]:
sam.head()


In [None]:
sam["target"] = probs

In [None]:
sam

In [None]:
sam.to_csv("model_d201_smote_oversampling_undersampling.csv", index=False)