In [None]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
from keras import layers
from keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
import glob

## Preprocessing
Visual inspection of the dataset revealed some duplicate images. Hash each image file to find byte-identical duplicates, then create a deduplicated dataset by keeping only the first image of each duplicate group and copying it to a new directory.

In [None]:
# Collect every .jpeg under the training folder (recursively).
# The result is sorted because glob returns paths in filesystem order, which is not
# guaranteed to be stable, and the dedup step keeps the *first* path of each duplicate group.
image_paths = sorted(glob.glob('../data/train_data/**/*.jpeg', recursive = True))
print('Number of images found:', len(image_paths))

In [None]:
import hashlib

# Map the SHA-256 digest of each file's raw bytes -> list of paths with that exact content.
# Only byte-identical files share a digest; re-encoded or resized copies are not detected.
hash_img_map = {}
for image_path in image_paths:
    with open(image_path, "rb") as f:
        img_hash = hashlib.sha256(f.read()).hexdigest()
    # setdefault + append grows the group in place instead of rebuilding the list each time
    hash_img_map.setdefault(img_hash, []).append(image_path)

print('Unique images:', len(hash_img_map))
# Show only the groups that actually contain duplicates rather than dumping every file.
{digest: paths for digest, paths in hash_img_map.items() if len(paths) > 1}

In [None]:
# Check samples of duplicated images (display up to 3 images)
from PIL import Image

check_hash = '7cd910ccf43da503a9dc10a12bdd699f5ef7601aedf8534006e54a6efe01d41d'
# The digest above belongs to one particular copy of the dataset. If it is not present,
# fall back to the first group that has duplicates so the cell does not raise KeyError.
if check_hash not in hash_img_map:
    check_hash = next(
        (digest for digest, paths in hash_img_map.items() if len(paths) > 1),
        None,
    )

sample_paths = hash_img_map.get(check_hash, [])[:3]
if not sample_paths:
    print('No duplicate images to display')
for image in sample_paths:
    with Image.open(image) as im:
        display(im)


In [None]:
# Look for duplicate groups whose files live under more than one class folder.
# Each path is reduced to its first four '/'-separated components (assumed to be the
# class directory for paths of the form ../data/train_data/<class>/...).
import os

for img_hash, dup_paths in hash_img_map.items():
    class_dirs = {'/'.join(path.split('/')[:4]) for path in dup_paths}
    if len(class_dirs) <= 1:
        continue
    print('----------')
    print(dup_paths)
    print(class_dirs)

# No duplicates found between different classes

In [None]:
# dedup (keep the first file of each duplicate group) and copy it to a new folder
import os
import shutil

for img_paths in hash_img_map.values():
    src = img_paths[0]
    # Replace only the first 'train_data' occurrence (the dataset root) so that a class
    # folder or file name containing the same text is left untouched.
    dst = src.replace('train_data', 'train_data_dedup', 1)
    # Create the class sub-folder in the destination tree before copying.
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copyfile(src, dst)



## Train model

In [None]:
import keras

# Model input shape; the first two entries are used as the image resize target.
img_shape = (256, 256, 3)

# subset="both" returns the training and validation splits from a single call;
# labels are inferred from the sub-folder names and one-hot encoded ('categorical').
train_ds, val_ds = keras.utils.image_dataset_from_directory(
    directory='../data/train_data_dedup',
    labels='inferred',
    label_mode='categorical',
    image_size=img_shape[:2],
    batch_size=8,
    shuffle=True,
    seed=235,
    validation_split=0.2,
    subset="both",
)

class_names = train_ds.class_names
num_classes = len(class_names)
# Index of the catch-all 'others' class within the inferred class list
others_class_id = class_names.index('others')

In [None]:
# Display the images of the first training batch with their labels
grid_cols = 4
for image, label in train_ds.take(1):
    batch_len = image.shape[0]
    # Size the grid from the actual batch instead of assuming exactly 8 images,
    # so changing batch_size does not break the subplot layout.
    grid_rows = -(-batch_len // grid_cols)  # ceiling division
    for i in range(batch_len):
        plt.subplot(grid_rows, grid_cols, i+1)
        plt.imshow(image[i].numpy().astype("uint8"))
        # Labels are one-hot vectors; argmax recovers the class index
        plt.title("{}".format(train_ds.class_names[np.argmax(label[i])]))
        plt.axis("off")

In [None]:
# check classes distributions
import plotly.express as px

# Flatten the one-hot label batches into one class-name entry per training image
train_labels = [
    train_ds.class_names[np.argmax(one_hot)]
    for _, label_batch in train_ds
    for one_hot in label_batch
]

# Histogram of class counts, with bars ordered alphabetically by class name
fig = px.histogram(train_labels, text_auto=True)
fig = fig.update_xaxes(categoryorder='category ascending')
fig.update_layout(showlegend=False, xaxis_title="classes", title="Distribution of train dataset")

In [None]:
# Flatten the one-hot label batches into one class-name entry per validation image
val_labels = [
    val_ds.class_names[np.argmax(one_hot)]
    for _, label_batch in val_ds
    for one_hot in label_batch
]

# Histogram of class counts, with bars ordered alphabetically by class name
fig = px.histogram(val_labels, text_auto=True)
fig = fig.update_xaxes(categoryorder='category ascending')
fig.update_layout(showlegend=False, xaxis_title="classes", title="Distribution of val dataset")

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# cache()/prefetch() return new dataset objects that do not carry the `class_names`
# attribute set by image_dataset_from_directory. Later cells still read
# `val_ds.class_names`, so keep the list and re-attach it to the wrapped datasets.
class_names = train_ds.class_names

train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

train_ds.class_names = class_names
val_ds.class_names = class_names

In [None]:
from tensorflow.keras.metrics import Recall

# EfficientNetB0 with randomly initialised weights (weights=None) and a
# classification head sized to the number of classes in the dataset.
model = EfficientNetB0(
    include_top=True,
    weights=None,
    classes=num_classes,
    input_shape=img_shape,
)
# Track recall for the 'others' class using the index looked up from the dataset's
# class names (others_class_id) instead of a hard-coded position.
model.compile(
    optimizer="adam",
    loss="CategoricalFocalCrossentropy",
    metrics=["recall", Recall(class_id=others_class_id, name='recall_others'), "auc"],
)

model.summary()

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available
import tensorboard
%load_ext tensorboard

In [None]:
# Launch TensorBoard on the `logs` directory, bound to localhost port 6006
%tensorboard --logdir logs --host localhost --port 6006

In [None]:
from datetime import datetime

# Stop once val_loss has gone 5 epochs without improving, then restore the best weights
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='auto',
    patience=5, start_from_epoch=0,
    restore_best_weights=True,
    verbose=1,
)

# One TensorBoard log directory per run, named after the time this cell was executed
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = f"logs/{run_stamp}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

In [None]:
# Upper bound on training length; the early-stopping callback may end the run sooner
epochs = 40
training_callbacks = [early_stopping_callback, tensorboard_callback]
hist = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=training_callbacks,
)

In [None]:
# Print the architecture of the model attached to the training history
trained_model = hist.model
trained_model.summary()


In [None]:
import os

# Create the target folder first so a missing directory cannot make the save fail
# after a long training run.
model_path = '../model/efficientnet_othersfocused.keras'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [None]:
# Reload the model file written by the save cell so that the evaluation below
# runs against the model trained in this notebook.
# NOTE(review): the original loaded '../model/efficientnetb0.keras'; restore that path
# if evaluating a separately trained model was intentional.
model = keras.models.load_model('../model/efficientnet_othersfocused.keras')

In [None]:
# Score every validation batch; the result is reduced with argmax over axis 1 further down
predictions = model.predict(x=val_ds)

In [None]:
# Human-readable class names, indexed by predicted class id when decoding model output
labels = [
    "Facebook, Inc.",
    "Microsoft",
    "Microsoft OneDrive",
    "Orange",
    "Spotify",
    "Steam",
    "UPS",
    "Vodafone",
    "Wells Fargo & Company",
    "others",
]

In [None]:
# Decode predictions: highest-scoring class index per row -> class name
pred_classes = np.argmax(predictions, axis=1)
pred_labels = [labels[pred_class] for pred_class in pred_classes]

# Decode the true one-hot labels with the same `labels` list as the predictions.
# `val_ds` was re-bound to a cached/prefetched dataset earlier, and that wrapper does
# not expose `class_names`, so the list is used instead of `val_ds.class_names`.
val_labels = [
    labels[np.argmax(class_ids)]
    for _, batch_class_ids in val_ds
    for class_ids in batch_class_ids
]

# Predictions and ground truth must line up one-to-one for the reports below
assert len(val_labels) == len(pred_labels), "prediction / label count mismatch"

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Text report of per-class metrics on the validation set; zero_division=0 makes
# undefined scores come out as 0.
sr = classification_report(
    y_true=val_labels,
    y_pred=pred_labels,
    zero_division=0,
)
print(sr)

In [None]:
# Pass `labels` explicitly so the matrix rows/columns follow the same order as
# `display_labels` and keep a row/column for classes that never occur in the
# validation labels or predictions (otherwise the tick labels can be misaligned).
cm = confusion_matrix(val_labels, pred_labels, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
# Class names are long; rotate the x tick labels so they do not overlap
disp.ax_.tick_params(axis='x', labelrotation=90)
plt.show()