In [1]:
import os
import cv2
from os import path

import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd

from tensorflow import keras
from keras import regularizers
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.utils import to_categorical
from keras import regularizers
from sklearn.model_selection import train_test_split
from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray
from skimage import exposure
from random import sample

import wandb
from wandb import AlertLevel
from datetime import timedelta

wandb.login()

wandb: Currently logged in as: camperko (use `wandb login --relogin` to force relogin)


True

In [6]:
labels = pd.read_csv('data/train.csv')
labels.head()

Unnamed: 0,image,labels
0,800113bb65efe69e.jpg,healthy
1,8002cb321f8bfcdf.jpg,scab frog_eye_leaf_spot complex
2,80070f7fb5e2ccaa.jpg,scab
3,80077517781fb94f.jpg,scab
4,800cbf0ff87721f8.jpg,complex


In [7]:
categories = []
for index, row in labels.iterrows():
    parts = row['labels'].split(' ')
    for part in parts:
        if part not in categories:
            categories.append(part)

category_dict = {}
for category in categories:
    labels[category] = np.zeros(labels.shape[0])
    category_dict[category] = 0

for index, row in labels.iterrows():
    parts = row['labels'].split(' ')
    for part in parts:
        labels[part][index] = 1
        category_dict[part] += 1

labels = labels.drop(['labels'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[part][index] = 1


In [8]:
import hashlib
import os

def file_hash(path):
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

duplicates = []
hash_set = []

for index, row in labels.iterrows():
    image = row['image']
    calc_hash = file_hash(os.getcwd() + '\\data\\train_images\\' + image)
    if calc_hash in hash_set:
        duplicate_index = hash_set.index(calc_hash)
        duplicates.append((duplicate_index, index))
    hash_set.append(calc_hash)

In [15]:
new_labels = labels.copy()
to_remove = []
for dup in duplicates:
    for category in categories:
        if labels[category][dup[1]] == 1:
            labels[category][dup[0]] = 1
    to_remove.append(dup[1])

to_remove.sort(reverse=True)

removed_images = []
for rem in to_remove:
    removed_images.append(labels['image'][rem])
    new_labels = new_labels.drop(index=rem)
    if path.exists('data/train_images/' + labels['image'][rem]):
        os.remove(os.getcwd() + '\\data\\train_images\\' + labels['image'][rem])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[category][dup[0]] = 1


In [16]:
config = {"lr": 1e-5,
          "num_epoch": 100,
          "size": 512,
          "optimizer": "adam"}

def normalize_image(in_image):
    sharpen_kernel = np.array([[-0.5, -0.5, -0.5],
                               [-0.5,    5, -0.5],
                               [-0.5, -0.5, -0.5]])

    # resized
    resized_image = resize(in_image, (config["size"], config["size"]), anti_aliasing=True)

    # with expo
    # expo_image = exposure.equalize_adapthist(resized_image, clip_limit=0.03)

    # after sharpen expo
    sharpen_image = cv2.filter2D(resized_image, -1, sharpen_kernel)

    # after gray sharpen expo
    # sharpen_expo_gray_image = rgb2gray(sharpen_expo_image)

    return sharpen_image

image_set = []
labels_set = []

for index, row in new_labels.iterrows():
    if path.exists('data/train_images/' + row['image']):
        image_normalized = normalize_image(imread('data/train_images/' + row['image']))
        image_set.append(image_normalized)
        labels_set.append(row)
    if index == 100:
        break
        
labels_set = pd.DataFrame(labels_set)

In [17]:
model = Sequential()

model.add(Conv2D(64, input_shape=(config['size'], config['size'], 3), kernel_size=(3, 3), padding='same', activation='relu'))
model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3), padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512, activation = 'relu'))
model.add(Dropout(0.5))

model.add(Dense(512, activation = 'relu', kernel_regularizer = regularizers.l2(0.001)))
model.add(Dropout(0.5))

model.add(Dense(6, activation='sigmoid'))

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 512, 512, 64)      1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 512, 512, 128)     73856     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 171, 171, 128)     0         
_________________________________________________________________
dropout (Dropout)            (None, 171, 171, 128)     0         
_________________________________________________________________
flatten (Flatten)            (None, 3742848)           0         
_________________________________________________________________
dense (Dense)                (None, 512)               1916338688
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0

In [18]:
opt = keras.optimizers.Adam(learning_rate=config['lr'])
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

model.predict(np.asarray(image_set))

array([[0.47277465, 0.49395263, 0.49551615, 0.5039826 , 0.4788181 ,
        0.49206182],
       [0.4736737 , 0.49760833, 0.49583256, 0.5084543 , 0.47841865,
        0.5009205 ],
       [0.46796238, 0.49271357, 0.49354723, 0.50863826, 0.4774256 ,
        0.49857545],
       [0.46800625, 0.491346  , 0.49648926, 0.5072556 , 0.48068482,
        0.4979735 ],
       [0.46549615, 0.49764776, 0.4944974 , 0.5048449 , 0.482241  ,
        0.49683017],
       [0.470128  , 0.49501783, 0.4906391 , 0.5107676 , 0.47649935,
        0.4905801 ],
       [0.4718315 , 0.5007675 , 0.49330798, 0.50693136, 0.4786547 ,
        0.49684024],
       [0.48158857, 0.49155653, 0.49225056, 0.505018  , 0.4844087 ,
        0.50031227],
       [0.47148138, 0.4959215 , 0.4973664 , 0.5063042 , 0.48069572,
        0.5003324 ],
       [0.47344   , 0.49973187, 0.49932143, 0.50997525, 0.4795876 ,
        0.49834764],
       [0.4679477 , 0.49741113, 0.49670827, 0.5066102 , 0.48722026,
        0.5014159 ],
       [0.47452816, 0