In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import glob
import argparse
import random
import numpy as np
import lmdb
import cv2
import pickle
import caffe
from caffe.proto import caffe_pb2
from time import time
from sklearn.model_selection import train_test_split

In [None]:
!apt-get install -y caffe-cpu

## Directory parser for file .py

In [33]:
# ap = argparse.ArgumentParser()
# ap.add_argument('-p', '--path', required=True, help='Path to dataset directory')
args = '/home/helpthx/TCC-1/TCC-1-UnB/downloads'

## Default imagem size for resnet architecture 

In [34]:
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224

In [35]:
def transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT, equalize=False):
  """Function that resize an image and equalize it if necessary."""
  if equalize:
    # Histogram Equalization
    img[:, :, 0] = cv2.equalizeHist(img[:, :, 0])
    img[:, :, 1] = cv2.equalizeHist(img[:, :, 1])
    img[:, :, 2] = cv2.equalizeHist(img[:, :, 2])
    
  # Image Resizing
  img = cv2.resize(img, (img_width, img_height), interpolation=cv2.INTER_CUBIC)
  
  return img

In [36]:
def make_datum(img, label):
  # Image is numpy.ndarray format. BGR instead of RGB
  return caffe_pb2.Datum(channels=3,
                         width=IMAGE_WIDTH,
                         height=IMAGE_HEIGHT,
                         label=label,
                         data=np.rollaxis(img, 2).tostring())

## Creating train and validation datasets with lmdb lib

In [37]:
path = '/home/helpthx/TCC-1/TCC-1-UnB/downloads'
parent_path = os.path.sep.join(path.split(os.path.sep)[:-1])
sibling_path = path.split(os.path.sep)[-1] + '_lmdb'
sibling_path = os.path.sep.join([parent_path, sibling_path])
train_lmdb = os.path.sep.join([sibling_path, 'train'])
validation_lmdb = os.path.sep.join([sibling_path, 'validation'])

In [38]:
print('train lmdb path: ', train_lmdb)
print('validation path: ', validation_lmdb)
print('sibling path: ', sibling_path)
print('parent path: ', parent_path)

train lmdb path:  /home/helpthx/TCC-1/TCC-1-UnB/downloads_lmdb/train
validation path:  /home/helpthx/TCC-1/TCC-1-UnB/downloads_lmdb/validation
sibling path:  /home/helpthx/TCC-1/TCC-1-UnB/downloads_lmdb
parent path:  /home/helpthx/TCC-1/TCC-1-UnB


In [39]:
if not os.path.exists(sibling_path):
    os.makedirs(sibling_path)

In [40]:
os.system('rm -rf ' + train_lmdb)
os.system('rm -rf ' + validation_lmdb)

0

In [41]:
dataset = []
for r, dirs, files in os.walk(path):
    if len(dirs) > 0:
        labels = dirs
        continue # use only leaf folders
    files_full_path = ['{}/{}'.format(r, f) for f in files]
    directory_name = r.split(os.path.sep)[-1]
    dataset.append((files_full_path, directory_name))

In [42]:
label_dict = [(l, i) for i, l in enumerate(labels)]
label_dict = dict(label_dict)
label_dict

{'                     Basal cell carcinoma': 0,
 '                     Dermatofibroma': 1,
 '                     Squamous cell carcinoma': 2,
 'Actinic Keratosis': 3,
 '                     Pyogenic granuloma': 4,
 '                     Seborrheic keratosis': 5,
 '                     Melanocytic nevus': 6,
 '                     Wart': 7,
 '                     Lentigo': 8,
 '                     Bowen’s disease': 9,
 '                     Intraepithelial carcinoma': 10,
 '                    Hemangioma': 11,
 '                     Malignant melanoma': 12}

'''
Save dictionary in the form of:

label_dict = { \
'basalcellcarcinoma': 0, \
'lentigo': 1, \
'malignantmelanoma': 2, \
'pigmentednevus': 3, \
'seborrheickeratosis': 4, \
'wart': 5, \
... \
}
'''

In [43]:
with open('label_dict.pkl', 'wb') as f:
    pickle.dump(label_dict, f)
    f.close()

In [48]:
X = [(img, label) for ndataset, label in dataset for img in ndataset]
y = [label_dict[label] for _, label in X]

In [49]:
random.shuffle(X)

## Spliting datasets

In [50]:
train_data, test_data, _, _ = train_test_split(X, y, train_size=0.8, stratify=y)

In [51]:
print('Creating train_lmdb...')

Creating train_lmdb...


In [59]:
train_time = time()
in_db = lmdb.open(train_lmdb, map_size=int(1e12))
with in_db.begin(write=True) as in_txn:
    for in_idx, (img_path, label) in enumerate(train_data):
        if in_idx % 100 == 0:
            print('Processed {}/{}'.format(in_idx, len(train_data)), end='\r')

    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
 
    num_label = label_dict[label]
    datum = make_datum(img, num_label)

    key = '{:0>6d}'.format(in_idx)
    in_txn.put(key.encode(), datum.SerializeToString())

Processed 0/748Processed 100/748Processed 200/748Processed 300/748Processed 400/748Processed 500/748Processed 600/748Processed 700/748channels: 3
height: 224
width: 224
label: 4



In [54]:
in_db.close()
print('Finished {} train_lmdb in {:.2f} sec'.format(len(train_data), (time() - train_time)))

Finished 748 train_lmdb in 1.81 sec


In [55]:
print('\nCreating validation_lmdb...')


Creating validation_lmdb...


In [56]:
test_time = time()
in_db = lmdb.open(validation_lmdb, map_size=int(1e12))
with in_db.begin(write=True) as in_txn:
    old_t = time()
    for in_idx, (img_path, label) in enumerate(test_data):
        if in_idx % 100 == 0:
            print('Processed {}/{}'.format(in_idx, len(test_data)), end='\r\r')
      
            old_t = time()
      
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    img = transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT)
    num_label = label_dict[label]
    datum = make_datum(img, num_label)
  
    key = '{:0>6d}'.format(in_idx)
    in_txn.put(key.encode(), datum.SerializeToString())

Processed 0/188Processed 100/188

In [57]:
in_db.close()
print('Finished {} test_lmdb in {:.2f} sec'.format(len(test_data), (time() - test_time)))

Finished 188 test_lmdb in 1.49 sec


In [58]:
print('\nFinished processing all images in {:.2f}'.format(time() - train_time))


Finished processing all images in 23.86
