## Create ABEJA Platform dataset

Tutorial for creating an ABEJA Platform dataset from an online data source.  
Sample code uses Stanford Dogs Dataset: http://vision.stanford.edu/aditya86/ImageNetDogs/

### Step 1: Download data

In [None]:
# Download the Stanford Dogs image archive (images.tar).
# `-P /home/data` tells wget to save the file in /home/data.
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar -P /home/data

In [None]:
# Extract the tar file into /home/data/, showing progress with pv.
# First count the archive entries matching `.jpg`; the count is used as the
# expected total for the progress display below.
# NOTE(review): the unquoted, unescaped `.` in `grep .jpg` matches any
# character, not only a literal dot.
n_files = !tar tf /home/data/images.tar | grep .jpg | wc -l
# `tar xv` prints one line per extracted entry; pv counts those lines
# (`-l`) against the expected total (`-s`) and the listing itself is
# discarded.
!tar xvf /home/data/images.tar -C /home/data/ | pv -l -s {n_files[0]} > /dev/null

### Step 2: Load data

In [None]:
from glob import glob

# Collect the path of every image (one level below the Images directory)
# and the entries directly inside the Images directory.
image_pattern = '/home/data/Images/*/*'
image_dir_pattern = '/home/data/Images/*'
test_files = glob(image_pattern)
test_dir = glob(image_dir_pattern)

# Report how many images were found, then display the directory list.
n_images = len(test_files)
print('There are %d total images.' % n_images)
test_dir

In [None]:
import os

# Breeds to keep. Each name must equal the breed part of an image directory
# name, because the same names are reused (upper-cased) as the dataset
# labels and looked up exactly when the dataset items are created.
selected = ['French_bulldog', 'Chihuahua', 'bull_mastiff',
            'Labrador_retriever', 'Boston_bull']


def _breed_of(path):
    """Return the breed part of the directory that directly contains *path*.

    Directory names are assumed to look like '<id>-<breed>' (TODO confirm
    for other data sources); a name without '-' is returned unchanged.
    """
    parent = os.path.basename(os.path.dirname(path))
    return parent.split('-', 1)[-1]


# Compare breed names exactly instead of searching the whole path for a
# substring: a substring test also matches longer breed names (e.g.
# 'poodle' in 'toy_poodle') and can append the same file more than once.
selected_breeds = set(selected)
selected_files = [path for path in test_files
                  if _breed_of(path) in selected_breeds]

# print number of images in dataset
print('There are %d selected images.' % len(selected_files))

In [None]:
# show one of images
import matplotlib.pyplot as plt                        
%matplotlib inline         
from PIL import Image, ImageFile 

image = Image.open(selected_files[0])
plt.imshow(image)
plt.show()

### Step 3: Create Datalake channel

In [None]:
from abeja.datalake import Client as DatalakeClient
from abeja.datalake.storage_type import StorageType

# Placeholders: replace with your own channel name and description.
name = 'XXXXX'
description = 'XXXXX'

# Create the Datalake channel and keep its id.
datalake_client = DatalakeClient()
storage_type = StorageType.DATALAKE.value
channel = datalake_client.channels.create(name, description, storage_type)
channel_id = channel.channel_id

print('channel_id is %s.' % channel_id)

In [None]:
import os
from tqdm import tqdm

# Upload every selected image to the Datalake channel, attaching the
# upper-cased breed name as 'label' metadata.
for path in tqdm(selected_files):
    # The directory containing the image identifies the breed; its name is
    # assumed to look like '<id>-<breed>' (TODO confirm for other data).
    dir_name = os.path.basename(os.path.dirname(path))
    # Keep what follows the first '-' instead of slicing a fixed number of
    # characters, so the label does not depend on the prefix length.
    label_name = dir_name.split('-', 1)[-1].upper()
    metadata = {'label': label_name}
    channel.upload_file(path, metadata=metadata)

### Step 4: Create dataset label

In [None]:
# Category names are the selected breed names, upper-cased and sorted.
categories = [item.upper() for item in selected]
categories.sort()

print('No. of categories is %d.' % len(categories))
categories

In [None]:
# Create the dataset labels from the category names: each category gets its
# position in `categories` as its label id.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace (a `name` loop variable would overwrite the channel name).
labels = [{'label_id': label_id, 'label': label}
          for label_id, label in enumerate(categories)]

# Reverse lookup: label name -> label id.
label_to_id = {entry['label']: entry['label_id'] for entry in labels}
labels

In [None]:
# Name of the dataset; it is also used as the name of its single category.
dataset_name = 'XXXXX'

# Dataset properties: one category (id 0) holding all the labels.
props = {
    'categories': [
        {'category_id': 0, 'name': dataset_name, 'labels': labels},
    ],
}
category = props['categories'][0]
props

### Step 5: Create Dataset from Datalake channel

In [None]:
from abeja.datasets import Client as DatasetClient

dataset_client = DatasetClient()

# Create a classification dataset from the name and label properties, and
# keep its id.
dataset = dataset_client.datasets.create(
    name=dataset_name,
    type='classification',
    props=props,
)
dataset_id = dataset.dataset_id

print('dataset_id is %s.' % dataset_id)

In [None]:
# Optional clean-up: uncomment the next line to delete the dataset
# identified by `dataset_id`.
#dataset_client.datasets.delete(dataset_id)

In [None]:
# Create dataset items by importing the Datalake channel files.

# Lower-cased file extension -> content type recorded for the item.
content_types = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
}

for f in tqdm(channel.list_files()):
    data_uri = f.uri
    filename = f.metadata['filename']
    label = f.metadata['label']
    label_id = label_to_id[label]

    # Skip anything that is not a JPEG or PNG file.
    extension = os.path.splitext(filename)[1].lower()
    content_type = content_types.get(extension)
    if content_type is None:
        print('{} is invalid file type.'.format(filename))
        continue

    source_data = [{'data_uri': data_uri, 'data_type': content_type}]
    attributes = {'classification': [{'category_id': 0, 'label_id': label_id, 'label': label}]}

    dataset.dataset_items.create(source_data, attributes)