## 01-collect-data

### Step 1: Load data

In [None]:
from glob import glob

# Gather the paths two levels below ./Images (the image files) and the
# entries directly inside ./Images (presumably one sub-directory per label).
# glob() already returns a list, so no extra conversion is needed.
file_names = glob('./Images/*/*')
dir_names = glob('./Images/*')

# report how many image paths were found
print('There are %d total images.' % len(file_names))

In [None]:
# Pick the directories to process. Every entry of dir_names is kept; the
# result is a separate list (shallow copy), so it can be edited on its own.
selected_dirnames = list(dir_names)
print(selected_dirnames)

### Step 2: Upload data to Datalake channel

In [None]:
# Credential handed to the ABEJA clients created later in this notebook.
# The values are placeholders: replace them with your own before running.
credential = dict(
    user_id='user-XXXXXXXXXXXXX',
    personal_access_token='XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
)

# organization the clients are created for (placeholder as well)
organization_id = 'XXXXXXXXXXXXX'

In [None]:
import os

from abeja.datalake import Client as DatalakeClient
from tqdm import tqdm

# set datalake channel_id (placeholder: replace with the target channel)
channel_id = 'XXXXXXXXXXXXX'

# connect with the organization/credential pair and fetch the channel
datalake_client = DatalakeClient(organization_id, credential)
channel = datalake_client.get_channel(channel_id)

# Upload every selected directory to the channel, passing a label derived
# from the directory name as metadata.
for dirname in tqdm(selected_dirnames):
    # upper-case the directory's base name and drop its first 10 characters
    base_name = os.path.basename(dirname)
    label_name = base_name.upper()[10:]
    channel.upload_dir(dirname, metadata={'label': label_name})

### Step 4: Create dataset label

In [None]:
import json

# Label names come from the selected directory names: upper-cased base name
# without its first 10 characters, sorted so that ids follow sorted order.
labels = sorted(os.path.basename(path).upper()[10:] for path in selected_dirnames)

# label name -> numeric id, plus the same pairs as a list of records
label_to_id = {label_name: label_id for label_id, label_name in enumerate(labels)}
labels_and_id = [
    {'label_id': label_id, 'label': label_name}
    for label_id, label_name in enumerate(labels)
]

# define category name
category_name = 'bull-classificaiton'

# the single category (id 0) that holds every label
category = {'category_id': 0, 'name': category_name, 'labels': labels_and_id}

# Dataset properties. As the last expression of the cell, the JSON string
# is what a notebook displays as the cell output.
props = {'categories': [category]}
json.dumps(props)

### Step 5: Create Dataset from Datalake channel

In [None]:
# create dataset by importing datalake files
from abeja.datasets import Client as DatasetClient

# lower-cased file extension -> content type used for the dataset item
CONTENT_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
}

dataset_client = DatasetClient(organization_id, credential)

# define dataset id (placeholder: replace with the target dataset)
dataset_id = 'XXXXXXXXXXXXX'
dataset = dataset_client.get_dataset(dataset_id)

# Register each file of the datalake channel as a classification item.
for datalake_file in tqdm(channel.list_files()):
    data_uri = datalake_file.uri
    filename = datalake_file.metadata['filename']
    label = datalake_file.metadata['label']
    label_id = label_to_id[label]

    # resolve the content type from the extension, case-insensitively
    extension = os.path.splitext(filename)[1].lower()
    content_type = CONTENT_TYPES.get(extension)
    if content_type is None:
        # anything other than jpg/jpeg/png is reported and skipped
        print('{} is invalid file type.'.format(filename))
        continue

    dataset.dataset_items.create(
        [{'data_uri': data_uri, 'data_type': content_type}],
        {'classification': [{'category_id': 0, 'label_id': label_id, 'label': label}]},
    )