# Installing Dependencies

In [None]:
!pip3 install hub numpy pandas --quiet

# Loading Packages

In [None]:
import getpass

import hub
import numpy as np
import pandas as pd

# Downloading Raw Data

In [None]:
# Base URL of the KMNIST files. No trailing slash: the download commands join
# it to each file name with "/", so a trailing slash would yield "kmnist//file".
source_url = "http://codh.rois.ac.jp/kmnist/dataset/kmnist"

# Training split archives (images and labels).
train_images_filepath = "kmnist-train-imgs.npz"
train_labels_filepath = "kmnist-train-labels.npz"

# Test split archives (images and labels).
test_images_filepath = "kmnist-test-imgs.npz"
test_labels_filepath = "kmnist-test-labels.npz"

# CSV file read later to obtain the class names.
class_map_filepath = "kmnist_classmap.csv"

In [None]:
!curl -O {source_url}/{train_images_filepath} # Can also use `wget` if available
!curl -O {source_url}/{train_labels_filepath}

!curl -O {source_url}/{test_images_filepath}
!curl -O {source_url}/{test_labels_filepath}

!curl -O {source_url}/{class_map_filepath}

# Loading Class Labels

In [None]:
# Read the class map CSV, using its first column as the row index.
class_map_table = pd.read_csv(class_map_filepath, index_col=0, encoding='utf-8')

# Class names are the values of the `codepoint` column, in row order.
class_names = class_map_table["codepoint"].tolist()

# Creating Dataset and Uploading to `hub`

## Login

Logging in is only required when the dataset is stored on Activeloop storage (a `hub://` path); skip this step when writing to a local path.

In [None]:
username = "<USERNAME>"
password = "<PASSWORD>"

!activeloop login -u '{username}' -p '{password}'

In [None]:
# Root under which both datasets are created (use `"."` instead for local storage).
workspace_path = "hub://{}".format(username)

## Train Set

In [None]:
# Destination of the training split: <workspace_path>/kmnist-train
dataset_name = "kmnist-train"
dataset_path = "/".join((workspace_path, dataset_name))

In [None]:
# Create an empty dataset at `dataset_path`.
ds = hub.empty(dataset_path, overwrite=True) # Set `overwrite=True` to overwrite any existing data under the same path

# Define the schema and attach the dataset-level metadata.
with ds:
    # NOTE(review): "jpg" is a lossy format; confirm that lossy storage is acceptable for this data.
    ds.create_tensor('images', htype = 'image', sample_compression = "jpg")
    ds.create_tensor('labels', htype = 'class_label', class_names = class_names)

    ds.info.update(
        description = "Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).", 
        citation="@online{clanuwat2018deep,  author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha},  title={Deep Learning for Classical Japanese Literature},  date={2018-12-03},  year={2018},  eprintclass={cs.CV},  eprinttype={arXiv},  eprint={cs.CV/1812.01718}}"
    )

# `np.load` on an .npz archive keeps a file handle open; read the arrays inside
# a context manager so both archives are closed once the data is in memory.
with np.load(train_images_filepath) as images_npz, np.load(train_labels_filepath) as labels_npz:
    train_images = images_npz['arr_0']
    train_labels = labels_npz['arr_0']

# `zip` would silently drop unmatched samples, so fail loudly on a mismatch.
assert len(train_images) == len(train_labels), "image and label counts differ"

# Append one sample per (image, label) pair.
with ds:
    for image, label in zip(train_images, train_labels):
        ds.append({'images': image, 'labels': np.uint32(label)})

## Test Set

In [None]:
# Destination of the test split: <workspace_path>/kmnist-test
dataset_name = "kmnist-test"
dataset_path = "/".join((workspace_path, dataset_name))

In [None]:
# Create an empty dataset at `dataset_path`, replacing any existing data there.
ds = hub.empty(dataset_path, overwrite=True)

# Define the schema and attach the dataset-level metadata.
with ds:
    # NOTE(review): "jpg" is a lossy format; confirm that lossy storage is acceptable for this data.
    ds.create_tensor('images', htype = 'image', sample_compression = "jpg")
    ds.create_tensor('labels', htype = 'class_label', class_names = class_names)

    ds.info.update(
        description = "Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).", 
        citation="@online{clanuwat2018deep,  author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha},  title={Deep Learning for Classical Japanese Literature},  date={2018-12-03},  year={2018},  eprintclass={cs.CV},  eprinttype={arXiv},  eprint={cs.CV/1812.01718}}"
    )

# `np.load` on an .npz archive keeps a file handle open; read the arrays inside
# a context manager so both archives are closed once the data is in memory.
with np.load(test_images_filepath) as images_npz, np.load(test_labels_filepath) as labels_npz:
    test_images = images_npz['arr_0']
    test_labels = labels_npz['arr_0']

# `zip` would silently drop unmatched samples, so fail loudly on a mismatch.
assert len(test_images) == len(test_labels), "image and label counts differ"

# Append one sample per (image, label) pair.
with ds:
    for image, label in zip(test_images, test_labels):
        ds.append({'images': image, 'labels': np.uint32(label)})

Dataset documentation: https://docs.activeloop.ai/datasets/kmnist