# Creating HF dataset
This notebook reads the existing data and creates a Hugging Face (HF) dataset.

In [1]:
import pandas as pd
from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetInfo, DatasetDict
from PIL import Image
import glob
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to load images
def load_image(image_path):
    """Open an image file and return it with its pixel data loaded.

    ``Image.open`` is lazy: it leaves the underlying file open until the
    pixel data is actually read. Reading the data inside a ``with`` block
    releases the file handle before returning, so loading many images does
    not accumulate open files.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The PIL image object, fully loaded into memory.
    """
    with Image.open(image_path) as img:
        img.load()  # force the pixel data to be read while the file is open
    return img

# Base folder of the data; the CSV files and the images are kept in its
# "texts" and "images" sub-folders respectively.
output_folder = "../data/Chinese"
text_dir = "/".join([output_folder, "texts"])
image_dir = "/".join([output_folder, "images"])

In [3]:
# Function to process each dataframe and create a Hugging Face dataset
def process_dataframe(df, image_folder=None):
    """Convert a dataframe into a Hugging Face dataset with an 'image' column.

    Each row's 'identifier' value is used as the stem of a PNG file name
    inside the image folder; that file is loaded and stored under the
    'image' key of the row.

    Args:
        df: pandas DataFrame that has an 'identifier' column.
        image_folder: Folder containing the PNG files. Defaults to the
            notebook-level ``image_dir``.

    Returns:
        The dataset built from ``df`` after the image loader has been
        mapped over every row.

    Raises:
        KeyError: If ``df`` has no 'identifier' column.
    """
    # Fail early with a clear message instead of part-way through the map
    if 'identifier' not in df.columns:
        raise KeyError("dataframe has no 'identifier' column; cannot locate images")

    # Resolve the default at call time so the current image_dir is used
    folder = image_dir if image_folder is None else image_folder
    dataset = Dataset.from_pandas(df)

    # Function to map the image loading for each row
    def process_images(example):
        # 'identifier' is the image file name without its extension
        image_filename = example['identifier'] + '.png'
        example['image'] = load_image(os.path.join(folder, image_filename))
        return example

    # Apply the image loading function
    return dataset.map(process_images)

In [4]:
# Sort the paths: glob returns matches in a filesystem-dependent order, which
# would make the row order of the combined dataset vary between machines.
csv_paths = sorted(glob.glob(f"{text_dir}/*.csv"))
dataframes = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [5]:
# Number of CSV files that were read into dataframes
len(dataframes)

11

In [6]:
# Build one dataset per dataframe, then stack them into a single dataset
datasets = list(map(process_dataframe, dataframes))
combined_dataset = concatenate_datasets(datasets)

Map: 100%|██████████| 4/4 [00:00<00:00, 392.70 examples/s]
Map: 100%|██████████| 7/7 [00:00<00:00, 1851.09 examples/s]
Map: 100%|██████████| 7/7 [00:00<00:00, 1941.17 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 1389.30 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 1026.42 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 1542.82 examples/s]
Map: 100%|██████████| 9/9 [00:00<00:00, 2513.73 examples/s]
Map: 100%|██████████| 7/7 [00:00<00:00, 2143.23 examples/s]
Map: 100%|██████████| 12/12 [00:00<00:00, 2768.21 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 1035.03 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 2169.70 examples/s]


In [7]:
# Metadata for the dataset: description, citation, license and version.
dataset_info = DatasetInfo(
    # FIXME(review): this description still refers to the Jawi / Warta Malaya
    # data, while this notebook reads from "../data/Chinese" and the citation
    # title is Gongguan-OCR-1. Confirm the correct wording before this info
    # is attached to the dataset.
    description="This dataset includes images with corresponding text captions for OCR tests in Jawi (Malay written with Arabic letters). The data is taken from a publicly available repository of the Warta Malaya newspaper.",
    citation="""
    @dataset{nus_gongguan_ocr_2025,
      author    = {NUS},
      title     = {Gongguan-OCR-1},
      year      = {2025},
      version   = {1.0.0},
      publisher = {Hugging Face},
      url       = {https://huggingface.co/datasets/mevsg/Gongguan-OCR-v1}
    }
    """,
    license="CC BY-SA 4.0",  # Example license
    version="1.0.0"
)

In [8]:
#combined_dataset.info = dataset_info

In [9]:
#combined_dataset.save_to_disk('../../data/hf/Jawi-OCR-v1')

## Splitting into train and validation sets

In [10]:
# Split the dataset (80% train, 20% validation).
# A fixed seed makes the shuffle, and therefore the split, reproducible.
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=42)

In [11]:
# To split one named dataset out of a DatasetDict instead, use:
# train_test_split = dataset['your_dataset_name'].train_test_split(test_size=0.2)

# Collect the two parts in a DatasetDict for saving; the part that the split
# calls 'test' is stored under the key 'validation'.
split_dataset = DatasetDict({
    target_name: train_test_split[source_name]
    for target_name, source_name in (('train', 'train'), ('validation', 'test'))
})

### Saving to HF hub

Make sure that you add tokens/ to .gitignore to avoid sharing your tokens publicly!

In [None]:
# Read the access token from a local file.
# strip() removes surrounding whitespace such as the trailing newline most
# editors append, which would otherwise become part of the token string.
with open("tokens/token", encoding="utf-8") as f:
    token = f.read().strip()

In [13]:
# Upload the train/validation DatasetDict to the "mevsg/Gongguan-OCR-v1"
# dataset repository, authenticating with the token read from tokens/token.
split_dataset.push_to_hub("mevsg/Gongguan-OCR-v1", token=token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
[A:   0%|          | 0/55 [00:00<?, ? examples/s]
Map: 100%|██████████| 55/55 [00:00<00:00, 483.93 examples/s]

[Aating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1342.33 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 29.69ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/mevsg/Gongguan-OCR-v1/commit/a63da6cd8c81444b4d995aa6a2cb0b15d6b5ab8e', commit_message='Upload dataset', commit_description='', oid='a63da6cd8c81444b4d995aa6a2cb0b15d6b5ab8e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mevsg/Gongguan-OCR-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mevsg/Gongguan-OCR-v1'), pr_revision=None, pr_num=None)