In [1]:
from datasets import load_dataset

# Load the dataset from the Hub. No `split` argument is passed, so the result
# is a DatasetDict (see the printed summary); the 'train' split is indexed
# explicitly in the cells that follow.
dataset = load_dataset("openpecha/OCR-Betsug")

# Print the split summary: split names, feature names and row counts.
print(dataset)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/4.51M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28318 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['filename', 'label', 'url', 'batch_id', 'state'],
        num_rows: 28318
    })
})


In [2]:
# Inspect one raw record: at this stage a row carries a 'url' pointing at the
# image, not the image itself.
dataset['train'][0]  # Display the first example in the training set

{'filename': 'I1KG812780077_1.jpg',
 'label': 'ཀྱི་ཐིག་ལེ་ཏེ་།   རྟེན་དང་བརྟེན་པར་བཅས་པ་ཉིད་།  རང་ཉིད་གསལ་བར་བྱིན་བརླབས་ན་།  ཇི་ལྟར་སྣང་སྲིད་ཆོས་ཐམས་ཅད་།   ལྷན་ཅིག་སྐྱེས་པའི་ངོ་བོར་',
 'url': 'https://s3.amazonaws.com/monlam.ai.ocr/OCR/training_images/I1KG812780077_1.jpg',
 'batch_id': 'batch22',
 'state': 'finalised'}

## Add Placeholder Image Column

In [3]:
from datasets import Image as ImageFeature

def add_placeholder_image(example):
    """Attach an empty ``image`` entry to a single example.

    The example is updated in place and the same object is returned, so any
    existing ``image`` value is overwritten with None.
    """
    example.update(image=None)
    return example

# Create the empty 'image' column on every split, then declare that column as
# an Image feature in a single chained expression.
print("Adding placeholder image column...")
dataset = (
    dataset
    .map(add_placeholder_image, desc="Adding image column")
    .cast_column('image', ImageFeature())
)

# Show the first training example, which now includes the 'image' entry.
dataset['train'][0]

Adding placeholder image column...


Adding image column:   0%|          | 0/28318 [00:00<?, ? examples/s]

{'filename': 'I1KG812780077_1.jpg',
 'label': 'ཀྱི་ཐིག་ལེ་ཏེ་།   རྟེན་དང་བརྟེན་པར་བཅས་པ་ཉིད་།  རང་ཉིད་གསལ་བར་བྱིན་བརླབས་ན་།  ཇི་ལྟར་སྣང་སྲིད་ཆོས་ཐམས་ཅད་།   ལྷན་ཅིག་སྐྱེས་པའི་ངོ་བོར་',
 'url': 'https://s3.amazonaws.com/monlam.ai.ocr/OCR/training_images/I1KG812780077_1.jpg',
 'batch_id': 'batch22',
 'state': 'finalised',
 'image': None}

## Add Images

In [4]:
import requests
from PIL import Image
import io
from datasets import Dataset
import pandas as pd

def download_image(example, *, timeout=30.0):
    """
    Function to be used with dataset.map() to download the image for one example.

    Args:
        example: Mapping for a single row; its 'url' entry is fetched.
        timeout: Seconds handed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the map worker indefinitely.

    Returns:
        The same ``example`` object, with 'image' set to the downloaded PIL
        image, or to None when the download or decoding step fails.
    """
    try:
        # Download the image; the timeout turns a hung connection into a
        # handled failure instead of a worker that never finishes.
        response = requests.get(example['url'], timeout=timeout)
        response.raise_for_status()

        # Open image and convert to PIL Image object
        image = Image.open(io.BytesIO(response.content))
        # Image.open is lazy: force decoding now so a truncated or corrupt
        # payload is reported by the handler below rather than when the pixel
        # data is first read somewhere else.
        image.load()
        example['image'] = image

    except Exception as e:
        # Best effort by design: report the failure, set image to None and continue
        print(f"Failed to download image from {example.get('url', 'unknown')}. Error: {e}")
        example['image'] = None

    return example

In [5]:
# Download the image for every row of the 'train' split, using 4 worker processes.
# NOTE(review): `dataset` is rebound here from the DatasetDict to the mapped
# 'train' split, so `dataset['train']` presumably no longer works afterwards —
# re-run the earlier cells before re-running this one.
dataset = dataset['train'].map(download_image, num_proc=4)

Map (num_proc=4):   0%|          | 0/28318 [00:00<?, ? examples/s]

## Fix Schema

In [6]:
def filename_to_id(batch):
    """Derive the 'id' field from 'filename' by dropping the file extension.

    Despite the parameter name, this works on one example at a time:
    'filename' is treated as a single string.

    Only the last dot-separated component is removed, so a name such as
    'page.v2.jpg' becomes 'page.v2' instead of being cut at the first dot
    (which could make distinct files share an id). A name without any dot is
    used unchanged.

    Returns:
        The same mapping, with 'id' added.
    """
    batch['id'] = batch['filename'].rsplit('.', 1)[0]
    return batch

# Add the 'id' column to every row (4 worker processes); the result is kept
# under a new name, `ds`, leaving `dataset` as it was.
ds = dataset.map(filename_to_id, num_proc=4)

Map (num_proc=4):   0%|          | 0/28318 [00:00<?, ? examples/s]

In [7]:
# After this step only 'label', 'image' and the derived 'id' remain.
# NOTE(review): `ds` is overwritten in place, so re-running this cell on its
# own would ask to remove columns that are already gone — presumably an error.
ds = ds.remove_columns(['filename', 'url', 'batch_id', 'state'])  # Drop unnecessary columns

In [8]:
from datasets import Dataset

# Keep exactly these three columns, in this order. `select_columns` projects
# the existing dataset, so the images do not have to be pulled into Python
# objects and re-encoded just to reorder the schema (which building a new
# dataset from `ds['image']` requires), and the column feature types carry over.
reordered = ds.select_columns(['id', 'image', 'label'])

In [9]:
# Dataset summary on the first line, the first example on the next.
print(reordered, reordered[0], sep='\n')

Dataset({
    features: ['id', 'image', 'label'],
    num_rows: 28318
})
{'id': 'I1KG812780077_1', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2133x52 at 0x7750B0E2A210>, 'label': 'ཀྱི་ཐིག་ལེ་ཏེ་།   རྟེན་དང་བརྟེན་པར་བཅས་པ་ཉིད་།  རང་ཉིད་གསལ་བར་བྱིན་བརླབས་ན་།  ཇི་ལྟར་སྣང་སྲིད་ཆོས་ཐམས་ཅད་།   ལྷན་ཅིག་སྐྱེས་པའི་ངོ་བོར་'}


In [10]:
# Upload the id/image/label dataset to the Hub.
# NOTE(review): this is the same repo id the notebook loaded its input from, so
# the upload replaces the url-based schema with the new one. The earlier cells
# read 'url' and 'filename', which the uploaded schema no longer has, so a
# fresh run from the top against the updated repo presumably will not behave
# the same — consider pushing to a separate repo or revision.
# NOTE(review): rows whose download failed carry image=None and are uploaded
# as-is; confirm that is intended.
reordered.push_to_hub("openpecha/OCR-Betsug")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Map:   0%|          | 0/9440 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/9439 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/9439 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/openpecha/OCR-Betsug/commit/7745be0c6fd7a52ec63af9fcf4a0d10930b9fdf0', commit_message='Upload dataset', commit_description='', oid='7745be0c6fd7a52ec63af9fcf4a0d10930b9fdf0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/openpecha/OCR-Betsug', endpoint='https://huggingface.co', repo_type='dataset', repo_id='openpecha/OCR-Betsug'), pr_revision=None, pr_num=None)

In [11]:
# Reload the 'train' split from the Hub after the push; `dataset` is now a
# single split rather than a DatasetDict.
dataset = load_dataset("openpecha/OCR-Betsug", split="train")

# Print the first example of the reloaded split
print(dataset[0])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/364M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28318 [00:00<?, ? examples/s]

{'id': 'I1KG812780077_1', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2133x52 at 0x7750B32C6450>, 'label': 'ཀྱི་ཐིག་ལེ་ཏེ་།   རྟེན་དང་བརྟེན་པར་བཅས་པ་ཉིད་།  རང་ཉིད་གསལ་བར་བྱིན་བརླབས་ན་།  ཇི་ལྟར་སྣང་སྲིད་ཆོས་ཐམས་ཅད་།   ལྷན་ཅིག་སྐྱེས་པའི་ངོ་བོར་'}


In [12]:
# Number of rows in the reloaded split.
dataset.num_rows

28318