In [1]:
from datasets import load_dataset

# Load the OCR-Drutsa dataset. The result is a DatasetDict keyed by split
# name (not a single split), so later cells index it with ['train'].
DATASET_REPO = "openpecha/OCR-Drutsa"
dataset = load_dataset(DATASET_REPO)

# Summarise the loaded splits: column names and row counts
print(dataset)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32364 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['filename', 'label', 'url', 'batch_id', 'state'],
        num_rows: 32364
    })
})


In [2]:
# Inspect the first record of the train split to see the raw field values
dataset["train"][0]

{'filename': 'KS_11-061_line_9874_4.jpg',
 'label': 'བས་སོ་། །༢པ་ཡང་མི་འཐད་ཏེ་སྣལ་མ་དུ་མ་འདུས་པ་ལས་འབྲས་བུ་སྣམ་པུ་ཡོད་ན་སྣལ་མ་སོ་སོ་ལ་འབྲས་བུ་ཆ་རི་དམིགས་པར་ཐལ་ལོ་། །དེས་ན་སྣལ་',
 'url': 'https://s3.amazonaws.com/monlam.ai.ocr/line_to_text/batch35/KS_11-061_line_9874_4.jpg',
 'batch_id': 'batch35',
 'state': 'finalised'}

## Add Placeholder Image Column

In [3]:
from datasets import Image as ImageFeature

def add_placeholder_image(example):
    """
    Give one example an empty 'image' field.

    The example is modified in place: its 'image' key is set to None
    (overwriting any existing value) and the same object is returned, which
    is the shape ``dataset.map()`` expects from a per-example function.
    """
    example["image"] = None
    return example

# Create the 'image' column filled with None, then declare it as an Image
# feature so the column carries the image type rather than a plain null type.
print("Adding placeholder image column...")
dataset = dataset.map(
    add_placeholder_image, desc="Adding image column"
).cast_column("image", ImageFeature())

# Confirm the new column is present on the first train record
dataset["train"][0]

Adding placeholder image column...


Adding image column:   0%|          | 0/32364 [00:00<?, ? examples/s]

{'filename': 'KS_11-061_line_9874_4.jpg',
 'label': 'བས་སོ་། །༢པ་ཡང་མི་འཐད་ཏེ་སྣལ་མ་དུ་མ་འདུས་པ་ལས་འབྲས་བུ་སྣམ་པུ་ཡོད་ན་སྣལ་མ་སོ་སོ་ལ་འབྲས་བུ་ཆ་རི་དམིགས་པར་ཐལ་ལོ་། །དེས་ན་སྣལ་',
 'url': 'https://s3.amazonaws.com/monlam.ai.ocr/line_to_text/batch35/KS_11-061_line_9874_4.jpg',
 'batch_id': 'batch35',
 'state': 'finalised',
 'image': None}

## Add Images

In [4]:
import requests
from PIL import Image
import io
from datasets import Dataset
import pandas as pd

def download_image(example, timeout=30):
    """
    Download the image at ``example['url']`` and store it in ``example['image']``.

    Written to be passed to ``dataset.map()``. Failures are deliberately
    non-fatal: any error while fetching or decoding is printed and the
    example's 'image' is set to None so the remaining rows still get processed.

    Args:
        example: Mapping for a single row; expected to contain a 'url' key.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking the worker forever.

    Returns:
        The same ``example`` object, with 'image' set to a PIL image on
        success or to None on failure.
    """
    try:
        # Download the image; without a timeout requests would wait indefinitely
        response = requests.get(example['url'], timeout=timeout)
        response.raise_for_status()

        # Open image and convert to PIL Image object
        image = Image.open(io.BytesIO(response.content))
        # Image.open only parses the header; force a full decode here so a
        # truncated or corrupt payload is handled by the except branch below
        # instead of surfacing later, outside this function.
        image.load()
        example['image'] = image

    except Exception as e:
        # If download or decoding fails, set image to None and continue
        print(f"Failed to download image from {example.get('url', 'unknown')}. Error: {e}")
        example['image'] = None

    return example

In [None]:
# Fetch every image of the train split using 4 worker processes.
# NOTE: `dataset` is rebound here from the DatasetDict to the mapped train
# Dataset, so this cell expects the earlier cells to have just been run.
train_split = dataset["train"]
dataset = train_split.map(download_image, num_proc=4)

Map (num_proc=4):   0%|          | 0/32364 [00:00<?, ? examples/s]