In [1]:
# Install necessary libraries
!pip install -U deep-translator datasets

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
# Report the CPU count that os.cpu_count() returns for this runtime.
num_cores = os.cpu_count()
print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 8


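## Translate TinyStories from English to Hindi (best implementation so far)

The cell below translates one user-selected chunk of the TinyStories training split with Google Translate, using an on-disk translation cache, request rate limiting with retries, and periodic saves to Google Drive.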

In [3]:
# First mount Google Drive
# The mount point is /content/drive, so the SAVE_DIR path configured below
# ('/content/drive/My Drive/my_data/') resolves inside the mounted Drive.
from google.colab import drive
drive.mount('/content/drive')

from deep_translator import GoogleTranslator
from datasets import load_dataset
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random
import datetime
import os
import json
from collections import deque
import nest_asyncio
import asyncio
import aiohttp

# Enable nested event loops for Jupyter, so that the loop.run_until_complete()
# call at the end of this cell can be used (presumably the kernel's own event
# loop is already running at that point).
nest_asyncio.apply()

# Configuration
BATCH_SIZE = 32  # number of stories sliced from the chunk and translated per loop iteration
MAX_WORKERS = 8  # not referenced anywhere else in the visible code
CACHE_SIZE = 10000  # capacity of the in-memory LRU translation cache
INTERMEDIATE_SAVE_FREQUENCY = 10000  # stories between intermediate saves of output and cache
REQUESTS_PER_SECOND = 3  # ceiling enforced by the rate limiter before each translate call
MAX_RETRIES = 3  # attempts per text before the untranslated text is returned instead
RUNTIME_LIMIT = datetime.timedelta(hours=24)  # no new batch is started once this much time has elapsed
SAVE_DIR = '/content/drive/My Drive/my_data/'  # directory for translated output files and the cache file

# Create save directory if it doesn't exist
os.makedirs(SAVE_DIR, exist_ok=True)

# LRU Cache implementation
class LRUCache:
    """Least-recently-used cache holding at most ``capacity`` entries.

    Entries are kept in one insertion-ordered ``dict`` (``self.cache``): the
    first key is the least recently used and the last key the most recently
    used. Refreshing an entry is a pop + re-insert, so ``get`` and ``put`` are
    O(1) instead of scanning a separate recency list.

    Attributes:
        cache: mapping of key -> value, ordered from least to most recently used.
        capacity: maximum number of entries; a value <= 0 disables storage.
        usage: read-only ``deque`` snapshot of the keys in the same order.
    """

    def __init__(self, capacity):
        self.cache = {}
        self.capacity = capacity

    @property
    def usage(self):
        """Return the keys ordered from least to most recently used.

        This is a snapshot; mutating the returned deque does not affect the
        cache.
        """
        return deque(self.cache)

    def get(self, key):
        """Return the value stored for ``key`` and mark it most recently used.

        Returns ``None`` when the key is absent.
        """
        if key not in self.cache:
            return None
        # Re-inserting moves the key to the end of the dict's ordering.
        value = self.cache.pop(key)
        self.cache[key] = value
        return value

    def put(self, key, value):
        """Store ``value`` under ``key`` as the most recently used entry.

        When the cache is full and ``key`` is new, the least recently used
        entry is evicted first.
        """
        if self.capacity <= 0:
            # Nothing can be stored; avoid evicting from an empty cache.
            return
        if key in self.cache:
            # Drop the old position so the re-insert below lands at the end.
            del self.cache[key]
        elif len(self.cache) >= self.capacity:
            oldest = next(iter(self.cache))
            del self.cache[oldest]
        self.cache[key] = value

# Initialize cache
# Module-level cache used by load_cache(), save_cache() and
# TranslatorService.translate_text(); it holds at most CACHE_SIZE entries.
translation_cache = LRUCache(CACHE_SIZE)

# Load local cache if exists
def load_cache():
    """Populate ``translation_cache`` from ``translation_cache.json`` in SAVE_DIR.

    The file is expected to hold a JSON object mapping source text to its
    translation. A missing file, a file that is not valid JSON (for example a
    truncated write), or a JSON document whose root is not an object is
    reported and skipped, leaving the cache as it was, so that a damaged cache
    file cannot abort the run.
    """
    cache_path = os.path.join(SAVE_DIR, 'translation_cache.json')
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)
    except FileNotFoundError:
        print("No existing cache found")
        return
    except json.JSONDecodeError as e:
        print(f"Ignoring unreadable cache file {cache_path}: {e}")
        return

    if not isinstance(cache_data, dict):
        print(f"Ignoring cache file {cache_path}: expected a JSON object")
        return

    # Entries are inserted in file order, so the last entries in the file end
    # up as the most recently used ones.
    for k, v in cache_data.items():
        translation_cache.put(k, v)
    print(f"Loaded {len(cache_data)} cached translations")

# Save cache
def save_cache():
    """Write the contents of ``translation_cache`` to ``translation_cache.json``.

    Entries are written in the order given by ``translation_cache.usage``. The
    data is first written to a temporary file next to the target and then
    moved into place with ``os.replace``, so an interruption during the write
    cannot leave a truncated cache file behind.
    """
    cache_path = os.path.join(SAVE_DIR, 'translation_cache.json')
    tmp_path = cache_path + '.tmp'
    cache_data = {k: translation_cache.cache[k] for k in translation_cache.usage}
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(cache_data, f, ensure_ascii=False)
    # Swap the finished file in only after it has been fully written and closed.
    os.replace(tmp_path, cache_path)
    print(f"Saved {len(cache_data)} translations to cache")

# Rate limiter class
class RateLimiter:
    """Async throttle that spaces out calls to at most ``max_per_second``.

    ``acquire()`` returns no sooner than ``1 / max_per_second`` seconds after
    the previous ``acquire()`` returned. Callers are serialized by an
    ``asyncio.Lock``, so concurrent tasks pass through one at a time.

    Intervals are measured with ``time.monotonic()`` so that adjustments of the
    system clock can neither shorten nor lengthen the enforced delay.
    """

    def __init__(self, max_per_second):
        self.delay = 1.0 / max_per_second
        # -inf means "never called": the first acquire() never has to wait.
        self.last_called = float("-inf")
        self._lock = asyncio.Lock()

    async def acquire(self):
        """Wait until the minimum interval since the previous call has passed."""
        async with self._lock:
            elapsed = time.monotonic() - self.last_called
            if elapsed < self.delay:
                await asyncio.sleep(self.delay - elapsed)
            # Stamp after sleeping so the next interval starts from now.
            self.last_called = time.monotonic()

# Translator class with rate limiting and retries
class TranslatorService:
    """Translate text from 'en' to 'hi' with caching, throttling and retries.

    Each request is looked up in the module-level ``translation_cache`` first;
    on a miss the text is sent through ``GoogleTranslator`` after waiting on a
    ``RateLimiter`` configured with REQUESTS_PER_SECOND.
    """

    def __init__(self):
        self.translator = GoogleTranslator(source='en', target='hi')
        self.rate_limiter = RateLimiter(REQUESTS_PER_SECOND)
        # Opened by initialize(); translate_text() does not use it.
        self.session = None

    async def initialize(self):
        """Open the aiohttp session if none is currently open."""
        if not self.session:
            self.session = aiohttp.ClientSession()

    async def close(self):
        """Close the aiohttp session, if any."""
        if self.session:
            await self.session.close()
            # Forget the closed session so a later initialize() opens a new one.
            self.session = None

    async def translate_text(self, text):
        """Return the translation of ``text``.

        A cached translation is returned directly. Otherwise up to MAX_RETRIES
        attempts are made, sleeping ``2 ** attempt`` seconds plus random jitter
        between failures. If the last attempt fails, the error is printed and
        the original ``text`` is returned untranslated. Returns ``None`` if the
        translator itself returns ``None`` (or if MAX_RETRIES is 0).
        """
        cached = translation_cache.get(text)
        # Compare against None so a cached falsy value (e.g. "") counts as a hit.
        if cached is not None:
            return cached

        for attempt in range(MAX_RETRIES):
            try:
                await self.rate_limiter.acquire()
                # NOTE(review): translate() is called without await, so it runs
                # synchronously inside this coroutine; presumably it blocks the
                # event loop for the duration of the request, making the tasks
                # of a batch run one after another. Confirm the translator is
                # safe to share across threads before moving it to an executor.
                translation = self.translator.translate(text)
                # A None result would read back as a cache miss, so storing it
                # would only take up a slot.
                if translation is not None:
                    translation_cache.put(text, translation)
                return translation
            except Exception as e:
                if attempt == MAX_RETRIES - 1:
                    print(f"Failed to translate after {MAX_RETRIES} attempts: {e}")
                    # Best-effort fallback: the caller receives the source text.
                    return text
                await asyncio.sleep(2 ** attempt + random.random())

# Batch processor
async def process_batch(translator_service, batch):
    """Translate every text in ``batch`` and return the results in input order.

    One task per text is scheduled on the event loop via
    ``translator_service.translate_text`` and all of them are awaited together.
    """
    pending = [
        asyncio.create_task(translator_service.translate_text(item))
        for item in batch
    ]
    results = await asyncio.gather(*pending)
    return results

# Save translations
def save_translations(stories, chunk_num, batch_num):
    """Write translated stories to a text file in SAVE_DIR.

    The file name encodes ``chunk_num`` and ``batch_num``; an existing file of
    the same name is overwritten. Each non-empty story is written followed by
    the "<अंत>" end marker and a newline. The printed count is ``len(stories)``,
    which includes any empty entries that were skipped.
    """
    filepath = os.path.join(
        SAVE_DIR,
        f"tinystories_translatedHindi_chunk{chunk_num}_batch{batch_num}.txt",
    )
    with open(filepath, "w", encoding="utf-8") as out_file:
        out_file.writelines(story + "<अंत>\n" for story in stories if story)
    print(f"\nSaved {len(stories)} translations to {filepath}")

# Main translation function
async def translate_stories():
    """Interactively translate one chunk of the TinyStories train split.

    Steps:
      1. Load the dataset and ask which 50000-story chunk to process and at
         which offset inside that chunk to start (used to resume a run).
      2. Translate the stories in slices of BATCH_SIZE, dropping empty results.
      3. Write the accumulated translations and the translation cache to
         SAVE_DIR roughly every INTERMEDIATE_SAVE_FREQUENCY stories, and once
         more at the end for whatever remains.

    The loop stops starting new batches once RUNTIME_LIMIT has elapsed. An
    out-of-range chunk number or offset prints a message and returns without
    translating anything. The translation cache is saved on exit even if an
    error interrupts the run.
    """
    # Load dataset
    dataset = load_dataset("roneneldan/TinyStories", split="train")

    # Get chunk selection from user
    chunk_size = 50000
    num_chunks = (len(dataset) + chunk_size - 1) // chunk_size
    chunk_number = int(input(f"Enter the chunk number (1 to {num_chunks}): ")) - 1

    # Validate chunk number (chunk_number is zero-based from here on)
    if chunk_number < 0 or chunk_number >= num_chunks:
        print(f"Invalid chunk number. Must be between 1 and {num_chunks}")
        return

    # Select chunk
    start_index = chunk_number * chunk_size
    end_index = min((chunk_number + 1) * chunk_size, len(dataset))
    selected_chunk = dataset.select(range(start_index, end_index))
    print(f"Selected chunk length: {len(selected_chunk)} stories")

    # Get the starting offset within this chunk (to resume translation)
    offset_within_chunk = int(input(f"Enter the starting offset within this chunk (0-{len(selected_chunk) - 1}, e.g. 30000 to skip first 30k): "))

    # Validate offset
    if offset_within_chunk < 0 or offset_within_chunk >= len(selected_chunk):
        print(f"Invalid offset. Must be between 0 and {len(selected_chunk) - 1}")
        return

    # Apply offset to skip already processed items
    start_from_index = offset_within_chunk
    print(f"Will process from story #{start_from_index} to #{len(selected_chunk) - 1} in this chunk")

    # Calculate the actual batch number to start from
    # NOTE(review): save_translations() opens its file in write mode, so a file
    # from an earlier run that carries this same chunk/batch number is
    # overwritten when this run saves - confirm that is intended on resume.
    starting_batch_number = (offset_within_chunk // INTERMEDIATE_SAVE_FREQUENCY) + 1

    # Load cache
    load_cache()

    # Initialize translator service
    translator_service = TranslatorService()
    await translator_service.initialize()

    try:
        start_time = datetime.datetime.now()
        current_batch = []
        batch_number = starting_batch_number

        with tqdm(total=len(selected_chunk) - start_from_index, desc="Translating") as pbar:
            for i in range(start_from_index, len(selected_chunk), BATCH_SIZE):
                if datetime.datetime.now() - start_time > RUNTIME_LIMIT:
                    print("\nRuntime limit reached. Saving progress...")
                    break

                # Slice the rows first and then take the column, so only this
                # batch is read instead of the whole 'text' column each time.
                batch = selected_chunk[i:i + BATCH_SIZE]['text']
                translations = await process_batch(translator_service, batch)
                current_batch.extend(t for t in translations if t)
                pbar.update(len(batch))

                # Calculate how many stories we've processed in this resumed session
                stories_processed = i - start_from_index + len(batch)

                # Save when enough translations have accumulated, or when the
                # session count has just passed a multiple of the save
                # frequency (the remainder is then smaller than one batch).
                if len(current_batch) >= INTERMEDIATE_SAVE_FREQUENCY or stories_processed % INTERMEDIATE_SAVE_FREQUENCY < BATCH_SIZE:
                    save_translations(current_batch, chunk_number + 1, batch_number)
                    save_cache()
                    batch_number += 1
                    current_batch = []

        # Save remaining translations
        if current_batch:
            save_translations(current_batch, chunk_number + 1, batch_number)

    finally:
        try:
            await translator_service.close()
        finally:
            # Persist the cache even if closing the session fails.
            save_cache()

# Run the translation
# Drives translate_stories() to completion on the current event loop. If that
# loop is already running (presumably the case inside a notebook kernel), this
# depends on the nest_asyncio.apply() call made earlier in this cell.
loop = asyncio.get_event_loop()
loop.run_until_complete(translate_stories())

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

Selected chunk length: 50000 stories
Will process from story #39999 to #49999 in this chunk
Loaded 10000 cached translations


Translating: 100%|██████████| 10001/10001 [3:33:21<00:00,  1.28s/it]


Saved 9990 translations to /content/drive/My Drive/my_data/tinystories_translatedHindi_chunk22_batch4.txt


Translating: 100%|██████████| 10001/10001 [3:33:22<00:00,  1.28s/it]

Saved 10000 translations to cache





Saved 10000 translations to cache
