In [None]:
!sudo apt install aria2

In [None]:
%pip install datasets ctranslate2

In [None]:
import datasets

In [None]:
!pip install -U datasets

In [None]:
# Load the "ShareGPT4V" configuration of the Lin-Chen/ShareGPT4V dataset from the Hugging Face Hub.
dataset = datasets.load_dataset("Lin-Chen/ShareGPT4V", "ShareGPT4V")

In [None]:
# Peek at the first training record to see which fields each example carries.
dataset["train"][0]

In [None]:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import datasets
from openai import OpenAI
import time
import json
import os
from tqdm import tqdm
import copy
import random
import threading

def translate_dataset(
    dataset_name=None,
    dataset=None,
    target_language="Spanish",
    api_key=None,
    site_url=None,
    site_name=None,
    max_examples=None,
    start_idx=0,
    checkpoint_file="translation_checkpoint.json",
    max_retries=5,
    num_threads=4  # Number of worker threads issuing translation requests
):
    """
    Translate the text of a dataset through the OpenRouter chat API, with
    resumable checkpoints.

    Examples that contain a "conversations" list get the "value" of every
    message translated (each message is rebuilt with only "from" and "value").
    For any other example, every top-level string field is translated.

    Parameters:
    - dataset_name: Name passed to `datasets.load_dataset`. When given, it takes
      precedence over `dataset`.
    - dataset: A `datasets.Dataset` (wrapped as a single 'train' split), a
      `datasets.DatasetDict`, or any mapping of split name to an indexable
      sequence of example dicts.
    - target_language: Language named in the translation prompt.
    - api_key: OpenRouter API key (required).
    - site_url: Optional value for the "HTTP-Referer" request header.
    - site_name: Optional value for the "X-Title" request header.
    - max_examples: Maximum number of examples to translate per split in this
      call, counted from the resume position. None means no limit; 0 means
      translate nothing.
    - start_idx: Index to start from when the checkpoint has no entry for a split.
    - checkpoint_file: Path of the JSON checkpoint, shaped as
      {split: {"current_idx": int, "examples": [...]}}.
    - max_retries: Number of retries per request after the first failed attempt.
    - num_threads: Number of worker threads.

    Returns:
    - Translated dataset as a DatasetDict. Each split holds the examples stored
      in the checkpoint for that split, including those from earlier runs.

    Raises:
    - ValueError: If `api_key` is missing, or neither `dataset` nor
      `dataset_name` is given.

    Notes:
    - Examples are committed to the checkpoint strictly in dataset order, so
      "current_idx" always means "every example before this index is stored".
      Results that finish out of order are held in memory until the gap before
      them closes; if the run is interrupted they are translated again on resume.
    - When a request still fails after `max_retries` retries, the ORIGINAL
      (untranslated) text is kept for that field and a message is printed.
    """
    if not api_key:
        raise ValueError("Please provide your OpenRouter API key")

    if dataset is None and dataset_name is None:
        raise ValueError("Please provide either a dataset or dataset_name")

    # Initialize the OpenRouter client
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key,
    )

    # Extra headers for OpenRouter
    extra_headers = {}
    if site_url:
        extra_headers["HTTP-Referer"] = site_url
    if site_name:
        extra_headers["X-Title"] = site_name

    # Load the dataset if name is provided
    if dataset_name:
        print(f"Loading dataset {dataset_name}...")
        dataset = datasets.load_dataset(dataset_name)

    if isinstance(dataset, datasets.Dataset):
        dataset = datasets.DatasetDict({'train': dataset})

    print(f"Dataset loaded with splits: {', '.join(dataset.keys())}")

    def write_checkpoint(checkpoint_data):
        """Write the checkpoint to a temp file and swap it in, so an
        interrupted write never leaves a truncated checkpoint behind."""
        tmp_path = f"{checkpoint_file}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(checkpoint_data, f)
        os.replace(tmp_path, checkpoint_file)

    def translate_with_retry(content, max_wait=60):
        """Translate one piece of text, retrying with exponential backoff."""
        # Nothing to translate: sending blank text only invites made-up output.
        if not isinstance(content, str) or not content.strip():
            return content

        prompt = f"Translate the following text to {target_language}. Preserve any formatting, keep <image> tags unchanged, and maintain the structure of the text, PROVIDE JUST THE TRANSLATION, DO NOT ADD ANYTHING:\n\n{content}"

        retries = 0
        while True:
            try:
                completion = client.chat.completions.create(
                    extra_headers=extra_headers,
                    model="meta-llama/llama-4-scout",
                    messages=[
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ]
                )
                return completion.choices[0].message.content
            except Exception as e:
                if retries >= max_retries:
                    # Best effort: keep the untranslated text rather than fail the run.
                    print(f"Max retries exceeded. Error: {str(e)}")
                    return content
                # Exponential backoff with jitter, capped at max_wait seconds.
                wait_time = min(2 ** retries + random.random(), max_wait)
                print(f"API error: {str(e)}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
                retries += 1

    def process_example(args):
        """Translate one example; runs on a worker thread."""
        split_name, idx, example = args
        translated_example = copy.deepcopy(example)

        if "conversations" in example:
            translated_conversations = []
            for message in example["conversations"]:
                translated_conversations.append({
                    "from": message["from"],
                    "value": translate_with_retry(message["value"])
                })
            translated_example["conversations"] = translated_conversations
        else:
            for key, value in example.items():
                if isinstance(value, str):
                    translated_example[key] = translate_with_retry(value)

        return split_name, idx, translated_example

    # Load checkpoint if exists
    checkpoint = {}
    if os.path.exists(checkpoint_file):
        print(f"Loading checkpoint from {checkpoint_file}")
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)

    translated_datasets = {}

    for split in dataset:
        print(f"\nProcessing {split} split...")

        # Create the split entry up front so a split with no work left (or an
        # empty split) still yields a dataset instead of a KeyError.
        split_state = checkpoint.setdefault(split, {})
        current_idx = split_state.setdefault("current_idx", start_idx)
        translated_examples = split_state.setdefault("examples", [])

        print(f"Starting from example {current_idx}")
        if translated_examples:
            print(f"Found {len(translated_examples)} previously translated examples")

        end_idx = len(dataset[split])
        if max_examples is not None:
            end_idx = min(current_idx + max_examples, end_idx)

        # Prepare tasks for parallel processing
        tasks = [(split, idx, dataset[split][idx])
                for idx in range(current_idx, end_idx)]

        # Finished examples wait here until every lower index has been committed.
        pending = {}
        next_idx = current_idx

        # Process examples in parallel. Checkpoint state is only touched on this
        # (the consuming) thread, so it needs no lock.
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(process_example, task) for task in tasks]

            try:
                for future in tqdm(
                    concurrent.futures.as_completed(futures),
                    total=len(tasks),
                    desc=f"Translating {split}"
                ):
                    _, done_idx, translated_example = future.result()
                    pending[done_idx] = translated_example

                    # Commit the contiguous run starting at next_idx, if any.
                    committed = False
                    while next_idx in pending:
                        translated_examples.append(pending.pop(next_idx))
                        next_idx += 1
                        committed = True

                    if committed:
                        split_state["current_idx"] = next_idx
                        write_checkpoint(checkpoint)
            except BaseException:
                # On Ctrl-C or a worker error, drop the queued tasks; otherwise
                # the executor would run all of them and discard the results.
                for future in futures:
                    future.cancel()
                raise

        # Create dataset from the checkpointed examples
        translated_datasets[split] = datasets.Dataset.from_list(translated_examples)

    return datasets.DatasetDict(translated_datasets)

def load_from_checkpoint(checkpoint_file="translation_checkpoint.json"):
    """
    Rebuild the translated dataset from a checkpoint file, without translating.

    Parameters:
    - checkpoint_file: Path to the JSON checkpoint; each top-level key is a
      split whose "examples" list holds the translated records

    Returns:
    - DatasetDict with one Dataset per split found in the checkpoint (a split
      without an "examples" key becomes an empty Dataset)

    Raises:
    - FileNotFoundError: If the checkpoint file does not exist
    """
    checkpoint_missing = not os.path.exists(checkpoint_file)
    if checkpoint_missing:
        raise FileNotFoundError(f"Checkpoint file {checkpoint_file} not found")

    print(f"Loading translations from checkpoint: {checkpoint_file}")
    with open(checkpoint_file, 'r') as handle:
        saved_state = json.load(handle)

    splits = {}
    for split_name in saved_state:
        rows = saved_state[split_name].get("examples", [])
        print(f"Found {len(rows)} examples for {split_name} split")
        splits[split_name] = datasets.Dataset.from_list(rows)

    return datasets.DatasetDict(splits)

# Example usage
# if __name__ == "__main__":
#     import argparse

#     parser = argparse.ArgumentParser(description="Translate the ShareGPT4V dataset")
#     parser.add_argument("--api_key", type=str, required=True, help="OpenRouter API key")
#     parser.add_argument("--target_language", type=str, default="Spanish", help="Target language for translation")
#     parser.add_argument("--site_url", type=str, help="Your site URL for OpenRouter rankings")
#     parser.add_argument("--site_name", type=str, help="Your site name for OpenRouter rankings")
#     parser.add_argument("--max_examples", type=int, help="Maximum number of examples to translate")
#     parser.add_argument("--start_idx", type=int, default=0, help="Index to start from")
#     parser.add_argument("--checkpoint_file", type=str, default="translation_checkpoint.json", help="Checkpoint file path")
#     parser.add_argument("--output_dir", type=str, default="translated_ShareGPT4V", help="Output directory for translated dataset")
#     parser.add_argument("--load_only", action="store_true", help="Only load from checkpoint, don't translate")

#     args = parser.parse_args()

#     if args.load_only:
#         # Just load from checkpoint
#         translated_dataset = load_from_checkpoint(args.checkpoint_file)
#     else:
#         # Translate the dataset
#         translated_dataset = translate_dataset(
#             target_language=args.target_language,
#             api_key=args.api_key,
#             site_url=args.site_url,
#             site_name=args.site_name,
#             max_examples=args.max_examples,
#             start_idx=args.start_idx,
#             checkpoint_file=args.checkpoint_file
#         )

#     # Save the translated dataset
#     print(f"Saving translated dataset to {args.output_dir}")
#     translated_dataset.save_to_disk(args.output_dir)
#     print("Done!")

In [None]:
# from google.colab import userdata

# ds = translate_dataset(
#     target_language="Indonesian",
#     api_key=userdata.get('OPENROUTER_API_KEY'),
#     site_url=None,
#     site_name=None,
#     max_examples=1,
#     start_idx=0,
#     checkpoint_file="translation_checkpoint.json",
# )

# ds

In [None]:
# ds["train"]["conversations"]

In [None]:
%mkdir /content/dataset

In [None]:
# Base directory for the downloaded dataset files.
# NOTE(review): no other cell shown here reads BASE_PATH; the shell commands
# hard-code /content/dataset instead.
BASE_PATH = "/content/dataset"

In [None]:
%mkdir /content/dataset/sharegpt4v

In [None]:
%cd /content
!aria2c -x 2 --auto-file-renaming=false https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/share-captioner_coco_lcs_sam_1246k_1107.json?download=true -o dataset/sharegpt4v/share-captioner_coco_lcs_sam_1246k_1107.json
!aria2c -x 2 --auto-file-renaming=false https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json?download=true -o dataset/sharegpt4v/sharegpt4v_instruct_gpt4-vision_cap100k.json
!aria2c -x 2 --auto-file-renaming=false https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json?download=true -o dataset/sharegpt4v/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json

In [None]:
!wget https://files.catbox.moe/wdlcqp.parquet

In [None]:
# Read the downloaded parquet file with the generic "parquet" loader; the cells
# below access its rows through the "train" split.
dp = datasets.load_dataset("parquet", data_files="wdlcqp.parquet")

In [None]:
# Preview a few image paths instead of printing the whole column.
dp["train"]["image_path"][:5]

In [None]:
# Inspect the first row to see the source schema that transform_format reads.
dp["train"][0]

In [None]:
def transform_format(old_entry):
    """
    Convert one source row into the ShareGPT-style conversation layout.

    Parameters:
    - old_entry: Mapping with 'id', 'image_path', 'human_messages' and
      'assistant_messages'; only the first element of each message list is used

    Returns:
    - Dict with 'id', 'image' and a two-message 'conversations' list
      (a 'human' turn followed by a 'gpt' turn)
    """
    entry_id = old_entry['id']
    image = old_entry['image_path']
    question = old_entry['human_messages'][0]
    answer = old_entry['assistant_messages'][0]

    first_exchange = (('human', question), ('gpt', answer))
    return {
        'id': entry_id,
        'image': image,
        'conversations': [
            {'from': speaker, 'value': text} for speaker, text in first_exchange
        ],
    }

In [None]:
# Convert every row of every split into the conversation layout; the result is a
# plain dict mapping split name -> list of transformed rows.
new_dataset = dict(
    (split_name, list(map(transform_format, rows)))
    for split_name, rows in dp.items()
)

In [None]:
# Preview the first two transformed rows of each split instead of printing every row.
{split: rows[:2] for split, rows in new_dataset.items()}

In [None]:
# Colab's secret store supplies the API key so it does not appear in the notebook.
from google.colab import userdata

# Translate every split of `new_dataset` to Indonesian. If translation_checkpoint.json
# already exists, translate_dataset resumes from the index recorded in it.
# NOTE(review): num_threads=128 allows up to 128 concurrent API requests — confirm
# the account's rate limits permit this.
translated_ds = translate_dataset(
    dataset=new_dataset,
    target_language="Indonesian",
    api_key=userdata.get('OPENROUTER_API_KEY'),
    site_url=None,
    site_name=None,
    start_idx=0,
    checkpoint_file="translation_checkpoint.json",
    num_threads=128
)

Dataset loaded with splits: train
Loading checkpoint from translation_checkpoint.json

Processing train split...
Starting from example 20000
Found 20000 previously translated examples


Translating train: 0it [00:00, ?it/s]


In [None]:
# Summary of the translated train split (features and row count).
translated_ds["train"]

Dataset({
    features: ['id', 'image', 'conversations'],
    num_rows: 20000
})

In [None]:
# Upload the checkpoint JSON to catbox.moe as a backup; the command prints the resulting URL.
# NOTE(review): this publishes the file on a third-party host — confirm that is intended.
!curl -F "reqtype=fileupload" -F "fileToUpload=@translation_checkpoint.json" https://catbox.moe/user/api.php

https://files.catbox.moe/lzi8zt.json

In [None]:
# Spot-check the last translated example.
translated_ds["train"][-1]

{'id': '000000183360',
 'image': 'coco/train2017/000000183360.jpg',
 'conversations': [{'from': 'human',
   'value': '<image>\nApa yang digambarkan dalam foto ini?'},
  {'from': 'gpt',
   'value': 'Gambar tersebut menangkap sebuah momen, menampilkan sebuah lokomotif uap hitam yang megah dengan balok penyangga merah cerah. Lokomotif, dengan bangga menampilkan nomor 30587 dalam warna putih, diposisikan di atas jalur kereta api. Ini adalah sebuah adegan aktif, dengan lokomotif mengeluarkan semburan uap dari cerobong asapnya, menunjukkan bahwa lokomotif itu sedang bersiap untuk berangkat atau baru saja tiba.\n\nLokomotif menghadap ke sisi kanan gambar, seolah-olah siap untuk memulai perjalanan ke tempat yang tidak diketahui. Latar belakang menyediakan kontras yang tenang dengan lokomotif industri, menampilkan area berhutan yang lebat dengan pohon-pohon dan semak-semak. Kehijauan hutan dan warna hitam dan merah lokomotif menciptakan kontras yang mencolok, menambahkan kedalaman dan minat pad

In [None]:
# Export the translated train split to a parquet file.
# The displayed integer is presumably the number of bytes written — confirm against the datasets docs.
translated_ds["train"].to_parquet("datasettt.parquet")

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

22719464

In [None]:
# Upload the exported parquet file to catbox.moe; the command prints the resulting URL.
# NOTE(review): this publishes the file on a third-party host — confirm that is intended.
!curl -F "reqtype=fileupload" -F "fileToUpload=@datasettt.parquet" https://catbox.moe/user/api.php

https://files.catbox.moe/hfawow.parquet