In [14]:
from google.cloud import translate
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
def translate_text_with_model(text: str, client=None) -> str:
    """Translate Arabic text to English with the Cloud Translation NMT model.

    Args:
        text: Arabic text to translate. UTF-8 encoded ``bytes`` are decoded
            before the request is sent.
        client: Optional object exposing the Basic (v2) ``translate`` method.
            When omitted, a ``translate_v2.Client`` is created on the first
            call and reused by later calls.

    Returns:
        The ``translatedText`` field of the API result, i.e. the English
        translation as a string.
    """
    if client is None:
        # This function is applied row by row, so build the client once and
        # keep it on the function object instead of re-creating it per call.
        client = getattr(translate_text_with_model, "_client", None)
        if client is None:
            # `Client` belongs to the v2 (Basic) API. The notebook-level
            # `translate` module is used for `TranslationServiceClient` (v3),
            # so import the v2 module explicitly here.
            from google.cloud import translate_v2

            client = translate_v2.Client()
            translate_text_with_model._client = client

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    result = client.translate(
        text,
        source_language="ar",
        target_language="en",
        model="nmt",
    )
    return result["translatedText"]


In [3]:
# Smoke test: send one short Arabic phrase through the translation helper.
translate_text_with_model("مرحبا بالعالم")

'Hello World'

In [11]:
# Load the sentence pairs and replace whatever index the parquet file carried
# with a fresh 0..N-1 range. `drop=True` discards the old index directly,
# instead of materialising it as an 'index' column and dropping it afterwards
# (which also fails when the stored index has a name).
pairs_200 = pd.read_parquet('data/ar-en-200.parquet').reset_index(drop=True)

In [12]:
# Ten consecutive positional row ranges of `pairs_200`, keyed 0..9.
# Batches 0-8 hold 100,000 rows each; batch 9 is open-ended and takes every
# row from position 900,000 to the end of the frame.
batches = {
    k: pairs_200.iloc[k * 100_000:(None if k == 9 else (k + 1) * 100_000), :]
    for k in range(10)
}

In [6]:
def translator(idx) -> pd.DataFrame:
    """Translate the 'ar' column of one batch and store the result on it.

    Each value of ``batches[idx]['ar']`` is passed to
    ``translate_text_with_model`` with a tqdm progress bar (``progress_apply``
    is available because ``tqdm.pandas()`` is registered at import time).

    Args:
        idx: Key of the batch in the module-level ``batches`` dict.

    Returns:
        The batch with an added ``en_translated`` column. The same frame is
        also stored back into ``batches[idx]``.
    """
    batch = batches[idx]
    translated = batch['ar'].progress_apply(translate_text_with_model)
    # `assign` builds a new frame rather than writing a column onto an `iloc`
    # slice of `pairs_200`, which would raise chained-assignment warnings.
    batches[idx] = batch.assign(en_translated=translated)
    return batches[idx]

In [19]:
# Rows per chunk.
n = 10_000
# Cut the frame into consecutive chunks of at most `n` rows; the final chunk
# holds whatever is left over.
pairs_split = [
    pairs_200[lo:lo + n]
    for lo in range(0, len(pairs_200), n)
]

In [27]:
# Sanity check: how many chunks of at most 10,000 rows were produced.
len(pairs_split)

99

In [23]:
# Write the 'ar' column of every chunk to batches/ar_<idx>.tsv as headerless,
# tab-separated "<row index>\t<text>" lines.
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can draw a real progress bar.
for idx, split in enumerate(tqdm(pairs_split)):
    split['ar'].to_csv(f'batches/ar_{idx}.tsv', index=True, header=False, sep='\t')

99it [00:05, 18.89it/s]


In [35]:
from google.cloud import translate


def batch_translate_text(
    input_uris,
    output_uri: str = "gs://wiki_matrix_translated/",
    project_id: str = "neurofy-403605",
    timeout=None,
    client=None,
) -> "translate.BatchTranslateResponse":
    """Run an Arabic-to-English batch translation over files stored on GCS.

    Args:
        input_uris: One ``gs://`` URI, or an iterable of URIs, naming the
            input files. Each file is submitted as ``text/plain``.
        output_uri: GCS prefix under which the service writes its output.
        project_id: Project ID used to build the request's parent resource
            path.
        timeout: Value forwarded to ``operation.result``; ``None`` (the
            default) keeps the original behaviour of passing no deadline.
        client: Optional pre-built client exposing ``batch_translate_text``.
            When omitted, a ``translate.TranslationServiceClient`` is created.

    Returns:
        The response of the finished long-running operation. It reports the
        character counts; the translated text itself is written under
        ``output_uri`` rather than returned.
    """
    if client is None:
        client = translate.TranslationServiceClient()

    # A bare string would otherwise be iterated character by character.
    if isinstance(input_uris, str):
        input_uris = [input_uris]

    location = "us-central1"
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats

    input_configs_elements = [
        {
            "gcs_source": {"input_uri": input_uri},
            "mime_type": "text/plain",  # Can be "text/plain" or "text/html".
        }
        for input_uri in input_uris
    ]

    output_config = {"gcs_destination": {"output_uri_prefix": output_uri}}
    parent = f"projects/{project_id}/locations/{location}"

    # Supported language codes: https://cloud.google.com/translate/docs/languages
    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": "ar",
            "target_language_codes": ["en"],  # Up to 10 language codes here.
            "input_configs": input_configs_elements,
            "output_config": output_config,
        }
    )

    print("Waiting for operation to complete...")
    # Blocks until the long-running operation finishes.
    response = operation.result(timeout=timeout)

    print(f"Total Characters: {response.total_characters}")
    print(f"Translated Characters: {response.translated_characters}")

    return response


In [36]:
# URIs of batches 0-49 (named `done`; presumably translated in an earlier run).
done = [f"gs://wiki_matrix_batches/ar_{idx}.tsv" for idx in range(50)]

# URIs of all 99 batch files. Named `all_uris` so the builtin `all()` is not
# shadowed for the rest of the kernel session.
all_uris = [f"gs://wiki_matrix_batches/ar_{idx}.tsv" for idx in range(99)]

# Everything that is not in `done`, in the original order.
remaining = [uri for uri in all_uris if uri not in done]

In [39]:
# Sanity check: number of batch files still to be submitted.
len(remaining)

49

In [40]:
# Submit the outstanding files in one batch job; the returned response is the
# cell's displayed value.
batch_translate_text(input_uris=remaining)

Waiting for operation to complete...
Total Characters: 58740228
Translated Characters: 58740228


total_characters: 58740228
translated_characters: 58740228
submit_time {
  seconds: 1698688065
}
end_time {
  seconds: 1698688586
}