### Azure Translation API

In [None]:
# !pip install datasets
# !pip install fsspec==2023.9.2

In [None]:
import requests, uuid, json

# Add your Translator resource key and endpoint.
#
# The location is also known as the region. It is required if you're using a
# multi-service or regional (not global) resource, and it can be found in the
# Azure portal on the Keys and Endpoint page.

def translate_text(text, source_lang='en', target_lang='kn', timeout=30):
    """Translate ``text`` with the Azure Translator REST API (version 3.0).

    Credentials are read from environment variables so that they never have
    to be pasted into the notebook:

    * ``AZURE_TRANSLATOR_KEY``: subscription key of the Translator resource.
    * ``AZURE_TRANSLATOR_ENDPOINT``: resource endpoint; defaults to the
      global Translator endpoint.
    * ``AZURE_TRANSLATOR_REGION``: location (region) of the resource; only
      needed for multi-service or regional (not global) resources.

    Args:
        text: The string to translate.
        source_lang: Language code of ``text`` (default ``'en'``).
        target_lang: Language code to translate into (default ``'kn'``).
        timeout: Seconds to wait for the service before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The translated string, or ``None`` if the request or the parsing of
        its response failed (a message describing the failure is printed).
    """
    import os  # local import keeps this cell self-contained

    key = os.environ.get('AZURE_TRANSLATOR_KEY', '')
    endpoint = os.environ.get(
        'AZURE_TRANSLATOR_ENDPOINT',
        'https://api.cognitive.microsofttranslator.com',
    )
    location = os.environ.get('AZURE_TRANSLATOR_REGION', '')
    # Tolerate an endpoint configured with a trailing slash.
    constructed_url = endpoint.rstrip('/') + '/translate'

    params = {
        'api-version': '3.0',
        'from': source_lang,
        'to': target_lang
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }
    # The region header is only meaningful for multi-service/regional
    # resources, so it is omitted when no region is configured.
    if location:
        headers['Ocp-Apim-Subscription-Region'] = location

    body = [{
        'text': text
    }]

    try:
        response = requests.post(
            constructed_url,
            params=params,
            headers=headers,
            json=body,
            timeout=timeout,
        )
        # Surface HTTP errors (bad key, throttling, ...) with their status
        # instead of failing later on an unexpected JSON shape.
        response.raise_for_status()
        return response.json()[0]['translations'][0]['text']
    except Exception as e:
        # Deliberately best-effort: one failed text must not abort a whole
        # batch, but the reason is reported so failures can be diagnosed.
        print(f"Translation failed for text: {text} ({type(e).__name__}: {e})")
        return None
# Smoke-test the translator on one English paragraph. The tokenised form of the
# text (spaces around punctuation, the literal "<unk>" and "@-@" tokens) is
# kept as-is. The previous literal contained stray backslashes, which Python
# treats as invalid escape sequences, and a truncated duplicate of the paragraph.
sample_text = (
    'Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . '
    'Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III '
    'outside Japan , is a tactical role @-@ playing video game developed by Sega and '
    'Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , '
    'it is the third game in the Valkyria series . Employing the same fusion of '
    'tactical and real @-@ time gameplay as its predecessors , the story runs parallel '
    'to the first game and follows the " Nameless " , a penal military unit serving '
    'the nation of Gallia during the Second Europan War who perform secret black '
    'operations and are pitted against the Imperial unit " <unk> Raven " .'
)
translated_response = translate_text(sample_text)
print(translated_response)



In [None]:
import concurrent.futures
import random

import pandas as pd
from datasets import Dataset, load_dataset
from tqdm import tqdm

In [None]:
# Identifier handed to load_dataset; named so it is easy to spot and swap.
DATASET_ID = "CognitiveLab/Project_K_TrainDataset_500k"
kannada_dataset = load_dataset(DATASET_ID)

In [None]:
# Bare last expression: the notebook displays the loaded dataset object so it
# can be inspected before sampling from it.
kannada_dataset

In [None]:
# Draw a small, reproducible sample of texts from the 'test' split.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42

# list() materialises the column so it can be sampled as a plain sequence.
test_texts = list(kannada_dataset['test']['text'])

# A dedicated seeded generator returns the same sample on every
# "Restart & Run All" without touching the global `random` state. min() avoids
# the ValueError random.sample raises when fewer rows exist than requested.
random_samples = random.Random(SAMPLE_SEED).sample(
    test_texts, min(SAMPLE_SIZE, len(test_texts))
)
random_samples

In [None]:
def translate_batch(batch, max_workers=None):
    """Translate every text in ``batch`` concurrently with ``translate_text``.

    Args:
        batch: Iterable of texts to translate. It is materialised into a list
            first so that generators and other non-sized iterables work too.
        max_workers: Upper bound on the number of worker threads. ``None``
            (the default) keeps ``ThreadPoolExecutor``'s own default; pass a
            small number to stay under the translation service's rate limits.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        whatever ``translate_text`` returned for that text.
    """
    texts = list(batch)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order; tqdm only reports progress.
        results = executor.map(translate_text, texts)
        english_translations = list(tqdm(results, total=len(texts), desc="Translating"))
    return english_translations

# Split the random samples into batches for parallel processing
batch_size = 10
sample_batches = [random_samples[i:i + batch_size] for i in range(0, len(random_samples), batch_size)]

# Translate each batch in parallel and keep only the pairs that succeeded.
# 'src' holds the original text and 'tgt' its translation; the language pair
# is whatever translate_text requests (it sends 'from': 'en', 'to': 'kn').
translated_data = {'src': [], 'tgt': []}
failed_count = 0

for batch in tqdm(sample_batches, desc="Translating and Saving"):
    batch_translations = translate_batch(batch)

    for source_text, translated_text in zip(batch, batch_translations):
        # translate_text returns None on failure; such pairs are left out.
        if translated_text is None:
            failed_count += 1
            continue
        translated_data['src'].append(source_text)
        translated_data['tgt'].append(translated_text)

# Make dropped rows visible so the size of the saved file is not a surprise.
if failed_count:
    print(f"Skipped {failed_count} of {len(random_samples)} texts whose translation failed.")

# Create a DataFrame from the translated data
translated_df = pd.DataFrame(translated_data)

# Save the translated dataset to disk
translated_df.to_csv('./translated_dataset.csv', index=False)

In [None]:
# Placeholders: replace both with your Hugging Face username and the name the
# uploaded dataset should get before running this cell.
hf_username = 'your_username'
hf_dataset_name = 'your_translated_dataset'

# Convert the translated pairs into a Hugging Face Dataset. The pandas index
# carries no information here, so it is explicitly kept out of the dataset.
translated_dataset = Dataset.from_pandas(translated_df, preserve_index=False)

# Keep a local copy, then upload under "<username>/<dataset name>".
# NOTE(review): the upload presumably needs an authenticated Hugging Face
# session (e.g. `huggingface-cli login`) - confirm before running.
translated_dataset.save_to_disk('./huggingface_dataset')
translated_dataset.push_to_hub(f"{hf_username}/{hf_dataset_name}")