In [3]:
import tritonclient.http as http_client
from tritonclient.utils import *
import numpy as np

# Connection settings for the Triton translation endpoint.
ENABLE_SSL = False
ENDPOINT_URL = '52.151.255.217:8000'
HTTP_HEADERS = {"Authorization": "Bearer __PASTE_KEY_HERE__"}

# Build the HTTP client. The plain-HTTP case comes first; the SSL case
# additionally passes gevent's default HTTPS context factory.
if not ENABLE_SSL:
    triton_http_client = http_client.InferenceServerClient(
        url=ENDPOINT_URL,
        verbose=False,
    )
else:
    import gevent.ssl
    triton_http_client = http_client.InferenceServerClient(
        url=ENDPOINT_URL,
        verbose=False,
        ssl=True,
        ssl_context_factory=gevent.ssl._create_default_https_context,
    )

# Readiness probe, sent with the same auth headers used for inference.
print(f"Is server ready - {triton_http_client.is_server_ready(headers=HTTP_HEADERS)}")

def get_string_tensor(string_values, tensor_name):
    """Wrap Python string values in a Triton ``InferInput``.

    Args:
        string_values: (Nested) list of strings; it is converted to a NumPy
            object array whose shape becomes the tensor shape.
        tensor_name: Name given to the created input tensor.

    Returns:
        The ``InferInput`` with the array data already attached.
    """
    payload = np.array(string_values, dtype=object)
    triton_dtype = np_to_triton_dtype(payload.dtype)
    tensor = http_client.InferInput(tensor_name, payload.shape, triton_dtype)
    tensor.set_data_from_numpy(payload)
    return tensor

def get_translation_input_for_triton(texts: list, src_lang: str, tgt_lang: str):
    """Build the three string input tensors for one translation request.

    Every text becomes its own single-element row, and the source and target
    language codes are repeated once per text.

    Returns:
        A list of tensors in the order INPUT_TEXT, INPUT_LANGUAGE_ID,
        OUTPUT_LANGUAGE_ID.
    """
    columns = {
        "INPUT_TEXT": [[sentence] for sentence in texts],
        "INPUT_LANGUAGE_ID": [[src_lang]] * len(texts),
        "OUTPUT_LANGUAGE_ID": [[tgt_lang]] * len(texts),
    }
    # Dicts preserve insertion order, so the tensor order matches the keys above.
    return [get_string_tensor(rows, name) for name, rows in columns.items()]

# Sample English sentences used to smoke-test the endpoint
input_sentences = [
    "When I was young, I used to go to the park every day.",
    "He has many old books, which he inherited from his ancestors.",
    "I can't figure out how to solve my problem.",
    "She is very hardworking and intelligent, which is why she got all the good marks.",
    "We watched a new movie last week, which was very inspiring.",
    "If you had met me at that time, we would have gone out to eat.",
    "She went to the market with her sister to buy a new sari.",
    "Raj told me that he is going to his grandmother's house next month.",
    "All the kids were having fun at the party and were eating lots of sweets.",
    "My friend has invited me to his birthday party, and I will give him a gift.",
]

# Request tensors ("en" -> "kn") and the single output tensor we want back
inputs = get_translation_input_for_triton(input_sentences, "en", "kn")
output0 = http_client.InferRequestedOutput("OUTPUT_TEXT")

# Run inference against version 1 of the "nmt" model
response = triton_http_client.infer(
    model_name="nmt",
    inputs=inputs,
    model_version='1',
    outputs=[output0],
    headers=HTTP_HEADERS,
)

# Each output row holds one UTF-8 encoded translation; print it under its source
output_batch = response.as_numpy('OUTPUT_TEXT').tolist()
for input_sentence, translation in zip(input_sentences, output_batch):
    print("", input_sentence, translation[0].decode("utf-8"), sep="\n")

Is server ready - True

When I was young, I used to go to the park every day.
ನಾನು ಚಿಕ್ಕವಳಿದ್ದಾಗ, ಪ್ರತಿದಿನ ಉದ್ಯಾನವನಕ್ಕೆ ಹೋಗುತ್ತಿದ್ದೆ.

He has many old books, which he inherited from his ancestors.
ಅವರು ತಮ್ಮ ಪೂರ್ವಜರಿಂದ ಆನುವಂಶಿಕವಾಗಿ ಪಡೆದ ಅನೇಕ ಹಳೆಯ ಪುಸ್ತಕಗಳನ್ನು ಹೊಂದಿದ್ದಾರೆ.

I can't figure out how to solve my problem.
ನನ್ನ ಸಮಸ್ಯೆಯನ್ನು ಹೇಗೆ ಪರಿಹರಿಸಬೇಕೆಂದು ನನಗೆ ಅರ್ಥವಾಗುತ್ತಿಲ್ಲ.

She is very hardworking and intelligent, which is why she got all the good marks.
ಅವಳು ತುಂಬಾ ಕಷ್ಟಪಟ್ಟು ದುಡಿಯುವವಳು ಮತ್ತು ಬುದ್ಧಿವಂತಳು, ಅದಕ್ಕಾಗಿಯೇ ಅವಳು ಎಲ್ಲಾ ಉತ್ತಮ ಅಂಕಗಳನ್ನು ಪಡೆದಳು.

We watched a new movie last week, which was very inspiring.
ನಾವು ಕಳೆದ ವಾರ ಹೊಸ ಚಲನಚಿತ್ರವೊಂದನ್ನು ನೋಡಿದೆವು, ಅದು ಬಹಳ ಸ್ಪೂರ್ತಿದಾಯಕವಾಗಿತ್ತು.

If you had met me at that time, we would have gone out to eat.
ಆ ಸಮಯದಲ್ಲಿ ನೀವು ನನ್ನನ್ನು ಭೇಟಿಯಾಗಿದ್ದರೆ, ನಾವು ತಿನ್ನಲು ಹೊರಗೆ ಹೋಗುತ್ತಿದ್ದೆವು.

She went to the market with her sister to buy a new sari.
ಆಕೆ ತನ್ನ ಸಹೋದರಿಯೊಂದಿಗೆ ಹೊಸ ಸೀರೆಯನ್ನು ಖರೀದಿಸಲು ಮಾರುಕಟ್ಟೆಗೆ ಹೋದಳು.

Raj told me that he is going to his grandmot

In [4]:
from huggingface_hub import notebook_login
from datasets import load_dataset


In [4]:
# Fetch the "Anthropic/hh-rlhf" dataset (all available splits).
dataset = load_dataset(path="Anthropic/hh-rlhf")

Downloading readme:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
# Display the loaded dataset's summary (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})

In [16]:
import tritonclient.http as http_client
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd


# Connection settings for the Triton translation endpoint.
ENABLE_SSL = False
ENDPOINT_URL = '52.151.255.217:8000'
HTTP_HEADERS = {"Authorization": "Bearer __PASTE_KEY_HERE__"}

# Build the HTTP client. The plain-HTTP case comes first; the SSL case
# additionally passes gevent's default HTTPS context factory.
if not ENABLE_SSL:
    triton_http_client = http_client.InferenceServerClient(
        url=ENDPOINT_URL,
        verbose=False,
    )
else:
    import gevent.ssl
    triton_http_client = http_client.InferenceServerClient(
        url=ENDPOINT_URL,
        verbose=False,
        ssl=True,
        ssl_context_factory=gevent.ssl._create_default_https_context,
    )

# Readiness probe, sent with the same auth headers used for inference.
print(f"Is server ready - {triton_http_client.is_server_ready(headers=HTTP_HEADERS)}")

def get_string_tensor(string_values, tensor_name):
    """Wrap Python string values in a Triton ``InferInput``.

    Args:
        string_values: (Nested) list of strings; it is converted to a NumPy
            object array whose shape becomes the tensor shape.
        tensor_name: Name given to the created input tensor.

    Returns:
        The ``InferInput`` with the array data already attached.
    """
    # This cell's imports do not bring in np_to_triton_dtype; previously the
    # name resolved only through the wildcard import in an earlier cell.
    # Importing it here keeps the helper usable when that cell was not run.
    from tritonclient.utils import np_to_triton_dtype

    string_obj = np.array(string_values, dtype="object")
    input_obj = http_client.InferInput(
        tensor_name, string_obj.shape, np_to_triton_dtype(string_obj.dtype)
    )
    input_obj.set_data_from_numpy(string_obj)
    return input_obj

def get_translation_input_for_triton(texts: list, src_lang: str, tgt_lang: str):
    """Build the three string input tensors for one translation request.

    Every text becomes its own single-element row, and the source and target
    language codes are repeated once per text.

    Returns:
        A list of tensors in the order INPUT_TEXT, INPUT_LANGUAGE_ID,
        OUTPUT_LANGUAGE_ID.
    """
    columns = {
        "INPUT_TEXT": [[sentence] for sentence in texts],
        "INPUT_LANGUAGE_ID": [[src_lang]] * len(texts),
        "OUTPUT_LANGUAGE_ID": [[tgt_lang]] * len(texts),
    }
    # Dicts preserve insertion order, so the tensor order matches the keys above.
    return [get_string_tensor(rows, name) for name, rows in columns.items()]

def translate_to_kannada(texts):
    """Translate a batch of texts from "en" to "kn" using the Triton "nmt" model.

    Args:
        texts: Sequence of source strings sent to the server in one request.

    Returns:
        A list of UTF-8 decoded strings, one per row of the ``OUTPUT_TEXT``
        tensor in the response. An empty ``texts`` yields ``[]`` without
        contacting the server.
    """
    # An empty batch would produce zero-length 1-D tensors rather than the
    # one-column-per-row layout built for non-empty batches, so there is
    # nothing valid to send; return early instead.
    if len(texts) == 0:
        return []

    inputs = get_translation_input_for_triton(texts, "en", "kn")
    output0 = http_client.InferRequestedOutput("OUTPUT_TEXT")

    # Send request
    response = triton_http_client.infer(
        "nmt",
        model_version='1',
        inputs=inputs,
        outputs=[output0],
        headers=HTTP_HEADERS,
    )

    # Decode the response: each row's first element is a bytes object.
    output_batch = response.as_numpy('OUTPUT_TEXT').tolist()
    return [translation[0].decode("utf-8") for translation in output_batch]

def save_to_csv(original_instructions, original_outputs, translated_instructions, translated_outputs, file_path):
    """Write four parallel columns of text to a CSV file.

    Args:
        original_instructions: Values for the ``original_instruction`` column.
        original_outputs: Values for the ``original_output`` column.
        translated_instructions: Values for the ``translated_instruction`` column.
        translated_outputs: Values for the ``translated_output`` column.
        file_path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.

    Raises:
        ValueError: Raised by pandas when the columns differ in length.
    """
    import os  # local import: the surrounding cell does not import os

    # to_csv refuses to write into a directory that does not exist, so create
    # it up front. Non-path targets (e.g. file-like buffers) are left alone.
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    data = {
        "original_instruction": original_instructions,
        "original_output": original_outputs,
        "translated_instruction": translated_instructions,
        "translated_output": translated_outputs
    }

    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)

def translate_dataset_to_kannada(dataset, output_folder="final_translate_data_3"):
    """Translate the "chosen" and "rejected" columns of a dataset to Kannada.

    The columns are processed in batches of 512 rows. After each batch, the
    source texts and their translations are written to
    ``{output_folder}/iteration_{i}_{end_idx}.csv``.

    Args:
        dataset: Mapping-like object whose ``"chosen"`` and ``"rejected"``
            entries are sliceable sequences of strings.
        output_folder: Directory for the per-batch CSV files; created if it
            does not exist.

    Returns:
        A dict with the keys ``chosen_english``, ``rejected_english``,
        ``chosen_kannada`` and ``rejected_kannada``. The misspelled legacy key
        ``rejected_kannda`` is also present and refers to the same list.
    """
    import os  # local import: the surrounding cell does not import os

    batch_size = 512  # Adjust the batch size based on your requirements

    # Look each column up once instead of on every loop iteration.
    chosen_texts = dataset["chosen"]
    rejected_texts = dataset["rejected"]
    num_examples = len(chosen_texts)

    # Create the output folder before any translation work so a missing
    # directory cannot fail the first CSV write.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    translated_chosen = []
    translated_rejected = []

    for start_idx in tqdm(range(0, num_examples, batch_size), desc="Translating Dataset"):
        end_idx = min(start_idx + batch_size, num_examples)
        batch_chosen = chosen_texts[start_idx:end_idx]
        batch_rejected = rejected_texts[start_idx:end_idx]

        translated_batch_chosen = translate_to_kannada(batch_chosen)
        translated_batch_rejected = translate_to_kannada(batch_rejected)

        translated_chosen.extend(translated_batch_chosen)
        translated_rejected.extend(translated_batch_rejected)

        # Save to CSV file after each iteration. The CSV headers come from
        # save_to_csv: "chosen" lands in the *_instruction columns and
        # "rejected" in the *_output columns.
        iteration_num = start_idx // batch_size
        save_path = f"{output_folder}/iteration_{iteration_num}_{end_idx}.csv"
        save_to_csv(batch_chosen, batch_rejected, translated_batch_chosen, translated_batch_rejected, save_path)

    translated_dataset = {
        "chosen_english": chosen_texts,
        "rejected_english": rejected_texts,
        "chosen_kannada": translated_chosen,
        "rejected_kannada": translated_rejected,
        # Legacy misspelled key, kept so existing lookups keep working.
        "rejected_kannda": translated_rejected,
    }

    return translated_dataset

Is server ready - True


In [17]:
# Translate the train split from row 74782 onward; per-batch CSVs are written
# to the function's default output folder.
# NOTE(review): 74782 is presumably the resume point of an earlier interrupted
# run — confirm, and consider naming it as a constant.
translated_dataset = translate_dataset_to_kannada(dataset["train"][74782:])

Translating Dataset:   4%|▎         | 6/169 [05:55<2:40:53, 59.23s/it]


ConnectionRefusedError: [Errno 10061] [WinError 10061] No connection could be made because the target machine actively refused it.

: 