# Translate MFD to Kannada

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def translate_dict_to_kannada(input_dict: dict) -> dict:
    """
    Translate the string values in a dictionary of lists to Kannada.

    The tokenizer and model ("sarvamai/sarvam-translate") are loaded on every
    call, so call this once with the full dictionary rather than once per key.
    Loading is skipped entirely when the input contains no words. A word that
    occurs several times (within or across keys) is translated only once and
    the same translation is reused, which also keeps repeated words consistent.

    Args:
        input_dict: A dictionary where keys are strings and values are lists of strings.
                    Example: {'moral': ['word1', 'word2'], 'moral2': ['word3']}

    Returns:
        A new dictionary with the same keys, where each list holds the
        translations in the same order as the input words. Blank input
        strings are returned as empty strings without calling the model.
    """
    # Nothing to translate: avoid loading a multi-gigabyte model for no work.
    if not any(words for words in input_dict.values()):
        return {key: [] for key in input_dict}

    # Imported here so the device check below does not depend on a
    # file-level torch import.
    import torch

    # Define the model name and target language
    model_name = "sarvamai/sarvam-translate"
    tgt_lang = "Kannada"

    # Use the first GPU when one is available; otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # Load the tokenizer and model, and move the model to the chosen device
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # word -> translation, so each distinct word is generated only once
    translation_cache = {}

    def _translate(word):
        """Return the translation of a single word, using the cache."""
        if word in translation_cache:
            return translation_cache[word]

        # An empty prompt gives the model nothing to translate.
        if not word.strip():
            translation_cache[word] = ""
            return ""

        # Create the prompt for the model using a chat template
        messages = [
            {"role": "system", "content": f"Translate the text below to {tgt_lang}."},
            {"role": "user", "content": word}
        ]

        # Apply the chat template to structure the conversation
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize the input and move it to the model's device
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Generate the translation; the very low temperature keeps sampling
        # close to always picking the most likely token.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=128,  # Increased token limit for potentially longer words
            do_sample=True,
            temperature=0.01,
            num_return_sequences=1
        )

        # The output starts with the prompt tokens; keep only the newly
        # generated tokens before decoding.
        prompt_length = len(model_inputs.input_ids[0])
        output_ids = generated_ids[0][prompt_length:].tolist()
        output_text = tokenizer.decode(output_ids, skip_special_tokens=True)

        translation_cache[word] = output_text.strip()
        return translation_cache[word]

    # Preserve the key order and the per-key word order of the input
    return {
        key: [_translate(word) for word in words]
        for key, words in input_dict.items()
    }

# --- Example Usage ---


# Translate every word list in `mfd2` to Kannada.
# NOTE(review): `mfd2` is not defined in this section — presumably it is created
# in an earlier cell; confirm before running this cell on its own.
# The call loads the tokenizer and model, so it is slow; it was written for a
# machine with a compatible GPU and transformers installed.
translated_dictionary_kannada = translate_dict_to_kannada(mfd2)

# Print the translated dictionary for inspection
print(translated_dictionary_kannada)



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

In [None]:
## [TODO]: Save the translated dictionary locally (or commit it to git) and load it from there; the translation step shouldn't be re-run every time.

## [Kannada] Embeddings

In [None]:
# Encode each foundation's list of Kannada words, keyed by foundation name.
# NOTE(review): `model` is not defined in this section — presumably the
# embedding model loaded in an earlier cell; confirm it is in scope.
word_embeddings_kannada = {
    name: model.encode(kannada_words)
    for name, kannada_words in translated_dictionary_kannada.items()
}



## [Kannada] Master Moral Vectors

In [None]:
import numpy as np

# One vector per foundation: the mean over axis 0 of that foundation's embeddings.
master_moral_vectors_kannada = {
    name: np.mean(vectors, axis=0)
    for name, vectors in word_embeddings_kannada.items()
}

print("Master Moral Vectors:")
for name, centroid in master_moral_vectors_kannada.items():
    # Show only the first 5 elements to keep the output short
    preview = centroid[:5]
    print(f"{name}: {preview}...")

# Task
Create a Kannada moral foundations dictionary file (`kannada_mfd.dic`). This file should contain a '% word %' section with each translated Kannada word from `translated_dictionary_kannada` mapped to its corresponding numerical code, and a '% category %' section mapping these numerical codes back to their original moral foundation names. Finally, confirm the successful creation of this file.

## Create Foundation to Number Mapping

### Subtask:
Generate a reverse mapping from moral foundation names (e.g., 'care.virtue') to their numerical codes (e.g., '1') using the existing `nummap` dictionary. This mapping is essential for structuring the new `.dic` file.


**Reasoning**:
To generate the reverse mapping from moral foundation names to their numerical codes, I will iterate through the existing `nummap` dictionary and swap the keys and values to populate the `foundation_to_num` dictionary as instructed.



In [None]:
# Invert `nummap` (code -> foundation name) into foundation name -> code.
# If two codes share a name, the later entry in `nummap` wins.
foundation_to_num = {name: code for code, name in nummap.items()}

print("Foundation to Number Mapping:")
print(foundation_to_num)

Foundation to Number Mapping:
{'care.virtue': '1', 'care.vice': '2', 'fairness.virtue': '3', 'fairness.vice': '4', 'loyalty.virtue': '5', 'loyalty.vice': '6', 'authority.virtue': '7', 'authority.vice': '8', 'sanctity.virtue': '9', 'sanctity.vice': '10'}


**Reasoning**:
Now that the reverse mapping from moral foundation names to numerical codes is established, the next step is to create the `kannada_mfd.dic` file. This involves writing the category mapping and then the translated Kannada words with their corresponding numerical codes into the file.



In [None]:
output_filename = 'kannada_mfd.dic'

# Foundations without a numeric code are collected and reported instead of
# being dropped silently, so an incomplete dictionary does not go unnoticed.
unmapped_foundations = []

with open(output_filename, 'w', encoding='utf-8') as f:
    # Category section: one "<code>\t<foundation name>" line per foundation
    f.write('% category %\n')
    for num_code, foundation_name in nummap.items():
        f.write(f'{num_code}\t{foundation_name}\n')

    # Word section: one "<kannada word>\t<code>" line per translated word
    f.write('% word %\n')
    for foundation_name, kannada_words in translated_dictionary_kannada.items():
        num_code = foundation_to_num.get(foundation_name)
        # Compare against None so a falsy but valid code (e.g. 0) is not skipped
        if num_code is None:
            unmapped_foundations.append(foundation_name)
            continue
        for word in kannada_words:
            # Translations are model output and may contain tabs or line
            # breaks, which would corrupt this tab-separated, line-oriented
            # layout; collapse every whitespace run to a single space.
            clean_word = ' '.join(word.split())
            # An empty translation would produce an entry with no word
            if clean_word:
                f.write(f'{clean_word}\t{num_code}\n')

if unmapped_foundations:
    print(f"Warning: no numerical code found for {unmapped_foundations}; their words were not written.")
print(f"Successfully created '{output_filename}' with Kannada MFD data.")

Successfully created 'kannada_mfd.dic' with Kannada MFD data.
