<a href="https://colab.research.google.com/github/arnavkekre/hackathon/blob/main/Linguistic_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers sentencepiece sacrebleu
import pandas as pd
import numpy as np



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model + tokenizer
# A single tokenizer/model pair is created here at module level; every
# translation helper defined below reads these two globals.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def eng_to_odia(text):
    """Translate English text to Odia and return the translated string.

    Uses the module-level ``tokenizer`` and ``model``. As a side effect the
    tokenizer's ``src_lang`` is set to ``"eng_Latn"``.

    Args:
        text: English text to translate.

    Returns:
        The decoded Odia translation of the first generated sequence, with
        special tokens removed.
    """
    # Source language (English)
    tokenizer.src_lang = "eng_Latn"
    # Encode input as PyTorch tensors
    inputs = tokenizer(text, return_tensors="pt")
    # Token id of the target language tag (Odia = ory_Orya)
    odia_lang_id = tokenizer.convert_tokens_to_ids("ory_Orya")
    # Translate; the forced first token selects the output language
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=odia_lang_id,  # target language id
        max_length=128
    )
    # The original computed this value but never returned it, so every
    # caller received None.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

def odia_to_eng(text):
    """Translate Odia text to English and return the translated string.

    Uses the module-level ``tokenizer`` and ``model``. As a side effect the
    tokenizer's ``src_lang`` is set to ``"ory_Orya"``.

    Args:
        text: Odia text to translate.

    Returns:
        The decoded English translation of the first generated sequence,
        with special tokens removed.
    """
    # Source language (Odia)
    tokenizer.src_lang = "ory_Orya"
    inputs = tokenizer(text, return_tensors='pt')
    # Token id of the target language tag (English = eng_Latn)
    eng_lang_id = tokenizer.convert_tokens_to_ids("eng_Latn")
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=eng_lang_id,
        max_length=128
    )
    # The original computed this value but never returned it, so every
    # caller received None.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

def eng_to_hind(text, target="hin_Deva"):
    """Translate English text into the language tagged by ``target``.

    Args:
        text: English text to translate.
        target: Language tag token of the output language; defaults to
            Hindi (``"hin_Deva"``).

    Returns:
        The decoded translation of the first generated sequence, with
        special tokens removed.
    """
    tokenizer.src_lang = "eng_Latn"
    encoded = tokenizer(text, return_tensors="pt")
    # The forced first token tells the model which language to emit.
    generated = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target),
        max_length=128,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def hind_to_eng(text, source="hin_Deva"):
    """Translate text written in the ``source`` language into English.

    Args:
        text: Text to translate.
        source: Language tag assigned to the tokenizer's ``src_lang``;
            defaults to Hindi (``"hin_Deva"``).

    Returns:
        The decoded English translation of the first generated sequence,
        with special tokens removed.
    """
    tokenizer.src_lang = source
    encoded = tokenizer(text, return_tensors="pt")
    # The forced first token tells the model to emit English.
    generated = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=128,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
eng_to_hind("Sailors Of Maurya")

'मौर्य के नाविक'

In [None]:
eng_to_odia("Sailors Of Maurya's")

English: Sailors Of Maurya's
Odia: ମୌର୍ଯ୍ୟଙ୍କ ନାବିକ


'ମୌର୍ଯ୍ୟଙ୍କ ନାବିକ'

In [None]:
odia_to_eng("କୋରିଆ କେବଳ ଉତ୍ତର କୋରିଆ ହେଉଛି! ଅନ୍ୟ କିଛି ନୁହେଁ!..?")

In [None]:
df=pd.read_csv('/content/parallel_dataset_en_hi_or.csv')
df

Unnamed: 0,english,hindi,odia
0,The child is playing under the tree.,बच्चा पेड़ के नीचे खेल रहा है।,ଶିଶୁଟି ଗଛ ତଳେ ଖେଳୁଛି।
1,The school is closed today.,आज स्कूल बंद है।,ବିଦ୍ୟାଳୟ ଆଜି ବନ୍ଦ ଅଛି।
2,Education is the key to success.,शिक्षा सफलता की कुंजी है।,ଶିକ୍ଷା ହେଉଛି ସଫଳତାର କୁଞ୍ଜୀ।
3,We should protect the environment.,हमें पर्यावरण की रक्षा करनी चाहिए।,ଆମେ ପରିବେଶର ସୁରକ୍ଷା କରିବା ଉଚିତ।
4,Clean water is essential for health.,स्वच्छ पानी स्वास्थ्य के लिए आवश्यक है।,ପରିଷ୍କାର ପାଣି ସ୍ୱାସ୍ଥ୍ୟ ପାଇଁ ଆବଶ୍ୟକ।
5,The farmer works hard in the field.,किसान खेत में कड़ी मेहनत करता है।,ଚାଷୀଟି ମାଟିରେ କଷ୍ଟ କରେ।
6,Books are our best friends.,किताबें हमारे सबसे अच्छे दोस्त हैं।,ପୁସ୍ତକ ଆମର ସବୁଠୁ ଭଲ ବନ୍ଧୁ।
7,The sun rises in the east.,सूरज पूर्व से उगता है।,ସୂର୍ଯ୍ୟ ପୂର୍ବ ଦିଗରେ ଉଦୟ ହୁଏ।
8,We should always speak the truth.,हमें हमेशा सच बोलना चाहिए।,ଆମେ ସଦା ସତ୍ୟ କହିବା ଉଚିତ।
9,Health is wealth.,स्वास्थ्य ही धन है।,ସ୍ୱାସ୍ଥ୍ୟ ହିଁ ଧନ।


In [None]:
# Translate each entry of the 'english' column to Odia, keeping row order
translated_odia_texts = [eng_to_odia(sentence) for sentence in df['english']]

# Put the source sentences and their machine translations side by side
columns = {
    'english': df['english'],
    'odia_translated': translated_odia_texts,
}
new_df = pd.DataFrame(columns)

display(new_df)

English: The child is playing under the tree.
Odia: ପିଲାଟି ଗଛ ତଳେ ଖେଳୁଛି ।
English: The school is closed today.
Odia: ଆଜି ସ୍କୁଲ ବନ୍ଦ ଅଛି ।
English: Education is the key to success.
Odia: ଶିକ୍ଷା ହିଁ ସଫଳତାର ଚାବି ।
English: We should protect the environment.
Odia: ଆମେ ପରିବେଶକୁ ସୁରକ୍ଷା ଦେବା ଉଚିତ୍ ।
English: Clean water is essential for health.
Odia: ସ୍ୱଚ୍ଛ ଜଳ ସ୍ୱାସ୍ଥ୍ୟ ପାଇଁ ଜରୁରୀ ।
English: The farmer works hard in the field.
Odia: ଚାଷୀ ଚାଷ ଜମିରେ ପରିଶ୍ରମ କରେ ।
English: Books are our best friends.
Odia: ବହି ଆମର ସର୍ବୋତ୍ତମ ବନ୍ଧୁ।
English: The sun rises in the east.
Odia: ପୂର୍ବ ଦିଗରେ ସୂର୍ଯ୍ୟ ଉଦୟ କରନ୍ତି ।
English: We should always speak the truth.
Odia: ଆମେ ସର୍ବଦା ସତ କହିବା ଉଚିତ୍ ।
English: Health is wealth.
Odia: ସ୍ୱାସ୍ଥ୍ୟ ହେଉଛି ଧନ ।


Unnamed: 0,english,odia_translated
0,The child is playing under the tree.,ପିଲାଟି ଗଛ ତଳେ ଖେଳୁଛି ।
1,The school is closed today.,ଆଜି ସ୍କୁଲ ବନ୍ଦ ଅଛି ।
2,Education is the key to success.,ଶିକ୍ଷା ହିଁ ସଫଳତାର ଚାବି ।
3,We should protect the environment.,ଆମେ ପରିବେଶକୁ ସୁରକ୍ଷା ଦେବା ଉଚିତ୍ ।
4,Clean water is essential for health.,ସ୍ୱଚ୍ଛ ଜଳ ସ୍ୱାସ୍ଥ୍ୟ ପାଇଁ ଜରୁରୀ ।
5,The farmer works hard in the field.,ଚାଷୀ ଚାଷ ଜମିରେ ପରିଶ୍ରମ କରେ ।
6,Books are our best friends.,ବହି ଆମର ସର୍ବୋତ୍ତମ ବନ୍ଧୁ।
7,The sun rises in the east.,ପୂର୍ବ ଦିଗରେ ସୂର୍ଯ୍ୟ ଉଦୟ କରନ୍ତି ।
8,We should always speak the truth.,ଆମେ ସର୍ବଦା ସତ କହିବା ଉଚିତ୍ ।
9,Health is wealth.,ସ୍ୱାସ୍ଥ୍ୟ ହେଉଛି ଧନ ।


In [None]:
def odia_to_eng(text):
    """Translate Odia text to English and return the translated string.

    Sets the shared tokenizer's ``src_lang`` to ``"ory_Orya"``, generates
    with the English language tag forced as the first token, and decodes
    the first generated sequence without special tokens.
    """
    tokenizer.src_lang = "ory_Orya"
    encoded = tokenizer(text, return_tensors='pt')
    english_tag_id = tokenizer.convert_tokens_to_ids("eng_Latn")
    generated = model.generate(
        **encoded,
        forced_bos_token_id=english_tag_id,
        max_length=128,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Rebuild new_df from the dataset's own English and Odia columns
new_df = pd.DataFrame({
    column: df[column] for column in ('english', 'odia')
})

# Translate every Odia sentence back to English, keeping row order
translated_english_texts = [odia_to_eng(sentence) for sentence in new_df['odia']]

# Store the back-translations next to the reference sentences
new_df['english_translated'] = translated_english_texts

# Show the three-column comparison table
display(new_df)

import sacrebleu

# Hypotheses: the English sentences produced by odia_to_eng, one per row
translations = new_df['english_translated'].tolist()

# References: sacrebleu expects a list of reference *streams*, where each
# stream holds one reference per hypothesis (same length and order as
# `translations`). With a single reference per sentence that is ONE inner
# list containing every sentence. The previous one-list-per-sentence shape
# was read as many streams of length one, so the score did not measure the
# whole corpus.
references = [new_df['english'].tolist()]

# Calculate the corpus-level SacreBLEU score
bleu_score = sacrebleu.corpus_bleu(translations, references)

print("SacreBLEU score:", bleu_score.score)

Unnamed: 0,english,odia,english_translated
0,The child is playing under the tree.,ଶିଶୁଟି ଗଛ ତଳେ ଖେଳୁଛି।,The child is playing under the tree.
1,The school is closed today.,ବିଦ୍ୟାଳୟ ଆଜି ବନ୍ଦ ଅଛି।,Schools are closed today.
2,Education is the key to success.,ଶିକ୍ଷା ହେଉଛି ସଫଳତାର କୁଞ୍ଜୀ।,Education is the key to success.
3,We should protect the environment.,ଆମେ ପରିବେଶର ସୁରକ୍ଷା କରିବା ଉଚିତ।,We must protect the environment.
4,Clean water is essential for health.,ପରିଷ୍କାର ପାଣି ସ୍ୱାସ୍ଥ୍ୟ ପାଇଁ ଆବଶ୍ୟକ।,Clean water is essential for health.
5,The farmer works hard in the field.,ଚାଷୀଟି ମାଟିରେ କଷ୍ଟ କରେ।,The farmer is working hard in the soil.
6,Books are our best friends.,ପୁସ୍ତକ ଆମର ସବୁଠୁ ଭଲ ବନ୍ଧୁ।,Books are our best friend.
7,The sun rises in the east.,ସୂର୍ଯ୍ୟ ପୂର୍ବ ଦିଗରେ ଉଦୟ ହୁଏ।,The sun rises in the east.
8,We should always speak the truth.,ଆମେ ସଦା ସତ୍ୟ କହିବା ଉଚିତ।,We must always tell the truth.
9,Health is wealth.,ସ୍ୱାସ୍ଥ୍ୟ ହିଁ ଧନ।,Health is wealth.


SacreBLEU score: 100.00000000000004


In [None]:
references #from old df

[['The child is playing under the tree.'],
 ['The school is closed today.'],
 ['Education is the key to success.'],
 ['We should protect the environment.'],
 ['Clean water is essential for health.'],
 ['The farmer works hard in the field.'],
 ['Books are our best friends.'],
 ['The sun rises in the east.'],
 ['We should always speak the truth.'],
 ['Health is wealth.']]

In [None]:
translations # predicted from odia language

['The child is playing under the tree.',
 'Schools are closed today.',
 'Education is the key to success.',
 'We must protect the environment.',
 'Clean water is essential for health.',
 'The farmer is working hard in the soil.',
 'Books are our best friend.',
 'The sun rises in the east.',
 'We must always tell the truth.',
 'Health is wealth.']

bhaisab 100 se upar ye to toap log hote he(keno and ...)

-----------------Similarly, the same can be done for "Hindi"------------------

quiz interaction in linguo

In [None]:
import gradio as gr

def quiz_flow(question_eng, student_answer_odia):
    """Translate both sides of one quiz exchange.

    Args:
        question_eng: The teacher's question, in English.
        student_answer_odia: The student's answer, in Odia.

    Returns:
        A ``(question_in_odia, answer_in_english)`` tuple, in the order the
        Gradio click handler maps onto its two output boxes.
    """
    translated_question = eng_to_odia(question_eng)          # teacher -> student
    translated_answer = odia_to_eng(student_answer_odia)     # student -> teacher
    return translated_question, translated_answer

# Two-row translation UI: each row pairs an input box with the box that
# receives its translation.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Quiz Translation Tool (English ↔ Odia)")

    # Row 1: teacher's English question and its Odia translation
    with gr.Row():
        eng_input = gr.Textbox(label="Teacher's Question (English)")
        odia_output = gr.Textbox(label="Translated Question (Odia)")

    # Row 2: student's Odia answer and its English translation
    with gr.Row():
        odia_input = gr.Textbox(label="Student's Answer (Odia)")
        eng_output = gr.Textbox(label="Translated Answer (English)")

    # One click translates both boxes; quiz_flow returns its results in the
    # same order as the `outputs` list.
    btn = gr.Button("Translate")
    btn.click(quiz_flow, inputs=[eng_input, odia_input], outputs=[odia_output, eng_output])

demo.launch()


# $QUIZ DEMO$:

-> quiz questions

In [None]:
# Quiz items: each entry pairs an English "question" with its expected
# English "answer".
qa_dataset = [
    {"question": "What is H2O commonly known as?", "answer": "Water"},
    {"question": "What is the speed of light?", "answer": "299792458 m/s"},
    {"question": "Who is known as the father of computers?", "answer": "Charles Babbage"},
    {"question": "What is 9 x 8?", "answer": "72"},
]


-> matching

In [None]:
from difflib import SequenceMatcher

def check_answer(student_answer, correct_answer, threshold=0.7):
    """Return True when the student's answer is close enough to the key.

    Both strings are stripped of surrounding whitespace and lower-cased
    before comparison, so stray spaces or newlines around a typed answer do
    not lower the similarity score. ``None`` is treated as an empty answer
    instead of raising.

    Args:
        student_answer: The answer to grade (may be ``None`` or empty).
        correct_answer: The expected answer.
        threshold: Minimum ``SequenceMatcher`` ratio (0.0-1.0) that counts
            as correct. Defaults to 0.7, i.e. 70% similarity.

    Returns:
        ``True`` if the similarity ratio is at least ``threshold``.
    """
    given = (student_answer or "").strip().lower()
    expected = (correct_answer or "").strip().lower()
    ratio = SequenceMatcher(None, given, expected).ratio()
    return ratio >= threshold


-> gradio ai gp bai

In [None]:
import gradio as gr
import matplotlib.pyplot as plt
from difflib import SequenceMatcher

# ==== Example STEM Quiz Dataset ====
# Each entry pairs an English "question" with its expected English "answer";
# the quiz functions below look entries up by list index.
qa_dataset = [
    {"question": "What is H2O commonly known as?", "answer": "Water"},
    {"question": "What is the speed of light?", "answer": "299792458 m/s"},
    {"question": "Who is known as the father of computers?", "answer": "Charles Babbage"},
    {"question": "What is 9 x 8?", "answer": "72"},
]

# ==== Fuzzy Answer Checking ====
def check_answer(student_answer, correct_answer, threshold=0.7):
    """Return True when the student's answer is close enough to the key.

    Both strings are stripped of surrounding whitespace and lower-cased
    before comparison, so stray spaces or newlines around a typed answer do
    not lower the similarity score. ``None`` is treated as an empty answer
    instead of raising.

    Args:
        student_answer: The answer to grade (may be ``None`` or empty).
        correct_answer: The expected answer.
        threshold: Minimum ``SequenceMatcher`` ratio (0.0-1.0) that counts
            as correct. Defaults to 0.7, i.e. 70% similarity.

    Returns:
        ``True`` if the similarity ratio is at least ``threshold``.
    """
    given = (student_answer or "").strip().lower()
    expected = (correct_answer or "").strip().lower()
    ratio = SequenceMatcher(None, given, expected).ratio()
    return ratio >= threshold

# ==== Store results ====
# One True/False entry per submitted answer: quiz_system appends to it and
# show_results summarises it. It is a module-level list and nothing in this
# cell clears it.
student_results = []

# ==== Question Display Function ====
def get_question(lang, q_index):
    """Return quiz question ``q_index`` in the requested display language.

    Args:
        lang: Language chosen in the UI: "Odia", "Hindi", or anything else
            (treated as English).
        q_index: Index into ``qa_dataset``.

    Returns:
        The question text, translated for "Odia" / "Hindi" and unchanged
        otherwise.
    """
    question_eng = qa_dataset[q_index]["question"]

    # Language tag for each translated UI choice
    target_codes = {"Odia": "ory_Orya", "Hindi": "hin_Deva"}
    target = target_codes.get(lang)
    if target is None:
        return question_eng

    # The previously called `eng_to_lang` is not defined in this notebook.
    # `eng_to_hind` takes the target language tag as a parameter, so despite
    # its name it covers every English -> X direction needed here.
    return eng_to_hind(question_eng, target=target)

# ==== Quiz Answer Function ====
def quiz_system(lang, q_index, student_ans):
    """Grade one submitted answer and record whether it was correct.

    Args:
        lang: Language the student answered in: "Odia", "Hindi", or
            anything else (treated as English).
        q_index: Index into ``qa_dataset``.
        student_ans: The answer text as typed by the student.

    Returns:
        A ``(answer_in_english, feedback)`` tuple. The outcome is also
        appended to the module-level ``student_results`` list.
    """
    correct_ans = qa_dataset[q_index]["answer"]

    # Translate student answer → English.
    # The previously called `lang_to_eng` is not defined in this notebook.
    # `hind_to_eng` takes the source language tag as a parameter, so despite
    # its name it covers every X -> English direction needed here.
    source_codes = {"Odia": "ory_Orya", "Hindi": "hin_Deva"}
    source = source_codes.get(lang)
    if source is None:
        student_ans_eng = student_ans
    else:
        student_ans_eng = hind_to_eng(student_ans, source=source)

    # Check correctness and remember the outcome for the results chart
    is_correct = check_answer(student_ans_eng, correct_ans)
    student_results.append(is_correct)

    feedback = "✅ Correct!" if is_correct else f"❌ Incorrect! Correct Answer: {correct_ans}"

    return student_ans_eng, feedback

# ==== Results Pie Chart ====
def show_results():
    """Build a figure summarising the answers recorded in ``student_results``.

    Returns:
        A matplotlib figure: a correct/incorrect pie chart, or a
        placeholder message when no answer has been submitted yet.
    """
    correct = sum(student_results)
    wrong = len(student_results) - correct

    fig, ax = plt.subplots()
    ax.set_title("Quiz Performance")

    if student_results:
        ax.pie([correct, wrong], labels=["Correct", "Incorrect"], autopct='%1.1f%%')
    else:
        # With no answers both counts are 0, and a pie cannot be normalised
        # from an all-zero input (0/0), so show a message instead.
        ax.text(0.5, 0.5, "No answers submitted yet",
                ha="center", va="center", transform=ax.transAxes)
        ax.axis("off")

    # Unregister the figure from pyplot so repeated clicks do not accumulate
    # open figures; the returned Figure object itself stays usable.
    plt.close(fig)
    return fig

# ==== Gradio App ====
# Quiz UI: pick a language and question, fetch the question, submit an
# answer, then chart the accumulated results.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multilingual STEM Quiz (English ↔ Odia/Hindi)")

    # Quiz settings shared by the "Get Question" and "Submit Answer" handlers
    lang_choice = gr.Radio(["English", "Odia", "Hindi"], label="Select Language", value="Odia")
    # NOTE(review): the "(0-3)" in the label is hard-coded to the current
    # four-entry qa_dataset — update it if the dataset changes.
    q_index = gr.Number(label="Question Index (0-3)", value=0, precision=0)

    # Question display: filled by get_question on click
    q_display = gr.Textbox(label="Translated Question", interactive=False)
    get_q_btn = gr.Button("Get Question")
    get_q_btn.click(get_question, inputs=[lang_choice, q_index], outputs=q_display)

    # Answer entry plus the two read-only boxes that quiz_system fills
    student_ans = gr.Textbox(label="Your Answer")
    ans_translated = gr.Textbox(label="Answer (English Translation)", interactive=False)
    feedback = gr.Textbox(label="Feedback", interactive=False)

    # quiz_system returns (answer_in_english, feedback), matching `outputs`
    submit = gr.Button("Submit Answer")
    submit.click(quiz_system, inputs=[lang_choice, q_index, student_ans], outputs=[ans_translated, feedback])

    # Results chart: show_results takes no inputs and returns a figure
    result_btn = gr.Button("Show Results")
    result_plot = gr.Plot()
    result_btn.click(show_results, outputs=result_plot)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2741e5d17d8361d75d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


