In [5]:
!pip install transformers torch pandas




In [4]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model for Hindi-to-English translation
model_name = "Helsinki-NLP/opus-mt-hi-en"  # Use Hindi-to-English model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text):
    """Translate Hinglish text to English using MarianMT."""
    if not isinstance(text, str) or text.strip() == "":
        return ""  # Handle empty or invalid inputs
    # Tokenize and translate
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Load Hinglish dataset
df = pd.read_csv("/kaggle/input/texts-extracted/hinglish_dataset.csv")  # Change filename if needed

# Apply translation function to each row
df_1 = df.head(10).copy()  # Take first 5 rows

df_1["translated_text"] = df_1["Text"].apply(translate_text)

# Save translated data
df_1.to_csv("translated_texts.csv", index=False)
print("Translation complete. File saved as translated_texts.csv.")




Translation complete. File saved as translated_texts.csv.


In [7]:
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load M2M-100 Model (Generic Multilingual Translation)
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

def translate_m2m100(text):
    """Translate Hinglish to English using M2M-100"""
    if not isinstance(text, str) or text.strip() == "":
        return ""
    
    tokenizer.src_lang = "hi"  # Hindi as source (best fit for Hinglish)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("en"))
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Load dataset
df = pd.read_csv("/kaggle/input/texts-extracted/hinglish_dataset.csv")
df_1 = df.head(10).copy()

# Translate each row
df_1["translated_text"] = df_1["Text"].apply(translate_m2m100)

# Save to CSV
df_1.to_csv("translated_texts_m2m100.csv", index=False)
print("Translation complete using M2M-100. File saved as translated_texts_m2m100.csv.")


Translation complete using M2M-100. File saved as translated_texts_m2m100.csv.


# Best among others on 10 rows

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# Load model and tokenizer
model_name = "rudrashah/RLM-hinglish-translator"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

# Define template required by the model
TEMPLATE = "Hinglish:\n{hi_en}\n\nEnglish:\n{en}"

# Load first 10 rows from CSV
df = pd.read_csv("/kaggle/input/texts-extracted/hinglish_dataset.csv").head(10)

# Define translation function
def translate(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    prompt = TEMPLATE.format(hi_en=text.strip(), en="")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only translated English part
    if "English:" in result:
        return result.split("English:")[-1].strip()
    return result.strip()

# Translate with progress bar
tqdm.pandas()
df["translated_text"] = df["Text"].progress_apply(translate)

# Save result
df.to_csv("translated_texts_10rows.csv", index=False)
print("✅ Translated first 10 rows. Saved as translated_texts_10rows.csv")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:06<00:00,  1.43it/s]

✅ Translated first 10 rows. Saved as translated_texts_10rows.csv





# Testing

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
tokenizer = AutoTokenizer.from_pretrained("rudrashah/RLM-hinglish-translator")
model = AutoModelForCausalLM.from_pretrained("rudrashah/RLM-hinglish-translator")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
template = "Hinglish:\n{hi_en}\n\nEnglish:\n{en}" #THIS IS MOST IMPORTANT, WITHOUT THIS IT WILL GIVE RANDOM OUTPUT
input_text = tokenizer(template.format(hi_en="Agar 1 me 2 dale to kya hota hai....?? ye to apki himmat hai. hamari to ekse hi jaan nikal jati hai",en=""),return_tensors="pt")

output = model.generate(**input_text)
print(tokenizer.decode(output[0]))

<bos>Hinglish:
Agar 1 me 2 dale to kya hota hai....?? ye to apki himmat hai. hamari to ekse hi jaan nikal jati hai

English:
If you go to 2, what happens...?? That's your decision. We have to
