In [None]:
# Install Hugging Face datasets library
!pip install datasets

# Install Hugging Face transformers library
!pip install transformers

# Install Kaggle Hub (for downloading Kaggle datasets)
!pip install kagglehub

# Install pandas for DataFrame operations
!pip install pandas

# Install PyTorch (Colab usually has it pre-installed, but if not, use this)
!pip install torch

# Install PEFT library (for LoRA support)
!pip install peft

# Install the evaluation library for BLEU, ROUGE, METEOR, etc.
!pip install evaluate

# Install tqdm for progress bars
!pip install tqdm

# Install accelerate for efficient multi-device training
!pip install accelerate

# Install trl (for SFTTrainer from the Hugging Face Transformers Reinforcement Learning)
!pip install trl


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [75]:
from datasets import load_dataset
HF_TOKEN = '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'
# Load the OPUS-100 dataset for English to Hindi
dataset = load_dataset("opus100", "en-hi")

In [76]:
# Display 5 rows from the training set
for i in range(5):
    print(f"Row {i + 1}:")
    print(f"English: {dataset['train'][i]['translation']['en']}")
    print(f"Hindi: {dataset['train'][i]['translation']['hi']}")
    print()

Row 1:
English: Other, Private Use
Hindi: अन्य, निज़ी उपयोग

Row 2:
English: [SCREAMING]
Hindi: ऊबड़ .

Row 3:
English: Spouse
Hindi: जीवनसाथी

Row 4:
English: I will never salute you!
Hindi: - तुम एक कमांडर कभी नहीं होगा!

Row 5:
English: and the stars and the trees bow themselves;
Hindi: और तारे और वृक्ष सजदा करते है;



In [77]:
# Convert to Pandas DataFrame
import pandas as pd
opus_df = pd.DataFrame({
    "english": [entry["en"] for entry in dataset["train"]["translation"]],
    "hindi": [entry["hi"] for entry in dataset["train"]["translation"]]
})

# Display the first few rows
print("First few rows from OPUS-100:")
print(opus_df.head())

First few rows from OPUS-100:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [78]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vaibhavkumar11/hindi-english-parallel-corpus")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1


In [79]:
import kagglehub

# Re-download the dataset
dataset_path = kagglehub.dataset_download("vaibhavkumar11/hindi-english-parallel-corpus")

# Print the path to the dataset
print("Dataset downloaded to:", dataset_path)

# List files in the dataset directory
files = os.listdir(dataset_path)
print("Files in the dataset:", files)


Dataset downloaded to: /root/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1
Files in the dataset: ['hindi_english_parallel.csv']


In [80]:
dataset_path = "/root/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1"


In [81]:
import os

# Use the correct path printed in the previous step
dataset_path = "/root/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1"

# List files in the dataset directory
if os.path.exists(dataset_path):
    files = os.listdir(dataset_path)
    print("Files in the dataset:", files)
else:
    print(f"The directory {dataset_path} does not exist.")


Files in the dataset: ['hindi_english_parallel.csv']


In [82]:
import os


# List files in the dataset directory
files = os.listdir(dataset_path)
print("Files in the dataset:", files)


Files in the dataset: ['hindi_english_parallel.csv']


In [83]:
files

['hindi_english_parallel.csv']

In [84]:
import pandas as pd

# Path to the dataset file
dataset_file = os.path.join(dataset_path, "hindi_english_parallel.csv")

# Load the CSV file
df = pd.read_csv(dataset_file)



In [85]:
# Inspect the first few rows
print(df.head())

# Access Hindi and English columns
hindi_sentences = df['hindi']
english_sentences = df['english']

                                               hindi  \
0    अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें   
1                    एक्सेर्साइसर पहुंचनीयता अन्वेषक   
2              निचले पटल के लिए डिफोल्ट प्लग-इन खाका   
3               ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका   
4  उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...   

                                          english  
0  Give your application an accessibility workout  
1               Accerciser Accessibility Explorer  
2  The default plugin layout for the bottom panel  
3     The default plugin layout for the top panel  
4  A list of plugins that are disabled by default  


In [86]:
# Combine the two datasets
combined_df = pd.concat([opus_df, df], ignore_index=True)

# Display the size of the combined dataset
print(f"Combined dataset size: {combined_df.shape}")

# Display the first few rows of the combined dataset
print("First few rows of the combined dataset:")
print(combined_df.head())


Combined dataset size: (2096160, 2)
First few rows of the combined dataset:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [87]:
# Remove duplicate rows
combined_df = combined_df.drop_duplicates()

# Display the size after removing duplicates
print(f"Dataset size after removing duplicates: {combined_df.shape}")


Dataset size after removing duplicates: (1635847, 2)


In [88]:
import re

# Function to check if a string contains Hindi characters
def contains_hindi(text):
    if not isinstance(text, str):  # Ensure the input is a string
        return False
    hindi_pattern = re.compile('[\u0900-\u097F]')
    return bool(hindi_pattern.search(text))

# Apply the function to the English column
combined_df['hindi_in_english'] = combined_df['english'].apply(contains_hindi)

# Display rows where Hindi words are found in the English column
hindi_in_english_rows = combined_df[combined_df['hindi_in_english']]
print(f"Number of rows with Hindi in English column: {len(hindi_in_english_rows)}")
print(hindi_in_english_rows.head())


Number of rows with Hindi in English column: 3049
                                                  english  \
973263          2. Infection caused by germs. 2. जीवाणुओं   
973343  This position is similar to armchair. In this ...   
973383  Many countries in the [unclear], they need leg...   
973483  Virtually all groups of plants and animals, an...   
973518  Let 's first review what we know does not and ...   

                                                    hindi  hindi_in_english  
973263                             द्वारा संक्रामण होना।               True  
973343  यह हत्थाकुर्सी से मिलती जुलती पोजीशन है इसमें ...              True  
973383          के बहुत सारे राष्ट्रों को मान्यता चाहिए.               True  
973483  पौधों तथा जानवरों के सभी समूहों और उनके अन्दर ...              True  
973518                                  समस्या” का समाधान              True  


In [89]:
# Function to check if a string contains English characters
def contains_english(text):
    if not isinstance(text, str):  # Ensure the input is a string
        return False
    english_pattern = re.compile('[A-Za-z]')
    return bool(english_pattern.search(text))

# Apply the function to the Hindi column
combined_df['english_in_hindi'] = combined_df['hindi'].apply(contains_english)

# Display rows where English words are found in the Hindi column
english_in_hindi_rows = combined_df[combined_df['english_in_hindi']]
print(f"Number of rows with English in Hindi column: {len(english_in_hindi_rows)}")
print(english_in_hindi_rows.head())


Number of rows with English in Hindi column: 116542
                                      english  \
5        _Download Messages for Offline Usage   
8   The application '%s' could not be created   
17                                  Kennebunk   
20                          FIB(9) returns 34   
25                                Third power   

                                                hindi  hindi_in_english  \
5       ऑफ़लाइन प्रयोग के लिए संदेश डाउनलोड करें (_D)             False   
8   अनुप्रयोग '%s' के लिए इस्तेमाल किया जा के लिए ...             False   
17  केनेबंकCity name (optional, probably does not ...             False   
20                          FIB( 9) का परिणाम होगा 34             False   
25                       तृतीय घातx to the power of y             False   

    english_in_hindi  
5               True  
8               True  
17              True  
20              True  
25              True  


In [90]:
# Drop the columns 'hindi_in_english' and 'english_in_hindi' from the DataFrame
cleaned_df = combined_df.drop(columns=['hindi_in_english', 'english_in_hindi'])

# Display the size of the cleaned dataset
print(f"Dataset size after removing unwanted columns: {len(cleaned_df)}")

# Inspect the first few rows of the cleaned dataset
print("First few rows of the cleaned dataset:")
print(cleaned_df.head())


Dataset size after removing unwanted columns: 1635847
First few rows of the cleaned dataset:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [91]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Randomly sample 10,000 rows from cleaned_df
df = cleaned_df.sample(n=10000, random_state=42)

# Rename columns for consistency
df = df.rename(columns={'hindi': 'source', 'english': 'target'})

# Drop missing values
df = df.dropna()

# Split dataset into train, validation, and test
train_df = df.sample(frac=0.8, random_state=42)
remaining_df = df.drop(train_df.index)
valid_df = remaining_df.sample(frac=0.5, random_state=42)
test_df = remaining_df.drop(valid_df.index)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9974 entries, 1293567 to 2089657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  9974 non-null   object
 1   source  9974 non-null   object
dtypes: object(2)
memory usage: 491.8+ KB


In [93]:
# Load tokenizer
access_token= "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"
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it",token=access_token)

# Preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(
        examples['source'],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    targets = tokenizer(
        examples['target'],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

# Preprocess datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set dataset format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/7979 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

In [94]:
from transformers import AutoModelForCausalLM

access_token = "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"  # Replace with your actual Hugging Face token

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",         # Correct model name
    device_map="auto",            # Automatically map to available devices
    torch_dtype=torch.float16,   # Use bfloat16 for efficiency
    use_auth_token=access_token   # Pass your access token for authentication
)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [95]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer

In [96]:
# Define LoRA configuration
# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,  # Task type for causal language modeling
#     inference_mode=False,         # Fine-tune mode
#     r=8,                          # Low-rank dimension
#     lora_alpha=32,                # Scaling factor
#     lora_dropout=0.1,             # Dropout for LoRA layers
# )

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    #target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [97]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_steps=499,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=499,
    load_best_model_at_end=True,
    fp16=True,
    push_to_hub=False,
    save_safetensors=False  # Use PyTorch save format
)




In [71]:
from trl import SFTTrainer
from huggingface_hub import login

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

# Temporarily disable shared weights for training
model.tie_weights = lambda: None
trainer.train()  # Train the model
model.tie_weights()  # Re-enable shared weights after training




Step,Training Loss,Validation Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-71-49e13a7c45fa>", line 15, in <cell line: 15>
    trainer.train()  # Train the model
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2123, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2481, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3579, in training_step
    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3633, in compute_loss
    outputs = model(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.

TypeError: object of type 'NoneType' has no len()

In [98]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from huggingface_hub import login

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

# Temporarily disable shared weights for training
model.tie_weights = lambda: None
trainer.train()  # Train the model
model.tie_weights()  # Re-enable shared weights after training

# Log in to Hugging Face with your token
login(token="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")

# Define the new model name for Hugging Face
new_model = "ananyavarma/model-gemma2-2b-hindi-English"

# Push the model to the Hugging Face Hub
model.push_to_hub(repo_id=new_model, use_auth_token=True)

# Push the tokenizer to the Hugging Face Hub
tokenizer.push_to_hub(repo_id=new_model, use_auth_token=True)

# Save the model and tokenizer locally after pushing
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)  # Save the tokenizer locally

print("Model and Tokenizer successfully pushed to Hugging Face Hub!")




Step,Training Loss,Validation Loss
499,3.0129,2.914976
998,2.7218,2.871517
1497,2.5311,2.864537
1996,2.3412,2.915221
2495,2.1558,2.986551
2994,1.9898,3.072925
3493,1.8351,3.200818
3992,1.6947,3.323694
4491,1.5847,3.432273
4990,1.5077,3.528729




adapter_model.safetensors:   0%|          | 0.00/51.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer successfully pushed to Hugging Face Hub!


In [100]:
model.save_pretrained(new_model)

In [None]:
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
from huggingface_hub import login

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,  # Ensure the tokenizer is passed here
)

# Temporarily disable shared weights for training
model.tie_weights = lambda: None
trainer.train()  # Train the model
model.tie_weights()  # Re-enable shared weights after training

# Log in to Hugging Face with your token
login(token="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")

# Define the new model name for Hugging Face
new_model = "ananyavarma/gemma2-2b-hindi-English"

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_id=new_model, use_auth_token=True)
tokenizer.push_to_hub(repo_id=new_model, use_auth_token=True)  # Push tokenizer too

# Save the model and tokenizer locally after pushing
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


In [None]:
HF_TOKEN = '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'

In [None]:
from huggingface_hub import login
login(token="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")


In [None]:
# Save the model after training
new_model = "gemma2-2b-mt-hindi-English"

# Replace with the model name you used for your repository
model.push_to_hub("ananyavarma/my-trained-model", use_auth_token=True)
#new_model = "gemma2-2b-mt-hindi-English"
trainer.model.save_pretrained(new_model)
#fc2c9f2ff31f0d51fb760ce7b58e582937060ac9

In [None]:
import accelerate
print(accelerate.__version__)

0.26.0


In [None]:
import pandas as pd
from datasets import Dataset
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Ensure the columns match expectations
test_df = test_df.rename(columns={"Source": "source", "Target": "target"})

# Convert the DataFrame to a Hugging Face Dataset
raw_test_dataset = Dataset.from_pandas(test_df)

# Function to generate predictions
def generate_translation(model, tokenizer, text, max_length=256):
    prompt = f"Translate the following Hindi text to English:\n\n{text}\n\nEnglish Translation:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length).to("cuda")
    outputs = model.generate(inputs.input_ids, max_new_tokens=100, num_beams=5, early_stopping=True)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation.split("English Translation:")[-1].strip()  # Remove prompt if included

# Generate predictions and references
predictions = []
references = []

print("Generating predictions for BLEU evaluation...")
for example in tqdm(raw_test_dataset):  # Iterate over the test dataset
    source_text = example["source"]  # Raw Hindi text
    reference_text = example["target"]  # Reference English translation
    prediction = generate_translation(model, tokenizer, source_text)
    predictions.append(prediction)
    references.append([reference_text])  # Wrap the reference in a list

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)
print("\nBLEU Score:", bleu_score)


Generating predictions for BLEU evaluation...


100%|██████████| 49/49 [00:33<00:00,  1.45it/s]


BLEU Score: {'bleu': 0.892242062829399, 'precisions': [0.9573643410852714, 0.9138755980861244, 0.86875, 0.8468468468468469], 'brevity_penalty': 0.996131532880095, 'length_ratio': 0.9961389961389961, 'translation_length': 258, 'reference_length': 259}





In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "ananyavarma/gemma2-2b-mt-hindi-English"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [None]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  Translate english to hindi when the given input is english and hindi to english when the given input is hindi.
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

In [None]:
result = get_completion(query="हाय तुम्हारा नाम क्या है? ", model=model, tokenizer=tokenizer)
print(result)


  user
  Translate english to hindi when the given input is english and hindi to english when the given input is hindi.
  हाय तुम्हारा नाम क्या है? 
  
model
  

    
* English: What is your name? 
* Hindi: तुमका नाम क्या है? 

 


In [39]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("ananyavarma/gemma2-2b-mt-hindi-English")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter weights from ananyavarma/gemma2-2b-mt-hindi-English led to unexpected keys not found in the model:  ['model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.1.self_attn.q_proj.lora_A.default.weight', 'model.layers.1.self_attn.q_proj.lora_B.default.weight', 'model.layers.1.self_attn.v_proj.lora_A.default.weight', 'model.layers.1.self_attn.v_proj.lora_B.default.weight', 'model.layers.10.self_attn.q_proj.lora_A.default.weight', 'model.layers.10.self_attn.q_proj.lora_B.default.weight', 'model.layers.10.self_attn.v_proj.lora_A.default.weight', 'model.layers.10.self_attn.v_proj.lora_B.default.weight', 'model.layers.11.self_attn.q_proj.lora_A.default.weight', 'model.layers.11.self_attn.q_proj.lora_B.default.weight', 'model.layers.11.self_attn.v_proj.lora_A.default.weight', 'model.layers.11.self

In [40]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  Translate english to hindi when the given input is english and hindi to english when the given input is hindi.
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)