In [1]:
from datasets import load_dataset
HF_TOKEN = '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'
# Load the OPUS-100 dataset for English to Hindi
dataset = load_dataset("opus100", "en-hi")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display 5 rows from the training set
for i in range(5):
    print(f"Row {i + 1}:")
    print(f"English: {dataset['train'][i]['translation']['en']}")
    print(f"Hindi: {dataset['train'][i]['translation']['hi']}")
    print()

Row 1:
English: Other, Private Use
Hindi: अन्य, निज़ी उपयोग

Row 2:
English: [SCREAMING]
Hindi: ऊबड़ .

Row 3:
English: Spouse
Hindi: जीवनसाथी

Row 4:
English: I will never salute you!
Hindi: - तुम एक कमांडर कभी नहीं होगा!

Row 5:
English: and the stars and the trees bow themselves;
Hindi: और तारे और वृक्ष सजदा करते है;



In [3]:
# Convert to Pandas DataFrame
import pandas as pd
opus_df = pd.DataFrame({
    "english": [entry["en"] for entry in dataset["train"]["translation"]],
    "hindi": [entry["hi"] for entry in dataset["train"]["translation"]]
})

# Display the first few rows
print("First few rows from OPUS-100:")
print(opus_df.head())

First few rows from OPUS-100:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vaibhavkumar11/hindi-english-parallel-corpus")

print("Path to dataset files:", path)

Path to dataset files: /home/ubuntu/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1


In [5]:
import os

# Define the path to the dataset
dataset_path = "/home/ubuntu/.cache/kagglehub/datasets/vaibhavkumar11/hindi-english-parallel-corpus/versions/1"

# List files in the dataset directory
files = os.listdir(dataset_path)
print("Files in the dataset:", files)


Files in the dataset: ['hindi_english_parallel.csv']


In [6]:
files

['hindi_english_parallel.csv']

In [7]:
import pandas as pd

# Path to the dataset file
dataset_file = os.path.join(dataset_path, "hindi_english_parallel.csv")

# Load the CSV file
df = pd.read_csv(dataset_file)



In [8]:
# Inspect the first few rows
print(df.head())

# Access Hindi and English columns
hindi_sentences = df['hindi']
english_sentences = df['english']

                                               hindi  \
0    अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें   
1                    एक्सेर्साइसर पहुंचनीयता अन्वेषक   
2              निचले पटल के लिए डिफोल्ट प्लग-इन खाका   
3               ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका   
4  उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...   

                                          english  
0  Give your application an accessibility workout  
1               Accerciser Accessibility Explorer  
2  The default plugin layout for the bottom panel  
3     The default plugin layout for the top panel  
4  A list of plugins that are disabled by default  


In [9]:
# Combine the two datasets
combined_df = pd.concat([opus_df, df], ignore_index=True)

# Display the size of the combined dataset
print(f"Combined dataset size: {combined_df.shape}")

# Display the first few rows of the combined dataset
print("First few rows of the combined dataset:")
print(combined_df.head())


Combined dataset size: (2096160, 2)
First few rows of the combined dataset:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [10]:
# Remove duplicate rows
combined_df = combined_df.drop_duplicates()

# Display the size after removing duplicates
print(f"Dataset size after removing duplicates: {combined_df.shape}")


Dataset size after removing duplicates: (1635847, 2)


In [11]:
import re

# Function to check if a string contains Hindi characters
def contains_hindi(text):
    if not isinstance(text, str):  # Ensure the input is a string
        return False
    hindi_pattern = re.compile('[\u0900-\u097F]')
    return bool(hindi_pattern.search(text))

# Apply the function to the English column
combined_df['hindi_in_english'] = combined_df['english'].apply(contains_hindi)

# Display rows where Hindi words are found in the English column
hindi_in_english_rows = combined_df[combined_df['hindi_in_english']]
print(f"Number of rows with Hindi in English column: {len(hindi_in_english_rows)}")
print(hindi_in_english_rows.head())


Number of rows with Hindi in English column: 3049
                                                  english  \
973263          2. Infection caused by germs. 2. जीवाणुओं   
973343  This position is similar to armchair. In this ...   
973383  Many countries in the [unclear], they need leg...   
973483  Virtually all groups of plants and animals, an...   
973518  Let 's first review what we know does not and ...   

                                                    hindi  hindi_in_english  
973263                             द्वारा संक्रामण होना।               True  
973343  यह हत्थाकुर्सी से मिलती जुलती पोजीशन है इसमें ...              True  
973383          के बहुत सारे राष्ट्रों को मान्यता चाहिए.               True  
973483  पौधों तथा जानवरों के सभी समूहों और उनके अन्दर ...              True  
973518                                  समस्या” का समाधान              True  


In [12]:
# Function to check if a string contains English characters
def contains_english(text):
    if not isinstance(text, str):  # Ensure the input is a string
        return False
    english_pattern = re.compile('[A-Za-z]')
    return bool(english_pattern.search(text))

# Apply the function to the Hindi column
combined_df['english_in_hindi'] = combined_df['hindi'].apply(contains_english)

# Display rows where English words are found in the Hindi column
english_in_hindi_rows = combined_df[combined_df['english_in_hindi']]
print(f"Number of rows with English in Hindi column: {len(english_in_hindi_rows)}")
print(english_in_hindi_rows.head())


Number of rows with English in Hindi column: 116543
                                      english  \
5        _Download Messages for Offline Usage   
8   The application '%s' could not be created   
17                                  Kennebunk   
20                          FIB(9) returns 34   
25                                Third power   

                                                hindi  hindi_in_english  \
5       ऑफ़लाइन प्रयोग के लिए संदेश डाउनलोड करें (_D)             False   
8   अनुप्रयोग '%s' के लिए इस्तेमाल किया जा के लिए ...             False   
17  केनेबंकCity name (optional, probably does not ...             False   
20                          FIB( 9) का परिणाम होगा 34             False   
25                       तृतीय घातx to the power of y             False   

    english_in_hindi  
5               True  
8               True  
17              True  
20              True  
25              True  


In [13]:
# Drop the columns 'hindi_in_english' and 'english_in_hindi' from the DataFrame
cleaned_df = combined_df.drop(columns=['hindi_in_english', 'english_in_hindi'])

# Display the size of the cleaned dataset
print(f"Dataset size after removing unwanted columns: {len(cleaned_df)}")

# Inspect the first few rows of the cleaned dataset
print("First few rows of the cleaned dataset:")
print(cleaned_df.head())


Dataset size after removing unwanted columns: 1635847
First few rows of the cleaned dataset:
                                       english                           hindi
0                           Other, Private Use               अन्य, निज़ी उपयोग
1                                  [SCREAMING]                          ऊबड़ .
2                                       Spouse                        जीवनसाथी
3                     I will never salute you!  - तुम एक कमांडर कभी नहीं होगा!
4  and the stars and the trees bow themselves;  और तारे और वृक्ष सजदा करते है;


In [76]:
# 9B LR - 2e-4

In [14]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Randomly sample 10,000 rows from cleaned_df
df = cleaned_df.sample(n=10000, random_state=42)

# Rename columns for consistency
df = df.rename(columns={'hindi': 'source', 'english': 'target'})

# Drop missing values
df = df.dropna()

# Split dataset into train, validation, and test
train_df = df.sample(frac=0.8, random_state=42)
remaining_df = df.drop(train_df.index)
valid_df = remaining_df.sample(frac=0.5, random_state=42)
test_df = remaining_df.drop(valid_df.index)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9974 entries, 1293567 to 2089657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  9974 non-null   object
 1   source  9974 non-null   object
dtypes: object(2)
memory usage: 491.8+ KB


In [16]:
# Load tokenizer
access_token= "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"
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it",token=access_token)

# Preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(
        examples['source'],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    targets = tokenizer(
        examples['target'],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

# Preprocess datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set dataset format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 7979/7979 [00:00<00:00, 8354.74 examples/s]
Map: 100%|██████████| 998/998 [00:00<00:00, 10162.21 examples/s]
Map: 100%|██████████| 997/997 [00:00<00:00, 10609.82 examples/s]


In [17]:
from transformers import AutoModelForCausalLM

access_token = "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"  # Replace with your actual Hugging Face token

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",         # Correct model name
    device_map="auto",            # Automatically map to available devices
    torch_dtype=torch.float16,   # Use bfloat16 for efficiency
    use_auth_token=access_token   # Pass your access token for authentication
)


2024-12-09 08:40:44.295101: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-09 08:40:44.315315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-09 08:40:44.340767: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-09 08:40:44.347988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 08:40:44.362862: I tensorflow/core/platform/cpu_feature_guar

In [20]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer

In [22]:
# Define LoRA configuration
# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,  # Task type for causal language modeling
#     inference_mode=False,         # Fine-tune mode
#     r=8,                          # Low-rank dimension
#     lora_alpha=32,                # Scaling factor
#     lora_dropout=0.1,             # Dropout for LoRA layers
# )

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    #target_modules=modules,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [23]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_steps=499,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=499,
    load_best_model_at_end=True,
    fp16=True,
    push_to_hub=False,
    save_safetensors=False  # Use PyTorch save format
)




In [25]:
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

model.tie_weights = lambda: None  # Temporarily disable shared weights
trainer.train()                   # Proceed with training
model.tie_weights()               # Re-enable shared weights after training

# Save the model after training
trainer.save_model("./results")  # Explicitly save the model




Step,Training Loss,Validation Loss
499,2.8564,2.670529
998,2.4157,2.540451
1497,2.2154,2.559234
1996,2.0264,2.624841
2495,1.837,2.706093
2994,1.6664,2.864666
3493,1.5086,3.027435
3992,1.361,3.206076
4491,1.2436,3.403177
4990,1.1537,3.544007


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


In [26]:
import accelerate
print(accelerate.__version__)

1.2.0.dev0


In [29]:
import pandas as pd
from datasets import Dataset
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Load the test data from the CSV file
test_data_path = "test_file.csv"  # Path to your test CSV file
test_df = pd.read_csv(test_data_path)

# Ensure the columns match expectations
test_df = test_df.rename(columns={"Source": "source", "Target": "target"})

# Convert the DataFrame to a Hugging Face Dataset
raw_test_dataset = Dataset.from_pandas(test_df)

# Function to generate predictions
def generate_translation(model, tokenizer, text, max_length=256):
    prompt = f"Translate the following Hindi text to English:\n\n{text}\n\nEnglish Translation:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length).to("cuda")
    outputs = model.generate(inputs.input_ids, max_new_tokens=100, num_beams=5, early_stopping=True)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation.split("English Translation:")[-1].strip()  # Remove prompt if included

# Generate predictions and references
predictions = []
references = []

print("Generating predictions for BLEU evaluation...")
for example in tqdm(raw_test_dataset):  # Iterate over the test dataset
    source_text = example["source"]  # Raw Hindi text
    reference_text = example["target"]  # Reference English translation
    prediction = generate_translation(model, tokenizer, source_text)
    predictions.append(prediction)
    references.append([reference_text])  # Wrap the reference in a list

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)
print("\nBLEU Score:", bleu_score)


Generating predictions for BLEU evaluation...


100%|██████████| 49/49 [00:34<00:00,  1.40it/s]


BLEU Score: {'bleu': 0.8884162322663213, 'precisions': [0.9571984435797666, 0.9182692307692307, 0.8742138364779874, 0.8363636363636363], 'brevity_penalty': 0.9922481009857891, 'length_ratio': 0.9922779922779923, 'translation_length': 257, 'reference_length': 259}



