<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/NLP_languageTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**NLP**: **(2 methods: (1) fine-tuning LLM and (2) model training from scratch)**

# language translation

For fine-tuning Meta-llama2 for language translation with low memory,you can use techniques like LoRA, QLoRA, 8-bit/4-bit quantization, and Retrieval-Augmented Generation (RAG).

In [66]:
from google.colab import drive
drive.mount('/content/drive')
# Read the API key from the file
with open("/content/drive/My Drive/hf_token.txt", "r") as f:
    hf_token = f.read().strip()

from huggingface_hub import login
login(hf_token)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
# Read the API key from the file
with open('/content/drive/MyDrive/wandb_key.txt', 'r') as f:
    wandb_key = f.read().strip()

# Set the W&B API key
import os
os.environ["WANDB_API_KEY"] = wandb_key

In [68]:
!pip install datasets
!pip install bitsandbytes accelerate transformers



In [69]:
import pandas as pd
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import torch, gc
from datasets import load_dataset
torch.cuda.empty_cache()
gc.collect()

7718

In [70]:
# Check if CUDA (GPU support) is available
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

# Get the name of the GPU being used
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU"
print("GPU Name:", gpu_name)

CUDA Available: True
GPU Name: Tesla T4


In [71]:
# Load dataset
dataset = load_dataset("rahular/itihasa")  # Example: sanskrit-eng translation

In [72]:
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

In [73]:
'''
# Load tokenizer and model
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
'''

'\n# Load tokenizer and model\nmodel_name = "meta-llama/Llama-2-7b-hf"\ntokenizer = LlamaTokenizer.from_pretrained(model_name)\nmodel = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)\n'

In [74]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50", quantization_config=quantization_config)


`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [75]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 75162
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 6149
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 11722
    })
})


In [76]:
dataset['train'][0]

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [77]:
import gc
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]
del dataset
gc.collect()

60

In [78]:
'''
import random
from random import sample

# Convert dataset to a list (this works for Hugging Face Datasets)
dataset_list = list(train_dataset)
# Now you can sample a fraction of it, for example, 20%
train_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.1))

dataset_list = list(val_dataset)
# Now you can sample a fraction of it, for example, 20%
val_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.2))
'''

'\nimport random\nfrom random import sample\n\n# Convert dataset to a list (this works for Hugging Face Datasets)\ndataset_list = list(train_dataset)\n# Now you can sample a fraction of it, for example, 20%\ntrain_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.1))\n\ndataset_list = list(val_dataset)\n# Now you can sample a fraction of it, for example, 20%\nval_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.2))\n'

In [79]:
print(train_dataset)

Dataset({
    features: ['translation'],
    num_rows: 75162
})


In [80]:
train_dataset[0]

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [81]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    # Extract English ('en') and Sanskrit ('sn') texts from the 'translation' field in the list
    inputs = tokenizer([example['en'] for example in examples['translation']], padding="max_length", truncation=True, max_length=64)
    labels = tokenizer([example['sn'] for example in examples['translation']], padding="max_length", truncation=True, max_length=64)

    # Add the labels as the 'input_ids' of the Sanskrit tokens
    inputs['labels'] = labels['input_ids']
    return inputs


In [82]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

In [83]:
torch.cuda.empty_cache()
gc.collect()

45

In [84]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=2,  # Rank for LoRA
    lora_alpha=8,  # Scaling factor for LoRA
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA to
    lora_dropout=0.1,  # Dropout for LoRA layers
    task_type=TaskType.SEQ_2_SEQ_LM,
    bias="none",  # Freeze biases
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move your model to the GPU
model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MBartForConditionalGeneration(
      (model): MBartModel(
        (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
        (encoder): MBartEncoder(
          (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
          (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
          (layers): ModuleList(
            (0-11): 12 x MBartEncoderLayer(
              (self_attn): MBartSdpaAttention(
                (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=2, bias=False)
                  )
        

In [85]:
torch.cuda.empty_cache()
gc.collect()

0

In [88]:
from transformers import EarlyStoppingCallback

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save results and model checkpoints
    evaluation_strategy="steps",  # Evaluation strategy (every `eval_steps` steps)
    save_strategy="steps",  # Save checkpoint every `save_steps`
    save_steps=250,  # Save model every 500 steps (adjust as needed)
    save_total_limit=2,  # Keep the latest 2 checkpoints
    logging_dir="./logs",  # Where to save logs
    logging_steps=250,  # Log every 500 steps
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    fp16=True,  # Enable mixed precision for memory optimization
    load_best_model_at_end=True,  # Automatically load the best model at the end of training
    max_steps=500,  # Train for exactly 10,000 steps
    eval_steps=250,  # Evaluate every 1000 steps
    dataloader_num_workers=4,  # Parallelize data loading
)

# Early stopping callback
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=callbacks,  # Add early stopping callback
)

# Start training
trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss
250,36.6001,8.684805
500,35.0419,8.572979




TrainOutput(global_step=500, training_loss=35.82101953125, metrics={'train_runtime': 935.8528, 'train_samples_per_second': 34.193, 'train_steps_per_second': 0.534, 'total_flos': 4337883414528000.0, 'train_loss': 35.82101953125, 'epoch': 0.4257130693912303})

In [89]:
# Evaluate the model on the test dataset
results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(f"Evaluation results on the test dataset: {results}")




Evaluation results on the test dataset: {'eval_loss': 8.587699890136719, 'eval_runtime': 240.8346, 'eval_samples_per_second': 48.672, 'eval_steps_per_second': 6.087, 'epoch': 0.4257130693912303}


In [90]:
model.save_pretrained("./fine_tuned_translation_model")
tokenizer.save_pretrained("./fine_tuned_translation_model")


('./fine_tuned_translation_model/tokenizer_config.json',
 './fine_tuned_translation_model/special_tokens_map.json',
 './fine_tuned_translation_model/sentencepiece.bpe.model',
 './fine_tuned_translation_model/added_tokens.json',
 './fine_tuned_translation_model/tokenizer.json')

In [93]:
del train_dataset, val_dataset, results
gc.collect()
torch.cuda.empty_cache()

NameError: name 'train_dataset' is not defined

In [112]:
def hi(examples):
    # Tokenize the English sentences (inputs) and return tensors
    inputs = tokenizer(examples['text'], padding="max_length", max_length=64, return_tensors="pt")
    return inputs

# Data: English sentences
data = ["I am very happy today and hungry too.", "Life is good.", "All is well."]

# Wrap it in a dataset
dataset = Dataset.from_dict({"text": data})

# Apply the preprocessing function
dataset = dataset.map(hi, batched=True)

# Get inputs from the dataset
inputs = dataset['input_ids']  # Extract the input ids after preprocessing

# Convert the list of input_ids into a tensor and move to the appropriate device
inputs_tensor = torch.tensor(inputs).to(device)  # Move to device (GPU or CPU)

# Move the model to the device
model.to(device)

# Make predictions
with torch.no_grad():
    outputs = model.generate(
    input_ids=inputs_tensor,
    max_length=128,
    num_beams=5,
    temperature=0.7,  # Adjust temperature for diversity
    top_p=0.9,  # nucleus sampling
    top_k=50,  # Limit the top-k tokens to sample from
    early_stopping=True
    )

# Decode the predictions (Sanskrit translations)
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the translations
for i, text in enumerate(decoded_preds):
    print(f"English: {data[i]}")
    print(f"Sanskrit Translation: {text}")
    print('-' * 50)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]



English: I am very happy today and hungry too.
Sanskrit Translation: I am very happy today and hungry too.
--------------------------------------------------
English: Life is good.
Sanskrit Translation: Life is good. Life is good.
--------------------------------------------------
English: All is well.
Sanskrit Translation: All is well. . .
--------------------------------------------------
