<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/NLP_languageTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP**: **(2 methods: (1) model training from scratch and (2) fine-tuning an LLM)**

# **(1) Model training from scratch**

In [None]:
from google.colab import drive
from huggingface_hub import login

# Mount Google Drive so the stored Hugging Face token file is reachable.
drive.mount('/content/drive')

# Read the token, trim surrounding whitespace, and authenticate with the Hub.
with open("/content/drive/My Drive/hf_token.txt", "r") as token_file:
    raw_token = token_file.read()
hf_token = raw_token.strip()
login(hf_token)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Load the Weights & Biases key from Drive (whitespace trimmed) and export it
# through the WANDB_API_KEY environment variable.
with open('/content/drive/MyDrive/wandb_key.txt', 'r') as key_file:
    wandb_key = key_file.read().strip()
os.environ["WANDB_API_KEY"] = wandb_key

In [None]:
!pip install datasets bitsandbytes accelerate transformers sacrebleu



In [None]:
from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import torch, gc
from datasets import load_dataset, Dataset

# Collect unreachable Python objects first so any CUDA tensors they still hold
# are released, then hand the now-unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

612

In [None]:
# Load dataset
# Each record carries a 'translation' dict with an English ('en') and a Sanskrit ('sn') string.
dataset = load_dataset("rahular/itihasa")  # Itihasa English-Sanskrit translation corpus

In [None]:
from transformers import T5Config, T5ForConditionalGeneration, T5TokenizerFast

# Define tokenizer (you can train a new one if needed)
# NOTE(review): this reuses the pretrained t5-small vocabulary. It presumably has little or no
# Devanagari coverage, so the Sanskrit targets may largely collapse to <unk> -- verify by
# decoding a few tokenized 'sn' strings, and consider training a tokenizer on this corpus.
tokenizer = T5TokenizerFast.from_pretrained("t5-small")  # Uses a small tokenizer to save memory

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of English/Sanskrit pairs for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` whose
            ``'translation'`` column is a list of ``{'en': ..., 'sn': ...}`` dicts.

    Returns:
        The tokenized English inputs (padded/truncated to 64 tokens) with an
        added ``'labels'`` entry holding the Sanskrit token ids, where every
        padding position is replaced by -100.
    """
    pairs = examples['translation']
    model_inputs = tokenizer(
        [pair['en'] for pair in pairs],
        padding="max_length", truncation=True, max_length=64,
    )
    targets = tokenizer(
        [pair['sn'] for pair in pairs],
        padding="max_length", truncation=True, max_length=64,
    )

    # -100 is the index ignored by the cross-entropy loss; without this the
    # model is also trained to predict the padding that fills each label row.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in targets['input_ids']
    ]
    return model_inputs

# Tokenize the three splits in one pass, in train / validation / test order.
train_dataset, val_dataset, test_dataset = [
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "validation", "test")
]



In [None]:
# **Compact T5 configuration for low-memory training from scratch**
config = T5Config(
    # Size the embedding table from the tokenizer so every id it can emit
    # (including the sentinel tokens above 32000) has a row.
    vocab_size=len(tokenizer),
    decoder_start_token_id=tokenizer.pad_token_id,  # decoding starts from the pad token
    d_model=64,        # hidden size (T5Config default is 512)
    num_layers=4,      # number of blocks (T5Config default is 6)
    num_heads=4,       # attention heads (T5Config default is 8)
    d_ff=1024,         # feed-forward width (T5Config default is 2048)
    dropout_rate=0.1
)

# Initialize model with random weights (no pretrained checkpoint is loaded)
model = T5ForConditionalGeneration(config)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32000, 64)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32000, 64)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=64, out_features=256, bias=False)
              (k): Linear(in_features=64, out_features=256, bias=False)
              (v): Linear(in_features=64, out_features=256, bias=False)
              (o): Linear(in_features=256, out_features=64, bias=False)
              (relative_attention_bias): Embedding(32, 4)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=64, out_features=1024, bias=False)
              (wo): Linear(in_features=1024, out_features=64, bias=False)
              (dropout): Dropout(p=0.

In [None]:
# Largest token id present in the tokenized training inputs; it must stay below
# the model's vocabulary size for the embedding lookup to be valid.
per_sequence_max = map(max, train_dataset['input_ids'])
max_index = max(per_sequence_max)
print(f"Max token index in dataset: {max_index}, Model vocab size: {model.config.vocab_size}")


Max token index in dataset: 31993, Model vocab size: 32000


In [None]:
# **Training arguments optimized for memory**
# One epoch is only ~626 optimizer steps here (75,162 examples / (60 * 2)), so the
# evaluation, checkpoint and logging intervals must be well below that. With the
# previous 1000-step intervals none of them ever fired, and `load_best_model_at_end`
# had no checkpoint to restore.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=60,  # Adjust as per GPU memory
    per_device_eval_batch_size=60,
    gradient_accumulation_steps=2,  # Effective batch size = 60 * 2
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate several times within the epoch
    save_strategy="steps",
    save_steps=100,  # Kept equal to eval_steps so the best checkpoint can be restored
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA device
    gradient_checkpointing=True,  # Saves memory
    load_best_model_at_end=True,
    num_train_epochs=1,  # Adjust as needed
    # max_steps=2000,  # Uncomment to cap training at 2000 optimizer steps instead of a full epoch
    report_to="none"  # Avoid logging to external services
)

# **Initialize Trainer**
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [None]:
# Run the training loop with the arguments and datasets given to the Trainer.
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=626, training_loss=5.298243513503395, metrics={'train_runtime': 7525.8976, 'train_samples_per_second': 9.987, 'train_steps_per_second': 0.083, 'total_flos': 52973402849280.0, 'train_loss': 5.298243513503395, 'epoch': 0.9992019154030327})

In [None]:
# **Score the trained model on the held-out test split**
results = trainer.evaluate(test_dataset)
print("Evaluation Results:", results)

# **Persist the trained model and its tokenizer in the same directory**
export_dir = "./english_to_sanskrit_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)



Evaluation Results: {'eval_loss': 2.215571165084839, 'eval_runtime': 402.9685, 'eval_samples_per_second': 29.089, 'eval_steps_per_second': 0.486, 'epoch': 0.9992019154030327}


('./english_to_sanskrit_model/tokenizer_config.json',
 './english_to_sanskrit_model/special_tokens_map.json',
 './english_to_sanskrit_model/spiece.model',
 './english_to_sanskrit_model/added_tokens.json',
 './english_to_sanskrit_model/tokenizer.json')

In [None]:
def translate(text):
    """Translate one English string with the trained model using beam search.

    Args:
        text: The English sentence to translate.

    Returns:
        The decoded output string with special tokens removed.
    """
    # The tokenizer returns CPU tensors; move them to wherever the model lives,
    # otherwise generate() fails once the model is on the GPU.
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=128,
            num_beams=5,
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print(translate("I am happy today."))  # Should return Sanskrit translation


                                                               


In [None]:

# Define the preprocessing function
def hi(examples):
    """Tokenize ``examples['text']`` to fixed-length (64 token) PyTorch tensors."""
    tokenize_kwargs = dict(
        padding="max_length",
        max_length=64,
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer(examples['text'], **tokenize_kwargs)

# Data: English sentences
data = ["I am very happy today and hungry too.", "Life is good.", "All is well."]

# Tokenize through a throwaway Dataset. It gets its own name so the `dataset`
# variable holding the Itihasa splits is not overwritten.
demo_dataset = Dataset.from_dict({"text": data}).map(hi, batched=True)

# Dataset.map stores plain lists, so rebuild tensors and move them to the device.
# The attention mask tells the encoder which positions are padding.
inputs_tensor = torch.tensor(demo_dataset['input_ids']).to(device)
attention_mask = torch.tensor(demo_dataset['attention_mask']).to(device)

# Make predictions with beam search. temperature / top_p / top_k only apply when
# sampling is enabled, so they are not passed here.
model.eval()
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs_tensor,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=5,
        early_stopping=True,
    )

# Decode the predictions (Sanskrit translations)
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print each source sentence next to its translation
for source_text, translation in zip(data, decoded_preds):
    print(f"English: {source_text}")
    print(f"Sanskrit Translation: {translation}")
    print('-' * 50)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]



English: I am very happy today and hungry too.
Sanskrit Translation:                                                                
--------------------------------------------------
English: Life is good.
Sanskrit Translation:   SIS
--------------------------------------------------
English: All is well.
Sanskrit Translation:  SIS
--------------------------------------------------


# **(2) fine-tuning LLM**

# language translation

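To fine-tune a large model such as Meta's Llama 2 for language translation with limited memory, you can use techniques like LoRA, QLoRA, 8-bit/4-bit quantization, and Retrieval-Augmented Generation (RAG). Note that the code below applies LoRA to the smaller `facebook/nllb-200-distilled-600M` translation model; the Llama 2 loading snippet is kept only as a disabled example.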

In [None]:
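# TODO: delete the variables left over from part (1) here (e.g. the model, trainer and tokenized datasets) to free memory; this cell currently does nothing.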

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import torch, gc
from datasets import load_dataset
# Collect unreachable Python objects first so any CUDA tensors they still hold
# are released, then hand the now-unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

118

In [None]:
# Report whether a CUDA device is visible to PyTorch and, if so, its name.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "No GPU"
print("GPU Name:", gpu_name)

CUDA Available: False
GPU Name: No GPU


In [None]:
# Load dataset
# Each record carries a 'translation' dict with an English ('en') and a Sanskrit ('sn') string.
dataset = load_dataset("rahular/itihasa")  # Itihasa English-Sanskrit translation corpus

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit loading options: float16 compute with double quantization enabled.
# NOTE(review): among the cells visible here, only the disabled mBART snippet passes
# this object to from_pretrained -- confirm whether it is still needed.
_bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**_bnb_options)

In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so none of it
# executes; the cell only evaluates to (and displays) that string.
'''
# Load tokenizer and model
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
'''

'\n# Load tokenizer and model\nmodel_name = "meta-llama/Llama-2-7b-hf"\ntokenizer = LlamaTokenizer.from_pretrained(model_name)\nmodel = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)\n'

In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so none of it
# executes; the cell only evaluates to (and displays) that string.
'''
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50", quantization_config=quantization_config)
'''

'\n# Load model directly\nfrom transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n\ntokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")\nmodel = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50", quantization_config=quantization_config)\n'

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq weights from the same NLLB-200 checkpoint.
nllb_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(nllb_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(nllb_checkpoint)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Show the available splits with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 75162
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 6149
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 11722
    })
})


In [None]:
# Inspect the first training record to see the 'translation' dict layout.
dataset['train'][0]

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [None]:
import gc
# Keep direct handles to the three splits, then drop the DatasetDict name.
train_dataset, val_dataset, test_dataset = [
    dataset[split_name] for split_name in ("train", "validation", "test")
]
del dataset
gc.collect()

140

In [None]:
# Disabled experiment: the subsampling code below sits inside a bare string literal, so it
# never runs and the full splits are used. As written it would keep 10% of the training
# split and 20% of the validation split.
'''
import random
from random import sample

# Convert dataset to a list (this works for Hugging Face Datasets)
dataset_list = list(train_dataset)
# Now you can sample a fraction of it, for example, 20%
train_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.1))

dataset_list = list(val_dataset)
# Now you can sample a fraction of it, for example, 20%
val_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.2))
'''

'\nimport random\nfrom random import sample\n\n# Convert dataset to a list (this works for Hugging Face Datasets)\ndataset_list = list(train_dataset)\n# Now you can sample a fraction of it, for example, 20%\ntrain_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.1))\n\ndataset_list = list(val_dataset)\n# Now you can sample a fraction of it, for example, 20%\nval_dataset = random.sample(dataset_list, int(len(dataset_list) * 0.2))\n'

In [None]:
# Confirm the training split's features and row count.
print(train_dataset)

Dataset({
    features: ['translation'],
    num_rows: 75162
})


In [None]:
# Inspect the first record of the training split.
train_dataset[0]

{'translation': {'en': 'The ascetic Vālmīki asked Nārada, the best of sages and foremost of those conversant with words, ever engaged in austerities and Vedic studies.',
  'sn': 'ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। नारदं परिपप्रच्छ वाल्मीकिर्मुनिपुङ्गवम्॥'}}

In [None]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NLLB marks every sequence with a language code. Declare both sides so the English
# inputs are tagged eng_Latn and the Sanskrit labels san_Deva.
tokenizer.src_lang = "eng_Latn"
tokenizer.tgt_lang = "san_Deva"

def preprocess_function(examples):
    """Tokenize a batch of English/Sanskrit pairs for seq2seq fine-tuning.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` whose
            ``'translation'`` column is a list of ``{'en': ..., 'sn': ...}`` dicts.

    Returns:
        The tokenized English inputs (padded/truncated to 64 tokens) with a
        ``'labels'`` entry holding the Sanskrit token ids, where every padding
        position is replaced by -100.
    """
    pairs = examples['translation']
    # text_target makes the tokenizer encode the Sanskrit side in target mode
    # (using tgt_lang) and return it under 'labels'.
    model_inputs = tokenizer(
        [pair['en'] for pair in pairs],
        text_target=[pair['sn'] for pair in pairs],
        padding="max_length",
        truncation=True,
        max_length=64,
    )

    # -100 is the index ignored by the cross-entropy loss; without this the
    # model is also trained to predict the padding that fills each label row.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in model_inputs['labels']
    ]
    return model_inputs


In [None]:
# Tokenize each split in turn and rebind the three names to the processed versions.
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

Map:   0%|          | 0/75162 [00:00<?, ? examples/s]

Map:   0%|          | 0/6149 [00:00<?, ? examples/s]

Map:   0%|          | 0/11722 [00:00<?, ? examples/s]

In [None]:
# Collect unreachable Python objects first so any CUDA tensors they still hold
# are released, then hand the now-unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

81

In [None]:
# LoRA adapter settings: rank-2 updates on the attention query and value projections.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj", "v_proj"],  # Layers that receive adapters
    r=2,  # Adapter rank
    lora_alpha=8,  # Adapter scaling factor
    lora_dropout=0.1,  # Dropout inside the adapter path
    bias="none",  # Biases stay frozen
)

# Wrap the base model with the adapters, then place it on the GPU when one exists.
model = get_peft_model(model, lora_config)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

ERROR:bitsandbytes.cextension:Could not load bitsandbytes native library: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 454, in LoadLibrary
    return self._dlltype(name)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): M2M100ForConditionalGeneration(
      (model): M2M100Model(
        (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
        (encoder): M2M100Encoder(
          (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
          (embed_positions): M2M100SinusoidalPositionalEmbedding()
          (layers): ModuleList(
            (0-11): 12 x M2M100EncoderLayer(
              (self_attn): M2M100SdpaAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=2, bias=False)
                  )
                  (

In [None]:
# Collect unreachable Python objects first so any CUDA tensors they still hold
# are released, then hand the now-unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

0

In [None]:
from transformers import EarlyStoppingCallback

# Define the training arguments
# One epoch is only ~939 optimizer steps here (75,162 examples / (20 * 4)). With the
# previous 1000-step intervals no evaluation or checkpoint ever happened, so early
# stopping and `load_best_model_at_end` had nothing to act on.
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save results and model checkpoints
    evaluation_strategy="steps",  # Evaluate every `eval_steps` steps
    eval_steps=100,  # Several evaluations per epoch
    save_strategy="steps",  # Save a checkpoint every `save_steps` steps
    save_steps=100,  # Kept equal to eval_steps so the best checkpoint can be restored
    save_total_limit=2,  # Limit how many checkpoints are kept on disk
    logging_dir="./logs",  # Where to save logs
    logging_steps=100,  # Log the training loss every 100 steps
    per_device_train_batch_size=20,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,  # Effective batch size = 20 * 4
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA device
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    num_train_epochs=1,  # Adjust as needed
    # max_steps=2000,  # Uncomment to cap training at 2000 optimizer steps
    dataloader_num_workers=4,  # Parallelize data loading
)

# Stop once the evaluation metric has failed to improve for 3 consecutive evaluations
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=callbacks,  # Add early stopping callback
)

# Start training
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mapoorvapu[0m ([33mapoorvapu-ohio-state-buckeyes[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
# Score the fine-tuned model on the held-out test split and show the metrics.
results = trainer.evaluate(test_dataset)
summary_line = f"Evaluation results on the test dataset: {results}"
print(summary_line)


In [None]:
# Write the model and its tokenizer into one directory.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably stores only
# the adapter weights rather than the full base model -- confirm before reloading.
output_path = "./fine_tuned_translation_model"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)


In [None]:
# Drop the tokenized train/validation splits and the metrics dict, then reclaim memory.
del train_dataset
del val_dataset
del results
gc.collect()
torch.cuda.empty_cache()

In [None]:
def hi(examples):
    """Tokenize ``examples['text']`` to fixed-length (64 token) PyTorch tensors.

    Args:
        examples: A batch dict whose ``'text'`` entry is a list of English sentences.

    Returns:
        The tokenizer output for the batch, padded and truncated to 64 tokens.
    """
    # truncation=True keeps every row at exactly 64 tokens; without it a longer
    # sentence yields a ragged batch that cannot be stacked into a tensor.
    inputs = tokenizer(
        examples['text'],
        padding="max_length",
        max_length=64,
        truncation=True,
        return_tensors="pt",
    )
    return inputs

from datasets import Dataset  # part (2) only imports load_dataset, so bring Dataset in here

# Data: English sentences
data = ["I am very happy today and hungry too.", "Life is good.", "All is well."]

# Tokenize the sentences through a small throwaway Dataset.
demo_dataset = Dataset.from_dict({"text": data}).map(hi, batched=True)

# Dataset.map stores plain lists, so rebuild tensors and move them to the device.
# The attention mask tells the encoder which positions are padding.
inputs_tensor = torch.tensor(demo_dataset['input_ids']).to(device)
attention_mask = torch.tensor(demo_dataset['attention_mask']).to(device)

# Move the model to the device and switch off dropout for inference
model.to(device)
model.eval()

# Make predictions with beam search. NLLB chooses its output language from the first
# generated token, so force the Sanskrit language code. temperature / top_p / top_k
# only apply when sampling is enabled, so they are not passed here.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs_tensor,
        attention_mask=attention_mask,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("san_Deva"),
        max_length=128,
        num_beams=5,
        early_stopping=True,
    )

# Decode the predictions (Sanskrit translations)
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print each source sentence next to its translation
for source_text, translation in zip(data, decoded_preds):
    print(f"English: {source_text}")
    print(f"Sanskrit Translation: {translation}")
    print('-' * 50)

# Compare the results with Few-Shot Learning with Prompt Engineering
