<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/NLP_predictText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**NLP**: **(2 methods: (1) fine-tuning LLM and (2) model training from scratch)**

# predict the answer to a question or complete a sentence

In [36]:
from google.colab import drive
drive.mount('/content/drive')
# Read the API key from the file
with open("/content/drive/My Drive/hf_token.txt", "r") as f:
    hf_token = f.read().strip()

from huggingface_hub import login
login(hf_token)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# Read the API key from the file
with open('/content/drive/MyDrive/wandb_key.txt', 'r') as f:
    wandb_key = f.read().strip()

# Set the W&B API key
import os
os.environ["WANDB_API_KEY"] = wandb_key

In [38]:
!pip install datasets



In [39]:
from transformers import Trainer, TrainingArguments
import torch, gc
from datasets import load_dataset, Dataset

torch.cuda.empty_cache()
gc.collect()

5016

In [40]:
# Login using e.g. `huggingface-cli login` to access this dataset
dataset = load_dataset("community-datasets/yahoo_answers_topics")
dataset["train"] = dataset["train"].select(range(80000))
dataset["test"] = dataset["test"].select(range(10000))

In [41]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 10000
    })
})


In [42]:
print(dataset['train'][0])

{'id': 0, 'topic': 4, 'question_title': "why doesn't an optical mouse work on a glass table?", 'question_content': 'or even on some surfaces?', 'best_answer': 'Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \\nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.'}


In [43]:
dataset = dataset.remove_columns(["id", "topic"])
print(dataset['train'][0])

{'question_title': "why doesn't an optical mouse work on a glass table?", 'question_content': 'or even on some surfaces?', 'best_answer': 'Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \\nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.'}


In [44]:
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Initialize the tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)


# Define a new model configuration (random weights, untrained)
config = T5Config(
    vocab_size=tokenizer.vocab_size,  # Matches tokenizer vocab size
    d_model=1024,  # Model hidden size
    num_layers=12,  # Number of encoder & decoder layers
    num_heads=12,  # Number of attention heads
    d_ff=2048,  # Feed-forward hidden layer size
    dropout_rate=0.1,  # Dropout for regularization
    decoder_start_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id

)
model = T5ForConditionalGeneration(config)  # No pretrained weights

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to preprocess and format data for T5
def preprocess_function(examples):
    inputs = [
        f"answer this: {title} {content}" if title and content else
        f"answer this: {title}" if title else
        f"answer this: {content}"
        for title, content in zip(examples["question_title"], examples["question_content"])
    ]

    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

    # Tokenize labels (answers)
    labels = tokenizer(examples["best_answer"], truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization and remove unnecessary columns
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["question_title", "question_content", "best_answer"])




Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [45]:
# **Training arguments optimized for memory**
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=40,  # Adjust as per GPU memory
    per_device_eval_batch_size=40,
    gradient_accumulation_steps=2,  # Simulates a larger batch size
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=1000,
    fp16=True,  # Enables mixed precision training
    gradient_checkpointing=True,  # Saves memory
    load_best_model_at_end=True,
    num_train_epochs=2,  # Adjust as needed
    #max_steps=200,  # Train only for 200 steps instead of completing the full dataset across one epoch or many.
    report_to="none"  # Avoid logging to external services
)

# **Initialize Trainer**
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [None]:
trainer.train()


Step,Training Loss,Validation Loss
1000,6.9549,3.040581


Step,Training Loss,Validation Loss
1000,6.9549,3.040581


In [None]:
# Define the preprocessing function
def hi(examples):
    # Tokenize the English sentences (inputs) and return tensors
    inputs = tokenizer(examples['text'], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    return inputs

# Data: English sentences
data = ["Why am I very happy today and hungry too?", "Is life good?", "All is well."]

# Wrap it in a dataset
dataset = Dataset.from_dict({"text": data})

# Apply the preprocessing function
dataset = dataset.map(hi, batched=True)

# Get inputs from the dataset
inputs = dataset['input_ids']  # Extract the input ids after preprocessing

# Convert the list of input_ids into a tensor and move to the appropriate device
inputs_tensor = torch.tensor(inputs).to(device)  # Move to device (GPU or CPU)

# Make predictions
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs_tensor,
        max_length=128,
        num_beams=5,
        temperature=0.7,  # Adjust temperature for diversity
        top_p=0.9,  # nucleus sampling
        top_k=50,  # Limit the top-k tokens to sample from
        early_stopping=True
    )

# Decode the predictions (Sanskrit translations)
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the translations
for i, text in enumerate(decoded_preds):
    print(f"Q: {data[i]}")
    print(f"Ans: {text}")
    print('-' * 50)


# **fine-tuning**