## Objective-Based Fine-Tuning

In [None]:
!pip install --upgrade pip
!pip install --upgrade transformers
!pip install --upgrade datasets

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m89.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.54.1
    Uninstalling transformers-4.54.1:
      Successfully uninstalled transformers-4.54.1
Successfully installed transformers-4.55.0


### DAFT / Unsupervised Fine-Tuning

In [None]:
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Small in-memory corpus of one-sentence technology statements.
# Each string becomes one row of the `text` column of the dataset that is
# built from this list further down in this cell.
corpus = [
    "Quantum computing is the future of technology.",
    "Artificial intelligence is transforming industries across the globe.",
    "Machine learning models improve with larger datasets and better features.",
    "Neural networks are the foundation of modern deep learning.",
    "The Large Hadron Collider was built to explore particle physics.",
    "Natural language processing enables machines to understand human speech.",
    "Data privacy is crucial in the age of big data.",
    "Edge computing reduces latency by processing data closer to the source.",
    "Blockchain technology enables decentralized trustless systems.",
    "The Turing Test measures a machine's ability to exhibit intelligent behavior.",
    "Augmented reality is being used in education, healthcare, and entertainment.",
    "Reinforcement learning allows agents to learn by trial and error.",
    "Genetic algorithms mimic natural selection for optimization problems.",
    "Speech recognition has advanced rapidly with transformer models.",
    "The quantum bit, or qubit, can exist in multiple states at once.",
    "Data lakes store raw, unstructured, and structured data.",
    "Autonomous vehicles rely heavily on computer vision systems.",
    "Cybersecurity threats are increasing with the rise of IoT devices.",
    "The metaverse could redefine how humans interact digitally.",
    "HPC (High Performance Computing) powers climate modeling and simulations.",
    "Supervised learning requires labeled datasets for training.",
    "Self-supervised learning is gaining popularity in NLP.",
    "Transfer learning reduces the amount of data needed for training.",
    "GPT models can generate human-like text based on prompts.",
    "Cloud-native architectures are scalable and resilient.",
    "Smart contracts execute automatically on blockchain platforms.",
    "Facial recognition systems have raised ethical concerns.",
    "The no-code movement democratizes app development.",
    "Differential privacy helps protect individual identities in datasets.",
    "Sustainable AI focuses on reducing energy consumption in training models.",
    "AI in healthcare is used for diagnostics, imaging, and personalized medicine.",
    "Bioinformatics uses computational tools to analyze biological data.",
    "Text embeddings map words into high-dimensional vector spaces.",
    "Zero-shot learning enables models to generalize to unseen tasks.",
    "TinyML enables machine learning on low-power microcontrollers.",
    "Explainable AI (XAI) makes model predictions more interpretable.",
    "GANs (Generative Adversarial Networks) generate realistic images and audio.",
    "Vision transformers are state-of-the-art for image classification.",
    "Federated learning allows models to train across decentralized data.",
    "Few-shot learning requires only a few examples to adapt.",
    "Deepfake technology raises questions about misinformation.",
    "Semantic search improves search accuracy using embeddings.",
    "Swarm robotics involves multiple robots working collaboratively.",
    "Prompt engineering improves model outputs in large language models.",
    "A/B testing helps validate product and UX changes.",
    "Synthetic data generation is useful for model training and privacy.",
    "Edge AI brings intelligence to mobile and embedded devices.",
    "Multimodal models can understand text, images, and audio together.",
    "Language models are trained using next-token prediction tasks."
]

from datasets import Dataset

# Wrap the raw sentences in a dataset with a single `text` column.
text_dataset = Dataset.from_dict({"text": corpus})

# The same checkpoint name is used for both the tokenizer and the masked-LM model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)


def tokenize_corpus(example):
    """Tokenize the `text` field of one row.

    The text is truncated and padded to a fixed length of 64 tokens, and the
    tokenizer's output is returned unchanged.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **encoding_options)


# Applied one row at a time (no `batched=True`).
tokenized_corpus = text_dataset.map(tokenize_corpus)

# Single-epoch run with batches of 8; logging, checkpoint saving and
# external reporting are all switched off.
args = TrainingArguments(
    output_dir="./results_uft",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    report_to="none",
    save_strategy="no",
    logging_strategy="no",
)

# Collator configured for masked language modeling (`mlm=True`).
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_corpus,
    data_collator=mlm_collator,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=7, training_loss=3.214120319911412, metrics={'train_runtime': 44.0303, 'train_samples_per_second': 1.113, 'train_steps_per_second': 0.159, 'total_flos': 811937081088.0, 'train_loss': 3.214120319911412, 'epoch': 1.0})

In [None]:
# Quick reference of Trainer methods, written as a bare string literal.
# The string is not assigned to any name, so no other cell depends on it.
'''
.train() → to train the model.

.evaluate() → to evaluate the model.

.predict() → to generate predictions.

.save_model() → to save the fine-tuned model.
'''

In [None]:
# Write the model first, then the tokenizer, into the same directory.
save_path = "./dapt_distilbert_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

from transformers import AutoTokenizer, AutoModelForMaskedLM

# Read both back from disk; `tokenizer` and `model` now refer to the reloaded copies.
load_path = "./dapt_distilbert_model"
tokenizer = AutoTokenizer.from_pretrained(load_path)
model = AutoModelForMaskedLM.from_pretrained(load_path)

In [None]:
import torch

# Prompt containing a [MASK] placeholder for the model to fill in
text = "Quantum computing is the cradle of [MASK]."

# Encode the prompt as PyTorch tensors
inputs = tokenizer(text, return_tensors="pt")

# Boolean map of the positions holding the mask token; stop early if there are none
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
if not is_mask.any():
    raise ValueError("No [MASK] token found in the input.")

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Sequence positions (second axis of `input_ids`) where the mask token occurs
mask_token_index = is_mask.nonzero(as_tuple=True)[1]

# Ids of the 5 highest-scoring vocabulary entries at the first masked position
top_k = 5
mask_logits = logits[0, mask_token_index, :]
top_tokens = mask_logits.topk(top_k, dim=1).indices[0].tolist()

# Print every candidate on its own line
print("Top predictions for [MASK]:")
print("\n".join(tokenizer.decode([token_id]) for token_id in top_tokens))


Top predictions for [MASK]:
computing
learning
science
innovation
mathematics


### Supervised Fine-Tuning

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# IMDB reviews: shuffle each split with a fixed seed and keep only a small slice
# (200 training rows, 50 test rows).
dataset = load_dataset("imdb")
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(200))
small_test_dataset = shuffled_test.select(range(50))

# Tokenizer and a two-label sequence classifier from the same checkpoint
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


def tokenize(batch):
    """Tokenize the `text` entries of a batch with truncation and max-length padding.

    NOTE(review): no `max_length` is passed, so the target length is left to the
    tokenizer's defaults -- presumably the checkpoint's maximum sequence length,
    which would make every example full-length. Confirm, and consider an
    explicit `max_length` if training time matters.
    """
    encoded = tokenizer(batch["text"], truncation=True, padding="max_length")
    return encoded


# Batched mapping over both subsets
tokenized_train = small_train_dataset.map(tokenize, batched=True)
tokenized_test = small_test_dataset.map(tokenize, batched=True)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Convert the sampled test subset to a DataFrame and show up to its first 20 rows.
df = small_test_dataset.to_pandas()
df.head(n=20)

Unnamed: 0,text,label
0,<br /><br />When I unsuspectedly rented A Thou...,1
1,This is the latest entry in the long series of...,1
2,This movie was so frustrating. Everything seem...,0
3,"I was truly and wonderfully surprised at ""O' B...",1
4,This movie spends most of its time preaching t...,0
5,After a very long time Marathi cinema has come...,1
6,"This is a really sad, and touching movie! It d...",1
7,Don't pay any attention to the rave reviews of...,0
8,Porn legend Gregory Dark directs this cheesy h...,0
9,This was a great movie. Something not only for...,1


In [None]:
# One epoch with batches of 8 for both training and evaluation;
# checkpoint saving, logging and external reporting are disabled.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="none",
    logging_strategy="no",
    save_strategy="no",
)

# Collect the Trainer inputs in one place before constructing it.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)
trainer = Trainer(**trainer_options)

trainer.train()

Step,Training Loss


TrainOutput(global_step=25, training_loss=0.6819244384765625, metrics={'train_runtime': 683.5758, 'train_samples_per_second': 0.293, 'train_steps_per_second': 0.037, 'total_flos': 26493479731200.0, 'train_loss': 0.6819244384765625, 'epoch': 1.0})

In [None]:
from datasets import Dataset

# One hand-written review wrapped in a single-row dataset
sample_reviews = ["The movie was hillarious and worth time spending."]
predict_dataset = Dataset.from_dict({"text": sample_reviews})


def _encode_reviews(batch):
    """Tokenize a batch of reviews with truncation and padding enabled."""
    return tokenizer(batch["text"], truncation=True, padding=True)


# Tokenize, then drop the raw `text` column so only the encoded fields remain
predict_dataset = predict_dataset.map(_encode_reviews, batched=True)
predict_dataset = predict_dataset.remove_columns(["text"])

# Run inference through the trainer and take the arg-max over the last axis
predictions = trainer.predict(predict_dataset)
predicted_classes = predictions.predictions.argmax(axis=-1)
print(predicted_classes)  # class index for each input row

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

[1]


### Instruction based fine tuning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Use Alpaca-style dataset structure (mocked here): every record has the keys
# `instruction`, `input` and `output`. All rows share the same instruction, so
# only the (input, output) pairs are listed and the records are derived from them.
instruction_text = "Translate to French"
translation_pairs = [
    ("Good morning", "Bonjour"),
    ("How are you?", "Comment ça va ?"),
    ("I am learning AI", "J'apprends l'intelligence artificielle"),
    ("What is your name?", "Comment tu t'appelles ?"),
    ("Thank you very much", "Merci beaucoup"),
    ("I love programming", "J'adore programmer"),
    ("Where is the train station?", "Où est la gare ?"),
    ("This is my book", "C'est mon livre"),
    ("See you tomorrow", "À demain"),
    ("She is a doctor", "Elle est médecin"),
    # Add more like this up to 100 or more diverse rows
]
data = [
    {"instruction": instruction_text, "input": source_text, "output": target_text}
    for source_text, target_text in translation_pairs
]

from datasets import Dataset
instruction_dataset = Dataset.from_list(data)

# Causal LM and tokenizer loaded from the same checkpoint
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

from transformers import default_data_collator


def format_and_tokenize(example):
    """Render one instruction record as a prompt and tokenize it for causal-LM training.

    Args:
        example: mapping with the string fields `instruction`, `input` and `output`.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        `labels` entry: a copy of `input_ids` in which padded positions are
        replaced by -100 so they do not contribute to the loss.
    """
    prompt = (
        f"### Instruction:\n{example['instruction']}\n"
        f"### Input:\n{example['input']}\n"
        f"### Response:\n{example['output']}"
    )
    # Terminate every sample with EOS so the model is trained to stop after the
    # response instead of continuing with arbitrary text. (If a sample were long
    # enough to be truncated at 128 tokens, the EOS would be cut off with it.)
    encoded = tokenizer(
        prompt + tokenizer.eos_token,
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # In this notebook the pad token is set to the EOS token, so padding cannot be
    # told apart from the real EOS by token id. Use the attention mask instead:
    # real tokens (mask == 1, including the appended EOS) keep their id as label,
    # padding (mask == 0) is ignored via -100.
    encoded["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


tokenized_data = instruction_dataset.map(format_and_tokenize)

# Five epochs with batches of 4; logging, checkpoint saving and reporting are disabled.
args = TrainingArguments(
    output_dir="./results_instruction",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_strategy="no",
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data,
    # The labels are already prepared in `format_and_tokenize`. A language-modeling
    # collator would rebuild them from `input_ids` and mask every pad-id position,
    # which (pad == EOS) would also hide the real EOS from the loss.
    data_collator=default_data_collator
)

trainer.train()


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=15, training_loss=1.2933970133463542, metrics={'train_runtime': 120.4163, 'train_samples_per_second': 0.415, 'train_steps_per_second': 0.125, 'total_flos': 3265088716800.0, 'train_loss': 1.2933970133463542, 'epoch': 5.0})

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
instruction_model_dir = "./results_instruction"
trainer.save_model(instruction_model_dir)
tokenizer.save_pretrained(instruction_model_dir)

# Reload both from that directory, rebinding `tokenizer` and `model`
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(instruction_model_dir)
model = AutoModelForCausalLM.from_pretrained(instruction_model_dir)


In [None]:
# Prompt in the same "### Instruction / ### Input / ### Response" layout used for
# training, with the response left empty for the model to complete.
prompt = """### Instruction:
Translate to French
### Input:
Where is the airport?
### Response:"""

inputs = tokenizer(prompt, return_tensors="pt")

# Beam search (3 beams, no sampling) with early stopping; EOS doubles as the pad id.
generation_options = dict(
    max_length=100,
    do_sample=False,
    num_beams=3,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **generation_options)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Keep the text after the last "### Response:" marker, trim surrounding
# whitespace, and take everything up to the first newline.
after_marker = generated_text.rpartition("### Response:")[2]
response = after_marker.strip().partition("\n")[0]
print("Model response:", response)