In [1]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:

import json
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import evaluate
import torch

In [3]:
import json

# Open the uploaded JSON file
with open("data.json") as f:
    data = json.load(f)

# Format the dataset as "Q: question \nA: answer"
dataset = [
    {"text": f"Q: {item['prompt']}\nA: {item['completion']}"}
    for item in data
]

# View the first couple of items
dataset[:2]

[{'text': 'Q: What is 42 Amman?\nA: 42 Amman is a tuition-free coding school in Amman, Jordan, launched on February 25, 2024, by the Crown Prince Foundation under HRH Crown Prince Hussein. Part of the global 42 Network, it offers peer-to-peer, project-based learning to train Jordanian youth in digital skills, targeting 1,300 students over five years.'},
 {'text': 'Q: What is 42 Irbid?\nA: 42 Irbid is a tuition-free coding school in Irbid, Jordan, funded with 572,000 JD in November 2024 by the Ministry of Planning and Crown Prince Foundation. It extends the 42 Network’s peer-to-peer model to northern Jordan, training youth in globally recognized tech skills.'}]

In [4]:
from datasets import Dataset
train_data, eval_data = train_test_split(dataset, test_size=0.1, random_state=42)
print(f"Training on {len(train_data)} examples, evaluating on {len(eval_data)} examples")

Training on 3141 examples, evaluating on 349 examples


In [5]:
# Convert to HF datasets
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

In [6]:
from transformers import AutoTokenizer

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

def tokenize_function(example):
    result = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    result["labels"] = result["input_ids"].clone()  # 👈 this is the key line
    return result

# Process datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/3141 [00:00<?, ? examples/s]

Map:   0%|          | 0/349 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(checkpoint)
model.resize_token_embeddings(len(tokenizer))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=10,
    num_train_epochs=100,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="tensorboard",
    fp16=True,  # Mixed precision training
)

# 6. Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal LM, not masked LM
)

# Update trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [11]:
# 9. Train model
trainer.train()

Step,Training Loss,Validation Loss
100,3.0343,2.654956
200,2.0553,2.037369
300,1.7025,1.736232
400,1.4438,1.538912
500,1.3534,1.445961
600,1.0263,1.397215
700,0.9293,1.374355
800,0.8389,1.360474
900,0.8346,1.368159
1000,0.7765,1.402297


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1300, training_loss=1.3194193011063795, metrics={'train_runtime': 1337.3655, 'train_samples_per_second': 234.865, 'train_steps_per_second': 2.318, 'total_flos': 4259463655587840.0, 'train_loss': 1.3194193011063795, 'epoch': 41.92063492063492})

In [12]:
# 10. Evaluate final model
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

Final evaluation results: {'eval_loss': 1.3604736328125, 'eval_runtime': 1.3549, 'eval_samples_per_second': 257.576, 'eval_steps_per_second': 25.831, 'epoch': 41.92063492063492}


In [55]:
from transformers import pipeline

chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Ask a question
chatbot("Q: tell me about Petra \nA:", max_length=100)

Device set to use cuda:0


[{'generated_text': 'Q: tell me about Petra \nA: Petra is an ancient Nabataean city carved into red sandstone cliffs, celebrated for its architectural achievements and water management system, and is one of the New Seven Wonders of the World, boasting landmarks like the Treasury and the Monastery. It is one of the New Seven Wonders of the World, boasting iconic landmarks like the Treasury and the Monastery. It is one of the New Seven Wonders of the World, showcasing the ancient Nabataean city and'}]

In [14]:
model.save_pretrained("42amman_model")
tokenizer.save_pretrained("42amman_model")

('42amman_model/tokenizer_config.json',
 '42amman_model/special_tokens_map.json',
 '42amman_model/vocab.json',
 '42amman_model/merges.txt',
 '42amman_model/added_tokens.json',
 '42amman_model/tokenizer.json')

In [15]:
from google.colab import files
#!cp -r 42amman_model /content/drive/MyDrive/42amman_model

!zip -r 42amman_model.zip 42amman_model
files.download('42amman_model.zip')

  adding: 42amman_model/ (stored 0%)
  adding: 42amman_model/model.safetensors (deflated 7%)
  adding: 42amman_model/generation_config.json (deflated 24%)
  adding: 42amman_model/vocab.json (deflated 59%)
  adding: 42amman_model/tokenizer_config.json (deflated 54%)
  adding: 42amman_model/config.json (deflated 51%)
  adding: 42amman_model/merges.txt (deflated 53%)
  adding: 42amman_model/tokenizer.json (deflated 82%)
  adding: 42amman_model/special_tokens_map.json (deflated 60%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
#from google.colab import drive
#drive.mount('/content/drive')

#!cp -r results /content/drive/MyDrive/results42amman_model

!zip -r results.zip results
files.download('results.zip')

  adding: results/ (stored 0%)
  adding: results/checkpoint-800/ (stored 0%)
  adding: results/checkpoint-800/model.safetensors (deflated 7%)
  adding: results/checkpoint-800/scaler.pt (deflated 60%)
  adding: results/checkpoint-800/rng_state.pth (deflated 25%)
  adding: results/checkpoint-800/generation_config.json (deflated 24%)
  adding: results/checkpoint-800/vocab.json (deflated 59%)
  adding: results/checkpoint-800/optimizer.pt (deflated 8%)
  adding: results/checkpoint-800/tokenizer_config.json (deflated 54%)
  adding: results/checkpoint-800/config.json (deflated 51%)
  adding: results/checkpoint-800/scheduler.pt (deflated 56%)
  adding: results/checkpoint-800/merges.txt (deflated 53%)
  adding: results/checkpoint-800/tokenizer.json (deflated 82%)
  adding: results/checkpoint-800/trainer_state.json (deflated 79%)
  adding: results/checkpoint-800/special_tokens_map.json (deflated 60%)
  adding: results/checkpoint-800/training_args.bin (deflated 51%)
  adding: results/runs/ (store

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>