In [21]:
!pip install transformers datasets evaluate



In [22]:

import json
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import evaluate
import torch

In [23]:
import json

# Open the uploaded JSON file
with open("data.json") as f:
    data = json.load(f)

# Format the dataset as "Q: question \nA: answer"
dataset = [
    {"text": f"Q: {item['prompt']}\nA: {item['completion']}"}
    for item in data
]

# View the first couple of items
dataset[:2]

[{'text': 'Q: What is 42 Amman?\nA: 42 Amman is a tuition-free coding school in Amman, Jordan, launched on February 25, 2024, by the Crown Prince Foundation under HRH Crown Prince Hussein. Part of the global 42 Network, it offers peer-to-peer, project-based learning to train Jordanian youth in digital skills, targeting 1,300 students over five years.'},
 {'text': 'Q: What is 42 Irbid?\nA: 42 Irbid is a tuition-free coding school in Irbid, Jordan, funded with 572,000 JD in November 2024 by the Ministry of Planning and Crown Prince Foundation. It extends the 42 Network’s peer-to-peer model to northern Jordan, training youth in globally recognized tech skills.'}]

In [24]:
from datasets import Dataset
train_data, eval_data = train_test_split(dataset, test_size=0.1, random_state=42)
print(f"Training on {len(train_data)} examples, evaluating on {len(eval_data)} examples")

Training on 1080 examples, evaluating on 121 examples


In [25]:
# Convert to HF datasets
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

In [26]:
from transformers import AutoTokenizer

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

def tokenize_function(example):
    result = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    result["labels"] = result["input_ids"].clone()  # 👈 this is the key line
    return result

# Process datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

In [27]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(checkpoint)
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [39]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=10,
    num_train_epochs=50,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="tensorboard",
    fp16=True,  # Mixed precision training
)

# 6. Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal LM, not masked LM
)

# Update trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [40]:
# 9. Train model
trainer.train()

Step,Training Loss,Validation Loss
100,1.1493,1.19013
200,0.6947,0.93854
300,0.5399,0.911392
400,0.4878,0.910585
500,0.4793,0.909919


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=500, training_loss=0.8150457677841186, metrics={'train_runtime': 511.9632, 'train_samples_per_second': 105.476, 'train_steps_per_second': 0.977, 'total_flos': 1603708806758400.0, 'train_loss': 0.8150457677841186, 'epoch': 45.46296296296296})

In [30]:
# 10. Evaluate final model
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

Final evaluation results: {'eval_loss': 2.080559730529785, 'eval_runtime': 0.5092, 'eval_samples_per_second': 237.633, 'eval_steps_per_second': 25.531, 'epoch': 6.37037037037037}


In [31]:
# 11. Save model and tokenizer
model.save_pretrained("./enhanced_42amman_model")
tokenizer.save_pretrained("./enhanced_42amman_model")

('./enhanced_42amman_model/tokenizer_config.json',
 './enhanced_42amman_model/special_tokens_map.json',
 './enhanced_42amman_model/vocab.json',
 './enhanced_42amman_model/merges.txt',
 './enhanced_42amman_model/added_tokens.json',
 './enhanced_42amman_model/tokenizer.json')

In [46]:
from transformers import pipeline

chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Ask a question
chatbot("Q: What is mansaf?\nA:", max_length=100)

Device set to use cuda:0


[{'generated_text': "Q: What is mansaf?\nA: Mansaf is a national dish made with lamb, as well as falafel, shawarma, hummus, moutabal, tabbouleh, maqluba (a flavorful rice dish with meat and veggies flipped upside down), and desserts like knafeh and qatayef, all enjoyed with a cup of Bedouin tea or Arabic coffee. It's a popular Bedouin tea or Arabic coffee, accompanied by"}]

In [33]:
model.save_pretrained("42amman_model")
tokenizer.save_pretrained("42amman_model")

('42amman_model/tokenizer_config.json',
 '42amman_model/special_tokens_map.json',
 '42amman_model/vocab.json',
 '42amman_model/merges.txt',
 '42amman_model/added_tokens.json',
 '42amman_model/tokenizer.json')

In [34]:
from google.colab import files
#!cp -r 42amman_model /content/drive/MyDrive/42amman_model

!zip -r 42amman_model.zip 42amman_model
files.download('42amman_model.zip')

updating: 42amman_model/ (stored 0%)
updating: 42amman_model/model.safetensors (deflated 7%)
updating: 42amman_model/generation_config.json (deflated 24%)
updating: 42amman_model/vocab.json (deflated 59%)
updating: 42amman_model/tokenizer_config.json (deflated 54%)
updating: 42amman_model/config.json (deflated 51%)
updating: 42amman_model/merges.txt (deflated 53%)
updating: 42amman_model/tokenizer.json (deflated 82%)
updating: 42amman_model/special_tokens_map.json (deflated 60%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
#from google.colab import drive
#drive.mount('/content/drive')

#!cp -r results /content/drive/MyDrive/results42amman_model

!zip -r results.zip results
files.download('results.zip')

updating: results/ (stored 0%)
updating: results/checkpoint-300/ (stored 0%)
updating: results/checkpoint-300/model.safetensors (deflated 7%)
updating: results/checkpoint-300/scaler.pt (deflated 60%)
updating: results/checkpoint-300/rng_state.pth (deflated 25%)
updating: results/checkpoint-300/generation_config.json (deflated 24%)
updating: results/checkpoint-300/vocab.json (deflated 59%)
updating: results/checkpoint-300/optimizer.pt (deflated 8%)
updating: results/checkpoint-300/tokenizer_config.json (deflated 54%)
updating: results/checkpoint-300/config.json (deflated 51%)
updating: results/checkpoint-300/scheduler.pt (deflated 56%)
updating: results/checkpoint-300/merges.txt (deflated 53%)
updating: results/checkpoint-300/tokenizer.json (deflated 82%)
updating: results/checkpoint-300/trainer_state.json (deflated 74%)
updating: results/checkpoint-300/special_tokens_map.json (deflated 60%)
updating: results/checkpoint-300/training_args.bin (deflated 52%)
updating: results/checkpoint-2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>