# 🔧 Environment Setup: Install Required Dependencies

In [1]:
# Install Hugging Face dependencies
!pip install -q transformers datasets

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 491.4/491.4 kB 8.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 6.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 193.6/193.6 kB 5.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.5/143.5 kB 7.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.8/194.8 kB 8.1 MB/s eta 0:00:00
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidi

# 📚 Extended Pretraining on Specialized Domain Corpus

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling


# 🧠 Load the Pretrained Model

In [3]:
# ✅ Checkpoint to continue pretraining from: a small model (DistilGPT-2)
model_name = "distilgpt2"

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
# Reuse the end-of-sequence token as the padding token to avoid padding issues.
tokenizer.pad_token = tokenizer.eos_token

# Causal language model with the pretrained weights.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# 📂 Load the Dataset

In [4]:
# ✅ Tiny medical dataset: five single-sentence examples kept in memory
texts = [
    # Conditions
    "Hypertension is a chronic condition characterized by elevated blood pressure.",
    "Diabetes is caused by insufficient insulin production or response.",
    # Diagnostic tools
    "MRI scans help visualize organs using magnetic fields and radio waves.",
    "An ECG records the electrical signals in the heart.",
    # Respiratory
    "Asthma involves inflammation and narrowing of the airways.",
]

# ✂️ Tokenize the Dataset for Language Modeling

In [5]:
# Convert to Hugging Face Dataset (a single "text" column)
dataset = Dataset.from_dict({"text": texts})


def tokenize_function(examples, max_length=64):
    """Tokenize a batch of examples into fixed-length sequences.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            its ``"text"`` entry is passed to the tokenizer.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 64, the value that was previously hard-coded, so
            existing calls behave exactly as before. A different length can
            be supplied through ``dataset.map(..., fn_kwargs={"max_length": n})``.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


# Apply the tokenizer batch-wise; the result keeps the original "text" column.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

# ⚙️ Define Training Configuration and Hyperparameters

In [6]:
# ✅ Training arguments (CPU-friendly)
training_args = TrainingArguments(
    # --- Checkpointing ---
    output_dir="./distilgpt2-medical-pretrain",
    save_steps=10,
    save_total_limit=1,  # keep at most one checkpoint on disk
    # NOTE(review): the recorded run below finishes after 5 steps, fewer than
    # save_steps; whether a checkpoint is still written at the end depends on
    # the installed transformers version -- confirm if a saved model is needed.
    # --- Schedule ---
    num_train_epochs=1,
    per_device_train_batch_size=1,
    # --- Logging / reporting ---
    logging_steps=1,  # report the training loss at every step
    prediction_loss_only=True,
    report_to="none",  # no external experiment-tracking integrations
)

# 🚀 Train the Language Model

In [7]:
# Collator for language modeling; mlm=False turns masked-language-modeling off
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Bundle the model, training arguments, tokenized data and collator
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Train! The returned value is left as the cell's last expression so it is displayed.
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,5.732
2,5.6668
3,4.8725
4,4.0337
5,4.6213


TrainOutput(global_step=5, training_loss=4.985265922546387, metrics={'train_runtime': 20.028, 'train_samples_per_second': 0.25, 'train_steps_per_second': 0.25, 'total_flos': 81655234560.0, 'train_loss': 4.985265922546387, 'epoch': 1.0})