In [1]:
# !pip install transformers datasets
# !pip install -U fsspec==2023.6.0  # ✅ Fix for load_dataset error


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
#  !pip install -U transformers


In [4]:
from datasets import load_dataset

# Example: using wikitext for demo
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split='train')
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [5]:
# Step 1: Install latest versions
# !pip install -U transformers datasets fsspec

# Confirm the version (should be >= 4.30+)
import transformers
print(transformers.__version__)


4.53.0


In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset

model_name = "distilgpt2"

# 1. Load tokenizer and set padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token

In [16]:
# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)

In [17]:
# Load and split the dataset
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")

In [18]:

# Tokenize the dataset and filter empty sequences
def tokenize_function(examples):
    # First remove empty or whitespace-only texts
    examples["text"] = [text for text in examples["text"] if text.strip() != ""]
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],  # Remove the original text column
    batch_size=1000  # Process in larger batches for efficiency
)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [19]:
# Filter out empty sequences (those with no input_ids)
tokenized_datasets = tokenized_datasets.filter(
    lambda example: len(example["input_ids"]) > 0
)


Filter:   0%|          | 0/2891 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23767 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [21]:
# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal language modeling, not masked LM
)

In [22]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    logging_steps=100,
    push_to_hub=False,
)

In [23]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [24]:
# Train
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.5228,3.589502


TrainOutput(global_step=5942, training_loss=3.604568607275111, metrics={'train_runtime': 1096.7905, 'train_samples_per_second': 21.67, 'train_steps_per_second': 5.418, 'total_flos': 1234709312126976.0, 'train_loss': 3.604568607275111, 'epoch': 1.0})

In [25]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
generator("Once upon a time", max_length=50, num_return_sequences=1)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Once upon a time of the last remaining year of the drought, the Red Bulls signed former world and international star Michael Bradley, who played in a limited period of time. Bradley played seven games for the Red Bulls, totaling six points in the first half. Bradley was on the verge of playing in the fourth quarter but came off the bench. He scored his first goal in the fourth quarter, and scored his first goal in the second half. \n\n\n \n \n \n \n \n \n \n  \n \n  \n  \n  \n  \n \n  \n  \n  \n  \n \n \n  \n  \n   \n   \n   \n     \n    \n    \n      \n   \n       \n       \n          \n       \n         \n                  \n          '}]

In [28]:
generator(
    "Lesson",
    max_length=50,
    temperature=0.7,           # Reduce randomness
    do_sample=True,
    top_k=50,                   # Focus on high-probability tokens
    pad_token_id=tokenizer.eos_token_id
)

Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Lessonology of the Church ( 1778 ), published in 1854, was based on the idea of a " faith which does not fall from heaven ", and was not inspired by the " doctrine of man. The idea of a " faith " is not believed to be a universal concept, but rather an " eternal concept ". \n\n\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n  \n  \n  \n \n \n     \n    \n    \n  \n   \n       \n      \n       \n           \n          \n           \n             \n                  \n                         \n      '}]