In [31]:
!pip install transformers datasets evaluate -q

In [32]:
from datasets import load_dataset

raw_dataset = load_dataset("dany0407/eli5_category", split="train[:5000]")

In [33]:
raw_dataset = raw_dataset.train_test_split(test_size=0.2)

In [34]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [35]:
raw_dataset = raw_dataset.flatten()

In [36]:
def tokenize_fn(examples):
    return tokenizer([' '.join(x) for x in examples['answers.text']])

In [37]:
tokenized_dataset = raw_dataset.map(
    tokenize_fn,
    batched=True,
    num_proc=4,
    remove_columns=raw_dataset.column_names['train']
)

In [38]:
block_size = 128

def concatenate_inputs(examples):
    examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(examples[list(examples.keys())[0]])
    if total_length > block_size:
        total_length -= total_length%block_size
    result = {
        k: [v[i:i+block_size] for i in range(0, total_length, block_size)] for k, v in examples.items()
    }
    result['labels'] = result['input_ids']
    return result

In [39]:
tokenized_dataset = tokenized_dataset.map(
    concatenate_inputs,
    num_proc=4,
    batched=True
)

In [40]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [41]:
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [42]:
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    report_to='none',
    eval_strategy='steps',
    save_strategy='steps',
    eval_steps=0.00615,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    processing_class=tokenizer
)

In [43]:
trainer.train()

Step,Training Loss,Validation Loss
2,No log,3.986981
4,No log,3.97426
6,No log,3.965734
8,No log,3.960572
10,No log,3.956552
12,No log,3.9526
14,No log,3.948845
16,No log,3.945738
18,No log,3.942949
20,No log,3.940319




TrainOutput(global_step=164, training_loss=1.9854918689262577, metrics={'train_runtime': 1443.7581, 'train_samples_per_second': 7.247, 'train_steps_per_second': 0.114, 'total_flos': 341743487680512.0, 'train_loss': 1.9854918689262577, 'epoch': 1.0})

In [44]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.3f}")



Perplexcity: 49.038


In [45]:
model.save_pretrained("/kaggle/working/distilbertgpt2-finetuned-reddit")

In [49]:
prompt = "What happens when a galaxy starts to"
inputs = tokenizer(prompt, return_tensors='pt')
inputs = {k: v.to("cuda") for k, v in inputs.items()}

outputs = model.generate(
    inputs['input_ids'],

    # --- Length Control ---
    max_new_tokens=150,          # Generate up to 150 new tokens beyond the prompt
    min_length=25,               # Ensure the output has at least 50 tokens

    # --- Sampling Settings ---
    do_sample=True,              # Enable sampling instead of greedy decoding
    temperature=0.7,             # Lower = more focused, higher = more random
    top_k=50,                    # Only sample from the top 50 most likely next tokens
    top_p=0.9,                   # Or from the smallest set of tokens whose prob sum ≥ 0.9

    # --- Beam Search Settings ---
    num_beams=5,                 # Use 5 beams for higher-quality text (slower)
    early_stopping=True,         # Stop when all beams reach the end-of-sequence token
    length_penalty=1.1,          # Slightly prefer longer sentences during beam search

    # --- Repetition & Loop Prevention ---
    repetition_penalty=1.2,      # Penalize repeating phrases or tokens
    no_repeat_ngram_size=3,      # Prevent repeating any 3-word sequences

    # --- Output Options ---
    num_return_sequences=3,      # Generate 3 different continuations
    pad_token_id=tokenizer.eos_token_id,  # Use EOS token for padding
    eos_token_id=tokenizer.eos_token_id,  # Stop generation at the EOS token

    # --- Optional Filtering ---
    bad_words_ids=[[tokenizer.encode("spoiler")[0]]],  # Avoid generating certain words
)

In [50]:
for output in outputs:
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

["What happens when a galaxy starts to get bigger and bigger, and that’s why it's so important to keep in mind that this is the first time you’ve ever seen a galaxy with a massive mass of more than one billion light-years away. This is because galaxies have a mass of about 1.5 billion times the mass of the sun. The mass of a galaxy is about 2.5 million times that of a star. If you look at the Hubble Space Telescope, you can see that galaxies have an average mass of around 1.2 billion light years away. That means that if you look through the Hubble space telescope, you will see a galaxy that is about 3.4 billion times as large as the Milky Way. That's a lot of light", "What happens when a galaxy starts to get bigger and bigger, and that’s why it's so important to keep in mind that this is the first time you’ve ever seen a galaxy with a massive mass of more than one billion light-years away. This is because galaxies have a mass of about 1.5 billion times the mass of the sun. The mass of 