In [1]:
!pip install -q transformers datasets accelerate


In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: Tesla P100-PCIE-16GB


# **Load WikiText-103 Dataset**

In [3]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
print(dataset)


README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


# **Dataset Builder (20k+ Clean Paragraphs)**

In [4]:
def build_paragraph_dataset(
    texts,
    min_words=30,
    max_samples=20000
):
    paragraphs = []

    for text in texts:
        text = text.strip()

        if len(text.split()) >= min_words:
            sample = (
                "<|endoftext|>\n"
                f"{text}\n"
                "<|endoftext|>"
            )
            paragraphs.append(sample)

        if len(paragraphs) >= max_samples:
            break

    return paragraphs


## **Build the Dataset**

In [5]:
raw_texts = dataset["train"]["text"]

paragraphs = build_paragraph_dataset(
    raw_texts,
    min_words=30,
    max_samples=20000   # >= 20k as required
)

print("Total paragraphs:", len(paragraphs))
print("\nSample paragraph:\n", paragraphs[0])


Total paragraphs: 20000

Sample paragraph:
 <|endoftext|>
Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .
<|endoftext|>


## **Save to Text File**

In [6]:
with open("wikitext_paragraphs.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(paragraphs))


# **Load GPT-2 Model & Tokenizer**

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2"  # small GPT-2 (fits P100 well)

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

model.to("cuda")


2025-12-18 13:58:34.793019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766066314.967561      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766066315.021950      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766066315.426629      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766066315.426669      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766066315.426672      55 computation_placer.cc:177] computation placer alr

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# **Tokenization & Block Processing**

In [9]:
from datasets import load_dataset

text_dataset = load_dataset(
    "text",
    data_files="wikitext_paragraphs.txt"
)["train"]


In [10]:
block_size = 128

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=block_size
    )

tokenized_dataset = text_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

## **Group Tokens into Blocks**

In [11]:
def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    result = {
        k: [
            t[i : i + block_size]
            for i in range(0, total_length, block_size)
        ]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True)


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

# **Fine-Tuning Setup**

In [12]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling


In [13]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [14]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/gpt2-wikitext-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,          # WikiText is clean; 3 epochs is enough
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch = 16
    learning_rate=2e-5,
    warmup_steps=100,
    fp16=True,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    report_to="none"
)


# **Train the Model**

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.1199
200,3.7377
300,3.7002
400,3.6449
500,3.6111
600,3.6018
700,3.5622
800,3.5809
900,3.5545
1000,3.5711


TrainOutput(global_step=3267, training_loss=3.5131947470981117, metrics={'train_runtime': 1151.6629, 'train_samples_per_second': 45.365, 'train_steps_per_second': 2.837, 'total_flos': 3412800552960000.0, 'train_loss': 3.5131947470981117, 'epoch': 3.0})

## **Save the Fine-Tuned Model**

In [16]:
trainer.save_model("/kaggle/working/gpt2-wikitext-finetuned")
tokenizer.save_pretrained("/kaggle/working/gpt2-wikitext-finetuned")


('/kaggle/working/gpt2-wikitext-finetuned/tokenizer_config.json',
 '/kaggle/working/gpt2-wikitext-finetuned/special_tokens_map.json',
 '/kaggle/working/gpt2-wikitext-finetuned/vocab.json',
 '/kaggle/working/gpt2-wikitext-finetuned/merges.txt',
 '/kaggle/working/gpt2-wikitext-finetuned/added_tokens.json')

# **Text Generation**

In [17]:
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="/kaggle/working/gpt2-wikitext-finetuned",
    tokenizer=tokenizer,
    device=0
)


Device set to use cuda:0


## **Generate a New Paragraph from a Sentence**

In [18]:
prompt = "Artificial intelligence is transforming modern society by"

outputs = generator(
    prompt,
    max_new_tokens=150,
    num_return_sequences=3,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

for i, out in enumerate(outputs):
    print(f"\nGenerated {i+1}:\n{out['generated_text']}")



Generated 1:
Artificial intelligence is transforming modern society by increasing the size of our digital networks, enabling us to organize and communicate with each other. AI will replace human knowledge in a way that makes it more accessible for everyone else through smart devices such as computers or robotics ; this can be accomplished via an increase on privacy @-@ protections which allow people who have not been affected directly from being able access information about their surroundings without fear thereof ( i.e., no threat ) [ 1 ] –[ 2 ]. In addition, artificial intelligences may enhance communication between humans using technologies like Artificial Intelligence Technologies " : Smartphones are now connected wirelessly — they do so because these new methods offer greater reliability than existing communications systems were designed at first but still provide lower cost per unit time compared wiener technology would otherwise

Generated 2:
Artificial intelligence is transfor