In [1]:
import sys, torch
print("Python:", sys.executable)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


Python: C:\Users\Adarsh\venvs\genai312\Scripts\python.exe
Torch: 2.5.1+cu121
CUDA available: True


In [2]:
from datasets import Dataset, DatasetDict

# Path where your files are (from your screenshot)
data_dir = r"C:\Users\Adarsh\Downloads\archive\wikitext-2"

def load_text_file(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")
    # remove empty lines
    lines = [l for l in lines if l.strip() != ""]
    return Dataset.from_dict({"text": lines})

ds = DatasetDict({
    "train": load_text_file(f"{data_dir}/wiki.train.tokens"),
    "validation": load_text_file(f"{data_dir}/wiki.valid.tokens"),
    "test": load_text_file(f"{data_dir}/wiki.test.tokens"),
})

print(ds)
print(ds["train"][0])


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2891
    })
})
{'text': ' = Valkyria Chronicles III = '}


In [3]:
from transformers import AutoTokenizer

gpt_tok  = AutoTokenizer.from_pretrained("distilgpt2")
bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
t5_tok   = AutoTokenizer.from_pretrained("t5-small")

print("Tokenizers ready")


Tokenizers ready


In [4]:
# NEXT STEP: Set pad tokens (required for batching)
gpt_tok.pad_token = gpt_tok.eos_token   # GPT-style uses EOS as PAD
t5_tok.pad_token = t5_tok.pad_token     # already set, just explicit
bert_tok.pad_token = bert_tok.pad_token # already set, just explicit

print("GPT pad token:", gpt_tok.pad_token, "| id:", gpt_tok.pad_token_id)


GPT pad token: <|endoftext|> | id: 50256


In [5]:
# NEXT STEP: Tokenization for GPT (causal LM)

def tokenize_gpt(examples):
    return gpt_tok(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length"
    )

gpt_ds = ds.map(tokenize_gpt, batched=True, remove_columns=["text"])

# For GPT LM, labels = input_ids
gpt_ds = gpt_ds.map(lambda x: {"labels": x["input_ids"]}, batched=True)

print(gpt_ds)


Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2891
    })
})


In [6]:
# NEXT STEP: Tokenization for BERT (MLM)

def tokenize_bert(examples):
    return bert_tok(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length"
    )

bert_ds = ds.map(tokenize_bert, batched=True, remove_columns=["text"])
print(bert_ds)


Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2891
    })
})


In [7]:
# NEXT STEP: Tokenization for T5 using text_target (no as_target_tokenizer)

def tokenize_t5(examples):
    return t5_tok(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
        text_target=examples["text"],          # labels
        max_target_length=128
    )

t5_ds = ds.map(tokenize_t5, batched=True, remove_columns=["text"])
print(t5_ds)


Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2891
    })
})


In [8]:
# NEXT STEP: Train GPT-style model (distilgpt2) on WikiText-2 (causal LM)

import torch
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

# Causal LM model
gpt_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Data collator: causal LM (mlm=False)
gpt_collator = DataCollatorForLanguageModeling(tokenizer=gpt_tok, mlm=False)

training_args = TrainingArguments(
    output_dir="runs_gpt",
    eval_strategy="epoch",      # <-- new name in your version
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

gpt_trainer = Trainer(
    model=gpt_model,
    args=training_args,
    train_dataset=gpt_ds["train"],
    eval_dataset=gpt_ds["validation"],
    data_collator=gpt_collator,
)

gpt_trainer.train()
gpt_eval = gpt_trainer.evaluate()
print("GPT eval:", gpt_eval)


Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

[1mGPT2LMHeadModel LOAD REPORT[0m from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.406336,3.14355


GPT eval: {'eval_loss': 3.143549919128418, 'eval_runtime': 6.1609, 'eval_samples_per_second': 399.457, 'eval_steps_per_second': 24.996, 'epoch': 1.0}


In [9]:
import math

gpt_ppl = math.exp(gpt_eval["eval_loss"])
print("GPT validation perplexity:", gpt_ppl)


GPT validation perplexity: 23.186029466647234


In [10]:
# NEXT STEP: Train BERT for Masked Language Modeling (MLM)

from transformers import (
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

bert_model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# MLM collator (15% masking)
bert_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tok,
    mlm=True,
    mlm_probability=0.15,
)

bert_args = TrainingArguments(
    output_dir="runs_bert",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

bert_trainer = Trainer(
    model=bert_model,
    args=bert_args,
    train_dataset=bert_ds["train"],
    eval_dataset=bert_ds["validation"],
    data_collator=bert_collator,
)

bert_trainer.train()
bert_eval = bert_trainer.evaluate()
print("BERT eval:", bert_eval)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading weights:   0%|          | 0/202 [00:00<?, ?it/s]

[1mBertForMaskedLM LOAD REPORT[0m from: bert-base-uncased
Key                         | Status     |  | 
----------------------------+------------+--+-
bert.pooler.dense.bias      | UNEXPECTED |  | 
cls.seq_relationship.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight | UNEXPECTED |  | 
bert.pooler.dense.weight    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch,Training Loss,Validation Loss
1,1.635023,1.416378


BERT eval: {'eval_loss': 1.4000164270401, 'eval_runtime': 6.3316, 'eval_samples_per_second': 388.687, 'eval_steps_per_second': 24.323, 'epoch': 1.0}


In [12]:
import math

bert_ppl_proxy = math.exp(bert_eval["eval_loss"])
print("BERT validation exp(MLM loss) (proxy):", bert_ppl_proxy)


BERT validation exp(MLM loss) (proxy): 4.05526658232429


In [13]:
# NEXT STEP: Train T5-small (seq2seq)

from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)

t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

t5_collator = DataCollatorForSeq2Seq(
    tokenizer=t5_tok,
    model=t5_model,
)

t5_args = TrainingArguments(
    output_dir="runs_t5",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

t5_trainer = Trainer(
    model=t5_model,
    args=t5_args,
    train_dataset=t5_ds["train"],
    eval_dataset=t5_ds["validation"],
    data_collator=t5_collator,
)

t5_trainer.train()
t5_eval = t5_trainer.evaluate()
print("T5 eval:", t5_eval)


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,0.011877,0.000997


T5 eval: {'eval_loss': 0.0009973767446354032, 'eval_runtime': 7.1674, 'eval_samples_per_second': 343.362, 'eval_steps_per_second': 21.486, 'epoch': 1.0}


In [14]:
import random

# NEXT STEP: Build T5 denoising dataset (corrupt input, clean target)
# Simple corruption: drop random tokens and ask T5 to reconstruct original text

def corrupt_text(s, drop_prob=0.3):
    toks = s.split()
    if len(toks) < 8:
        return s  # skip very short lines
    kept = [t for t in toks if random.random() > drop_prob]
    if len(kept) < 3:
        kept = toks[:3]
    return " ".join(kept)

def make_t5_denoise(examples):
    inputs = []
    targets = []
    for s in examples["text"]:
        s = s.strip()
        if not s:
            continue
        inputs.append("denoise: " + corrupt_text(s, drop_prob=0.35))
        targets.append(s)

    return t5_tok(
        inputs,
        truncation=True,
        max_length=128,
        padding="max_length",
        text_target=targets,
        max_target_length=128
    )

t5_ds_denoise = ds.map(make_t5_denoise, batched=True, remove_columns=["text"])
print(t5_ds_denoise)


Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2891
    })
})


In [15]:
# NEXT STEP: Retrain T5 on denoising objective

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import torch

t5_model2 = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

t5_collator2 = DataCollatorForSeq2Seq(tokenizer=t5_tok, model=t5_model2)

t5_args2 = TrainingArguments(
    output_dir="runs_t5_denoise",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

t5_trainer2 = Trainer(
    model=t5_model2,
    args=t5_args2,
    train_dataset=t5_ds_denoise["train"],
    eval_dataset=t5_ds_denoise["validation"],
    data_collator=t5_collator2,
)

t5_trainer2.train()
t5_eval2 = t5_trainer2.evaluate()
print("T5 denoise eval:", t5_eval2)


Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
1,1.034427,0.944288


T5 denoise eval: {'eval_loss': 0.9442878365516663, 'eval_runtime': 6.6702, 'eval_samples_per_second': 368.954, 'eval_steps_per_second': 23.088, 'epoch': 1.0}


In [16]:
import math

t5_ppl = math.exp(t5_eval2["eval_loss"])
print("T5 denoise validation perplexity:", t5_ppl)

metrics_summary = {
    "GPT (distilgpt2) val_loss": gpt_eval["eval_loss"],
    "GPT perplexity": gpt_ppl,
    "BERT (MLM) val_loss": bert_eval["eval_loss"],
    "BERT exp(MLM loss) proxy": bert_ppl_proxy,
    "T5 (denoise) val_loss": t5_eval2["eval_loss"],
    "T5 perplexity": t5_ppl,
}

metrics_summary


T5 denoise validation perplexity: 2.5709817672214577


{'GPT (distilgpt2) val_loss': 3.143549919128418,
 'GPT perplexity': 23.186029466647234,
 'BERT (MLM) val_loss': 1.4000164270401,
 'BERT exp(MLM loss) proxy': 4.05526658232429,
 'T5 (denoise) val_loss': 0.9442878365516663,
 'T5 perplexity': 2.5709817672214577}

In [22]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ================= GPT (decoder-only) =================
gpt_model = gpt_model.to(device)
gpt_model.eval()

prompt = "Valkyria Chronicles III is"
gpt_inputs = gpt_tok(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    gpt_ids = gpt_model.generate(
        **gpt_inputs,
        max_new_tokens=60,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=gpt_tok.eos_token_id
    )

gpt_out = gpt_tok.decode(gpt_ids[0], skip_special_tokens=True)

print("\n=== GPT continuation ===")
print(gpt_out)


# ================= BERT (encoder-only, MLM) =================
from transformers import pipeline

bert_fill = pipeline(
    "fill-mask",
    model=bert_model,
    tokenizer=bert_tok,
    device=0 if torch.cuda.is_available() else -1
)

masked = "Valkyria Chronicles III is a [MASK] game."
bert_out = bert_fill(masked, top_k=5)

print("\n=== BERT fill-mask (top 5) ===")
for o in bert_out:
    print(o["sequence"], "| score:", round(o["score"], 4))


# ================= T5 (encoder-decoder, denoise) =================
t5_model2 = t5_model2.to(device)
t5_model2.eval()

corrupted = "denoise: Valkyria Chronicles III is a tactical role playing game developed by Sega"
t5_inputs = t5_tok(corrupted, return_tensors="pt", truncation=True, max_length=128).to(device)

with torch.no_grad():
    t5_ids = t5_model2.generate(
        **t5_inputs,
        max_new_tokens=60,
        num_beams=4,
        early_stopping=True
    )

t5_out = t5_tok.decode(t5_ids[0], skip_special_tokens=True)

print("\n=== T5 denoise ===")
print("Input :", corrupted)
print("Output:", t5_out)



=== GPT continuation ===
Valkyria Chronicles III is the first game in the series to feature a three @-@ dimensional map featuring a single planet , as well as a playable boss . The game was developed by the team @-@ based on the original game and was developed by the company <unk> , and the game was released on 3DS

=== BERT fill-mask (top 5) ===
valkyria chronicles iii is a strategy game. | score: 0.5384
valkyria chronicles iii is a video game. | score: 0.0586
valkyria chronicles iii is a platform game. | score: 0.0364
valkyria chronicles iii is a chess game. | score: 0.0231
valkyria chronicles iii is a fantasy game. | score: 0.021

=== T5 denoise ===
Input : denoise: Valkyria Chronicles III is a tactical role playing game developed by Sega
Output: Valkyria Chronicles III is a tactical role playing game developed by Sega.
