In [2]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu121
True


In [3]:
from datasets import Dataset, DatasetDict

data_dir = r"C:\Users\Adarsh\Downloads\archive\wikitext-2"

def load_text_file(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = [l.strip() for l in f.read().split("\n") if l.strip()]
    return Dataset.from_dict({"text": lines})

ds = DatasetDict({
    "train": load_text_file(f"{data_dir}/wiki.train.tokens"),
    "validation": load_text_file(f"{data_dir}/wiki.valid.tokens"),
    "test": load_text_file(f"{data_dir}/wiki.test.tokens"),
})

print(ds)
print(ds["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2891
    })
})
{'text': '= Valkyria Chronicles III ='}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

print("Model + tokenizer loaded.")

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

[1mGPT2LMHeadModel LOAD REPORT[0m from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model + tokenizer loaded.


In [5]:
MAX_LEN = 128

def tokenize_lm(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length"
    )

tokenized_ds = ds.map(tokenize_lm, batched=True, remove_columns=["text"])
tokenized_ds = tokenized_ds.map(lambda x: {"labels": x["input_ids"]}, batched=True)

print(tokenized_ds)

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23767
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2891
    })
})


In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="pretrain_runs",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.438338,3.165188


TrainOutput(global_step=1486, training_loss=3.529469671031187, metrics={'train_runtime': 196.3653, 'train_samples_per_second': 121.035, 'train_steps_per_second': 7.568, 'total_flos': 776279983915008.0, 'train_loss': 3.529469671031187, 'epoch': 1.0})

In [7]:
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])

print("Validation Loss:", eval_results["eval_loss"])
print("Perplexity:", perplexity)

Validation Loss: 3.1651883125305176
Perplexity: 23.693205340211556


In [8]:
pretrain_dir = "model_pretrained"
trainer.save_model(pretrain_dir)
tokenizer.save_pretrained(pretrain_dir)
print("Saved pre-trained model to:", pretrain_dir)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved pre-trained model to: model_pretrained


In [9]:
from datasets import Dataset, DatasetDict

def make_pairs(lines, min_words=20):
    pairs = []
    for t in lines:
        w = t.split()
        if len(w) < min_words:
            continue
        mid = len(w) // 2
        prompt = "Continue the text:\n" + " ".join(w[:mid])
        target = " ".join(w[mid:])
        pairs.append({"prompt": prompt, "target": target})
    return pairs

train_pairs = make_pairs(ds["train"]["text"])
val_pairs   = make_pairs(ds["validation"]["text"])

sft_ds = DatasetDict({
    "train": Dataset.from_list(train_pairs),
    "validation": Dataset.from_list(val_pairs),
})

print(sft_ds)
print("Example SFT item:\n", sft_ds["train"][0])

DatasetDict({
    train: Dataset({
        features: ['prompt', 'target'],
        num_rows: 15339
    })
    validation: Dataset({
        features: ['prompt', 'target'],
        num_rows: 1647
    })
})
Example SFT item:
 {'prompt': 'Continue the text:\nSenjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the', 'target': 'Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .'}


In [10]:
MAX_LEN = 256

def tokenize_sft(batch):
    prompts = batch["prompt"]
    targets = batch["target"]

    # full input = prompt + target
    full_text = [p + "\n" + t for p, t in zip(prompts, targets)]
    full_enc = tokenizer(
        full_text,
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",
    )

    # prompt token lengths (to mask labels)
    prompt_enc = tokenizer(
        prompts,
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",
    )

    labels = []
    for i in range(len(prompts)):
        lab = full_enc["input_ids"][i].copy()

        # compute number of non-pad tokens in prompt
        prompt_len = 0
        for tid in prompt_enc["input_ids"][i]:
            if tid == tokenizer.pad_token_id:
                break
            prompt_len += 1

        # mask prompt portion
        for j in range(min(prompt_len, len(lab))):
            lab[j] = -100

        labels.append(lab)

    full_enc["labels"] = labels
    return full_enc

sft_tok = sft_ds.map(tokenize_sft, batched=True, remove_columns=["prompt", "target"])
print(sft_tok)

Map:   0%|          | 0/15339 [00:00<?, ? examples/s]

Map:   0%|          | 0/1647 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15339
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1647
    })
})


In [11]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import math

sft_model = AutoModelForCausalLM.from_pretrained(pretrain_dir)

sft_args = TrainingArguments(
    output_dir="sft_runs",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

sft_trainer = Trainer(
    model=sft_model,
    args=sft_args,
    train_dataset=sft_tok["train"],
    eval_dataset=sft_tok["validation"],
    data_collator=data_collator,
)

sft_trainer.train()

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
1,3.263554,3.053734


TrainOutput(global_step=959, training_loss=3.28565150033197, metrics={'train_runtime': 220.5873, 'train_samples_per_second': 69.537, 'train_steps_per_second': 4.347, 'total_flos': 1002007714332672.0, 'train_loss': 3.28565150033197, 'epoch': 1.0})

In [12]:
sft_eval = sft_trainer.evaluate()
sft_ppl = math.exp(sft_eval["eval_loss"])

print("SFT Validation Loss:", sft_eval["eval_loss"])
print("SFT Perplexity:", sft_ppl)

SFT Validation Loss: 3.0537338256835938
SFT Perplexity: 21.194332810531588


In [13]:
import pandas as pd

results = pd.DataFrame([
    {"method": "Pre-training (LM)", "val_loss": 3.1651883125305176, "perplexity": 23.693205340211556},
    {"method": "SFT (prompt->target)", "val_loss": 3.0537338256835938, "perplexity": 21.194332810531588},
])

results

Unnamed: 0,method,val_loss,perplexity
0,Pre-training (LM),3.165188,23.693205
1,SFT (prompt->target),3.053734,21.194333


In [14]:
import torch
from transformers import AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# reload models cleanly
pre_model = AutoModelForCausalLM.from_pretrained("model_pretrained").to(device)
sft_model = AutoModelForCausalLM.from_pretrained("model_pretrained").to(device)  # base
sft_model.load_state_dict(sft_trainer.model.state_dict())  # updated SFT weights
sft_model.to(device)

def generate(model, prompt, max_new=80):
    model.eval()
    x = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(device)
    with torch.no_grad():
        y = model.generate(
            **x,
            max_new_tokens=max_new,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(y[0], skip_special_tokens=True)

# pick one prompt from validation set (same one used in SFT pairs)
sample = sft_ds["validation"][0]
prompt = sample["prompt"]
gold   = sample["target"]

print("PROMPT:\n", prompt)
print("\nGOLD TARGET (what SFT should learn to generate):\n", gold)

print("\n--- PRETRAIN OUTPUT ---\n")
print(generate(pre_model, prompt))

print("\n--- SFT OUTPUT ---\n")
print(generate(sft_trainer.model, prompt))

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

PROMPT:
 Continue the text:
Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and

GOLD TARGET (what SFT should learn to generate):
 bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles .

--- PRETRAIN OUTPUT ---

Continue the text:
Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It

In [15]:
import re

def token_set_overlap(pred, ref):
    pred_toks = set(re.findall(r"\w+", pred.lower()))
    ref_toks  = set(re.findall(r"\w+", ref.lower()))
    if len(ref_toks) == 0:
        return 0.0
    return len(pred_toks & ref_toks) / len(ref_toks)

pre_out = generate(pre_model, prompt)
sft_out = generate(sft_trainer.model, prompt)

print("Token overlap vs target (proxy)")
print("Pretrain:", round(token_set_overlap(pre_out, gold), 4))
print("SFT     :", round(token_set_overlap(sft_out, gold), 4))

Token overlap vs target (proxy)
Pretrain: 0.2353
SFT     : 0.2745
