In [1]:
import torch
import os
import pandas as pd
import time
import logging
import transformers
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
)
from torch.utils.data import Dataset, DataLoader

In [2]:
# ✅ Set device ONCE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Memory optimizations (move these up)
torch.backends.cudnn.benchmark = False
torch.cuda.set_per_process_memory_fraction(0.6, device=0)
torch.cuda.empty_cache()

# Limit GPU & CPU usage
os.environ["OMP_NUM_THREADS"] = "4"  # Limit CPU threads
os.environ["MKL_NUM_THREADS"] = "4"

In [3]:
# Load the cleaned dataset
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

In [4]:
df

Unnamed: 0,headline
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...
...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
209523,Maria Sharapova Stunned By Victoria Azarenka I...
209524,"Giants Over Patriots, Jets Over Colts Among M..."
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...


In [5]:
df = df.dropna(subset=["headline"])  # Remove rows where 'headline' is NaN


In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have padding by default

In [18]:
df.loc[:, "tokenized"] = df["headline"].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

max_length = max(len(tokens) for tokens in df["tokenized"])

df.loc[:, "padded"] = df["tokenized"].apply(lambda x: x + [tokenizer.pad_token_id] * (max_length - len(x)))


In [19]:
df_subset = df.sample(n=100, random_state=42)  # Random 100 rows
df_subset

Unnamed: 0,headline,tokenized,padded
121815,Celebrities Emerging From Water Because Hey .....,"[42741, 65, 19491, 48297, 3574, 5638, 4362, 14...","[42741, 65, 19491, 48297, 3574, 5638, 4362, 14..."
76633,Bill Maher Slams The Internet For Killing The ...,"[17798, 38137, 30382, 82, 383, 4455, 1114, 255...","[17798, 38137, 30382, 82, 383, 4455, 1114, 255..."
188697,The Most Expensive NFL Tickets Of The Season: ...,"[464, 4042, 5518, 2021, 5134, 26878, 3226, 383...","[464, 4042, 5518, 2021, 5134, 26878, 3226, 383..."
80361,Students Surprise Starbucks Employee With Gene...,"[28239, 47893, 24527, 36824, 2080, 2980, 516, ...","[28239, 47893, 24527, 36824, 2080, 2980, 516, ..."
65636,Ben Stein: 'I Don't Think Trump Knows A Goddam...,"[11696, 15215, 25, 705, 40, 2094, 470, 11382, ...","[11696, 15215, 25, 705, 40, 2094, 470, 11382, ..."
...,...,...,...
43356,"Trump's ""Extreme Vetting"" Of Refugees Empowers...","[6170, 338, 366, 36716, 569, 35463, 1, 3226, 3...","[6170, 338, 366, 36716, 569, 35463, 1, 3226, 3..."
113206,For $6 You Can Give a Coffee Tree and Help Emp...,"[1890, 720, 21, 921, 1680, 13786, 257, 19443, ...","[1890, 720, 21, 921, 1680, 13786, 257, 19443, ..."
66542,Brad Paisley Debuts A Little Ditty About North...,"[30805, 11243, 271, 1636, 1024, 4360, 82, 317,...","[30805, 11243, 271, 1636, 1024, 4360, 82, 317,..."
91949,Video Shows Man Holding Gun Before Allegedly S...,"[10798, 25156, 1869, 31703, 6748, 7413, 26326,...","[10798, 25156, 1869, 31703, 6748, 7413, 26326,..."


##### Define dataset class

In [20]:
class GPT2Dataset(Dataset):
    def __init__(self, df_subset):
        self.input_ids = torch.tensor(df_subset["padded"].tolist(), dtype=torch.long)
        self.attention_mask = (self.input_ids != tokenizer.pad_token_id).long()  # Mask padding tokens

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.input_ids[idx],  # GPT-2 is trained using its own inputs as labels
        }

In [21]:
# Train/Validation Split
train_size = int(0.8 * len(df_subset))
train_df, val_df = df_subset[:train_size], df_subset[train_size:]

# Create Dataset
train_dataset = GPT2Dataset(train_df)
val_dataset = GPT2Dataset(val_df)

DistilGPT2 has 6 transformer blocks compared to GPT2, which has 12 transformer blocks. To perform transfer learning properly we will freeze all layers and unfreeze the last 2.

In [22]:
# Load the Pretrained GPT-2 Model with LM head
model = GPT2LMHeadModel.from_pretrained("distilgpt2")  # 50% smaller

# Freeze all layers initially
for param in model.parameters():
    param.requires_grad = False

# Unfreeze last 4 layers
for param in model.transformer.h[-2:].parameters():
    param.requires_grad = True

# Move Model to GPU
model.to(device)

loading configuration file config.json from cache at /home/adma224/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/config.json
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

##### Choose a batch size and num workers

In [23]:
train_loader = DataLoader(train_dataset, batch_size=8, num_workers=1, shuffle=True)  # Reduce num_workers
val_loader = DataLoader(val_dataset, batch_size=8, num_workers=1, shuffle=False)

print("Model and data loaders ready!")

Model and data loaders ready!


In [24]:
logging.basicConfig(level=logging.INFO)

# Set Hugging Face Transformers library to show debug logs
transformers.logging.set_verbosity_debug()

In [40]:
# Print selected device
print(f"🔥 Using device: {device}")

# Print GPU info
if device.type == "cuda":
    print(f"🚀 GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")
    print(f"💾 GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024 ** 2:.2f} MB")
    print(f"🔄 CUDA Version: {torch.version.cuda}")
else:
    print("🖥 Running on CPU")


🔥 Using device: cuda
🚀 GPU Name: NVIDIA GeForce RTX 3070 Ti Laptop GPU
💾 GPU Memory Allocated: 1916.61 MB
💾 GPU Memory Reserved: 3370.00 MB
🔄 CUDA Version: 12.1


Reducing Learning Rate
Why? A high learning rate can cause large weight updates, leading to overwriting GPT-2’s pre-trained knowledge.
How? Use a smaller learning rate than usual when fine-tuning.

In [26]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Where model checkpoints will be saved
    logging_dir="./logs",  # Directory for logging
    logging_strategy="steps",  # Log at every step
    logging_steps=50,  # Log every 50 steps
    report_to=["tensorboard"],  # Log to TensorBoard
    eval_strategy="epoch",  # Evaluate at each epoch
    save_strategy="epoch",  # Save model at each epoch
    save_total_limit=5,  # Keep only last 2 checkpoints
    disable_tqdm=False,  # Enable progress bars
    load_best_model_at_end=True,  # Load best model checkpoint at end
    fp16=True,  # Enable mixed precision for speed
    per_device_train_batch_size=8,  # Adjust batch size to prevent memory issues
    per_device_eval_batch_size=8,  # Same for evaluation
    gradient_accumulation_steps=1,  # Accumulate gradients before updating weights
    learning_rate=5e-5, # Lower than usual (default is 5e-4)
    weight_decay=0.01,  # Prevent drastic weight changes
    num_train_epochs=3
)

PyTorch: setting up devices


In [27]:
# Use Hugging Face Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print(trainer)

Using auto half precision backend


<transformers.trainer.Trainer object at 0x7ff925bb3d60>


In [28]:
# Measure start time
start_time = time.time()

print("🚀 Training started...")

# Start training
trainer.train()

# Measure end time
end_time = time.time()

🚀 Training started...


Currently training with a batch size of: 8
***** Running training *****
  Num examples = 80
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  Number of trainable parameters = 81,912,576


Epoch,Training Loss,Validation Loss
1,No log,2.286885
2,No log,0.521786
3,No log,0.511022



***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-10
Configuration saved in ./results/checkpoint-10/config.json
Configuration saved in ./results/checkpoint-10/generation_config.json
Model weights saved in ./results/checkpoint-10/model.safetensors

***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-20
Configuration saved in ./results/checkpoint-20/config.json
Configuration saved in ./results/checkpoint-20/generation_config.json
Model weights saved in ./results/checkpoint-20/model.safetensors
Saving model checkpoint to ./results/checkpoint-30
Configuration saved in ./results/checkpoint-30/config.json
Configuration saved in ./results/checkpoint-30/generation_config.json
Model weights saved in ./results/checkpoint-30/model.safetensors

***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-30
Configu

In [29]:
training_duration = end_time - start_time
# Print training duration
print(f"✅ Training completed in {training_duration:.2f} seconds ({training_duration/60:.2f} minutes)")

# Print final training state
print(f"📈 Final Epoch: {trainer.state.epoch}")
print(f"📊 Total Training Steps: {trainer.state.global_step}")


✅ Training completed in 41.57 seconds (0.69 minutes)
📈 Final Epoch: 3.0
📊 Total Training Steps: 30


##### Run this command in the terminal for training metrics in tensorboard

`tensorboard --logdir=./logs`

In [30]:
# Save model & tokenizer
model.save_pretrained("../models/gpt2_finetuned")
tokenizer.save_pretrained("../models/gpt2_finetuned")

print("Model saved successfully!")



Configuration saved in ../models/gpt2_finetuned/config.json
Configuration saved in ../models/gpt2_finetuned/generation_config.json
Model weights saved in ../models/gpt2_finetuned/model.safetensors
tokenizer config file saved in ../models/gpt2_finetuned/tokenizer_config.json
Special tokens file saved in ../models/gpt2_finetuned/special_tokens_map.json


Model saved successfully!


In [31]:
import torch
import numpy as np
from transformers import Trainer

# Define function to calculate perplexity
def compute_perplexity(eval_loss):
    return np.exp(eval_loss)  # Perplexity = exp(loss)

# Get evaluation loss from trainer
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = compute_perplexity(eval_loss)

print(f"📝 Evaluation Loss: {eval_loss:.4f}")
print(f"🔢 Perplexity: {perplexity:.4f}")



***** Running Evaluation *****
  Num examples = 20
  Batch size = 8


📝 Evaluation Loss: 0.5110
🔢 Perplexity: 1.6670


In [32]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnWhitespace(StoppingCriteria):
    def __call__(self, input_ids, scores, **kwargs):
        # Stop generation if the last 5 tokens are whitespace
        if all(tokenizer.decode(tok).isspace() for tok in input_ids[0, -5:]):
            return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnWhitespace()])


In [39]:
def generate_text(prompt, top_p=0.9, top_k=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    output = model.generate(
        input_ids, 
        max_length=100,
        min_length=20,
        do_sample=True,
        num_beams=10,
        length_penalty=1,
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,  # Ensure it doesn't stop early due to padding
        temperature=10.1,  # Increase randomness
        top_p=top_p,  # Nucleus sampling (limits probability mass)
        top_k=top_k,  # Only consider the top-k most likely words
        repetition_penalty=2.1,  # Avoid repeated spaces
        stopping_criteria=stopping_criteria
    )
    
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()  # Strip extra spaces

# Test with new settings
examples = [
    "Breaking news:", 
    "Latest update:", 
    "The president announced that",
    "Scientists have discovered a new",
    "Experts warn that",
    "A recent study suggests that",
    "Authorities have confirmed that",
    "In a surprising turn of events,",
    "The stock market responded to",
    "New regulations require companies to",
    "Health officials recommend that",
    "Technology companies are investing in",
    "Sports fans are excited about",
    "The weather forecast predicts",
    "Researchers at MIT have developed",
    "Protests erupted in the city over",
    "The Supreme Court ruled that",
    "Celebrities are reacting to",
    "A new breakthrough in medicine shows that",
    "The United Nations has issued a statement on"
]

for text in examples:
    print(f"📝 Input: {text}")
    print(f"🔮 Output: {generate_text(text)}\n")


📝 Input: Breaking news:
🔮 Output: Breaking news: Obama won't vote and that the GOP wants him to fight 'bailing out' Republican officials

📝 Input: Latest update:
🔮 Output: Latest update: A more detailed guide to learning the science and maths skills so that you know how they use all your knowledge on the go," he said

📝 Input: The president announced that
🔮 Output: The president announced that he won't meet with Congress on Tuesday" and added, "This was something I had very difficult time finding.

📝 Input: Scientists have discovered a new
🔮 Output: Scientists have discovered a new study in human sexual reproduction based on their sex lives for generations: how many women live each day - or how much they'll let themselves know what it is, even during the years before they die to avoid unwanted pregnancies."

📝 Input: Experts warn that
🔮 Output: Experts warn that it may be worth doing more for health and security if you do everything possible with it

📝 Input: A recent study suggests th

In [34]:
input_ids = tokenizer.encode("Breaking news:", return_tensors="pt").to(device)
print("🔎 Encoded Input:", input_ids)


🔎 Encoded Input: tensor([[29449,  1705,    25]], device='cuda:0')


In [37]:
import torch

def generate_with_token_probabilities(prompt, max_tokens=20):
    """Generate text token by token, displaying each step with probabilities."""
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    print(f"📝 Starting prompt: {prompt}")
    generated_text = prompt

    for _ in range(max_tokens):
        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs.logits  # Raw model outputs
        
        # Get probability distribution for next token
        probs = torch.softmax(logits[0, -1, :], dim=-1)  
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)

        # Print top token choices
        print(f"\n🔍 Token step {_+1}")
        for i in range(5):  # Show top 5 predictions
            token = tokenizer.decode([sorted_indices[i].item()])
            print(f"  {token}: {sorted_probs[i].item():.4f}")

        # Select most probable token
        next_token_id = sorted_indices[0].item()
        next_token = tokenizer.decode([next_token_id])

        # Append token to generated text
        generated_text += next_token
        input_ids = torch.cat([input_ids, torch.tensor([[next_token_id]]).to(device)], dim=1)

        # Stop if end token is reached
        if next_token == tokenizer.eos_token:
            print("\n🚫 End of text token reached.")
            break

    print("\n📝 Final generated text:")
    print(generated_text)

# Test the function
generate_with_token_probabilities("The United Nations has issued a statement on")



📝 Starting prompt: The United Nations has issued a statement on

🔍 Token step 1
   the: 0.2751
   its: 0.0788
   Tuesday: 0.0654
   Monday: 0.0595
   Wednesday: 0.0525

🔍 Token step 2
   situation: 0.0397
   issue: 0.0248
   matter: 0.0193
   crisis: 0.0193
   ongoing: 0.0176

🔍 Token step 3
   in: 0.4893
  ,: 0.0774
   and: 0.0642
  .: 0.0603
   with: 0.0402

🔍 Token step 4
   Syria: 0.4355
   Yemen: 0.0858
   Ukraine: 0.0474
   Iraq: 0.0431
   Gaza: 0.0405

🔍 Token step 5
  ,: 0.2934
   and: 0.1149
  .: 0.1079
  :: 0.0765
   that: 0.0526

🔍 Token step 6
   saying: 0.1526
   calling: 0.1117
   and: 0.0636
   which: 0.0465
   urging: 0.0465

🔍 Token step 7
   that: 0.2565
   the: 0.1709
   it: 0.1508
  :: 0.1036
   ": 0.0358

🔍 Token step 8
   the: 0.2524
   it: 0.0770
   ": 0.0546
   there: 0.0364
   Syria: 0.0242

🔍 Token step 9
   Syrian: 0.1489
   United: 0.0661
   government: 0.0499
   situation: 0.0377
   country: 0.0332

🔍 Token step 10
   government: 0.5813
   people: 0.0613
  