In [6]:
import logging
import os
import time

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import TrainingArguments

In [2]:
# Read the cleaned headlines CSV, taking its first column as the row index
df = pd.read_csv(
    "../data/cleaned_dataset.csv",
    index_col=0,
)

# Preview the top rows as a quick sanity check
df.head()

Unnamed: 0,headline
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...


In [3]:


# Build the tokenizer from the "gpt2" checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# No padding token is configured by default, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Discard every row whose 'headline' value is missing
df = df.dropna(axis=0, subset=["headline"])


In [5]:
# Encode each headline into a list of token ids
df["tokenized"] = df["headline"].map(
    lambda text: tokenizer.encode(text, add_special_tokens=True)
)

# Length of the longest encoded headline; every row is padded up to this
max_length = max(map(len, df["tokenized"]))

# Right-pad each id list with the pad token id so all rows share one length
df["padded"] = df["tokenized"].map(
    lambda ids: ids + [tokenizer.pad_token_id] * (max_length - len(ids))
)

# Stack the padded rows into a single integer tensor
input_ids = torch.tensor(list(df["padded"]), dtype=torch.long)

# Persist the frame (including the new columns) as JSON Lines, one record per line
df.to_json(
    "../data/gpt2_dataset.json",
    orient="records",
    lines=True,
)

In [None]:
# Preview only the first rows: each row now also carries its full
# "tokenized" and "padded" id lists, so dumping the whole frame is noisy.
df.head()

In [None]:
# Draw a reproducible random sample of 20 rows (fixed seed 42)
df_subset = df.sample(
    n=20,
    random_state=42,
)

In [None]:
class GPT2Dataset(Dataset):
    """Causal language-modelling dataset built from pre-padded token id lists.

    Args:
        df_subset: DataFrame with a ``"padded"`` column whose entries are
            equal-length lists of token ids.
        pad_token_id: Id that marks padding positions. When omitted, the
            notebook-level ``tokenizer.pad_token_id`` is used.

    Each item is a dict with ``input_ids``, ``attention_mask`` (1 for real
    tokens, 0 for padding) and ``labels``.
    """

    # Label value that the cross-entropy loss skips (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, df_subset, pad_token_id=None):
        if pad_token_id is None:
            pad_token_id = tokenizer.pad_token_id

        self.input_ids = torch.tensor(df_subset["padded"].tolist(), dtype=torch.long)
        self.attention_mask = (self.input_ids != pad_token_id).long()  # Mask padding tokens

        # The inputs double as labels, but padding positions are replaced with
        # IGNORE_INDEX so they do not contribute to the loss. masked_fill is
        # out-of-place, so input_ids itself is left untouched.
        # NOTE(review): in this notebook the pad token is the EOS token, so no
        # EOS target remains after masking — confirm that is acceptable.
        self.labels = self.input_ids.masked_fill(
            self.attention_mask == 0, self.IGNORE_INDEX
        )

    def __len__(self):
        """Return the number of sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors for sequence ``idx`` as a dict."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

# Train/Validation Split
# Slice the sampled subset itself (first 80% of its rows for training, the
# rest for validation). Slicing the full `df` here would put almost the
# whole dataset into the validation set.
train_size = int(0.8 * len(df_subset))
train_df, val_df = df_subset[:train_size], df_subset[train_size:]

# Create Dataset
train_dataset = GPT2Dataset(train_df)
val_dataset = GPT2Dataset(val_df)

In [None]:
# Limit CPU usage
os.environ["OMP_NUM_THREADS"] = "4"  # Limit CPU threads
os.environ["MKL_NUM_THREADS"] = "4"
# torch is already imported by the time this cell runs, and these variables
# are typically read when the numeric libraries initialise, so also cap
# PyTorch's own thread count explicitly.
torch.set_num_threads(4)

# Set device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


##### Choose a batch size and num workers

In [None]:
# Batches of 8 with a single worker process each; only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    num_workers=1,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    num_workers=1,
    shuffle=False,
)

In [None]:
import torch

torch.backends.cudnn.benchmark = False  # Turn off the cuDNN benchmark setting

# Cap this process at 60% of GPU 0's memory. Skipped when CUDA is unavailable,
# because the notebook falls back to CPU and the call would fail there.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.6, device=0)


In [None]:
# Pick the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pre-trained language-model-head weights from the "distilgpt2" checkpoint
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Move the model onto the selected device
model.to(device)



In [None]:
# Rebuild both loaders with batch size 8 and the default worker setting;
# only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=False,
)

print("Model and data loaders ready!")


In [None]:
# Only touch the CUDA allocator when a GPU is actually available; the
# notebook otherwise falls back to CPU, where the memory-fraction call fails.
if torch.cuda.is_available():
    # Free unused memory
    torch.cuda.empty_cache()

    # Limit memory usage to 50% of GPU 0's capacity
    torch.cuda.set_per_process_memory_fraction(0.5, device=0)


In [None]:
import torch

# Report which device was selected
print(f"🔥 Using device: {device}")

# On CPU there is nothing more to report; on CUDA, show details for GPU 0
if device.type != "cuda":
    print("🖥 Running on CPU")
else:
    print(f"🚀 GPU Name: {torch.cuda.get_device_name(0)}")
    # Byte counts are divided by 1024 ** 2 to express them in MB
    print(f"💾 GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024 ** 2:.2f} MB")
    print(f"💾 GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024 ** 2:.2f} MB")
    print(f"🔄 CUDA Version: {torch.version.cuda}")


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Where model checkpoints will be saved
    logging_dir="./logs",  # Directory for logging
    logging_strategy="steps",  # Log on a step-count schedule...
    logging_steps=50,  # ...every 50 steps
    report_to=["tensorboard"],  # Log to TensorBoard
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate at each epoch
    save_strategy="epoch",  # Save model at each epoch
    save_total_limit=2,  # Keep at most 2 checkpoints
    disable_tqdm=False,  # Enable progress bars
    load_best_model_at_end=True,  # Load best model checkpoint at end
    # Mixed precision only when CUDA is available: the notebook falls back to
    # CPU otherwise, where fp16 training cannot be used.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=8,  # Adjust batch size to prevent memory issues
    per_device_eval_batch_size=8,  # Same for evaluation
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 batches before updating weights
)

In [None]:
# Configure Python's root logger to emit records at INFO level and above
logging.basicConfig(level=logging.INFO)

# Set Hugging Face Transformers library to show debug logs.
# This needs the `transformers` package itself bound as a module name,
# not just the classes imported from it.
transformers.logging.set_verbosity_debug()

In [None]:
# Use Hugging Face Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Timestamp taken just before training begins
start_time = time.time()
print("🚀 Training started...")

# Run the fine-tuning loop
trainer.train()

# Timestamp taken right after training returns
end_time = time.time()

In [None]:
# Elapsed training time in seconds: difference of the two time.time() stamps
# taken immediately before and after trainer.train().
training_duration = end_time - start_time

In [None]:

# Report elapsed time in both seconds and minutes
print(f"✅ Training completed in {training_duration:.2f} seconds ({training_duration/60:.2f} minutes)")

# GPU memory figures, converted from bytes to MB; skipped without CUDA
if torch.cuda.is_available():
    allocated_memory = torch.cuda.memory_allocated() / (1024 * 1024)
    print(f"💾 GPU Memory Allocated: {allocated_memory:.2f} MB")
    reserved_memory = torch.cuda.memory_reserved() / (1024 * 1024)
    print(f"💾 GPU Memory Reserved: {reserved_memory:.2f} MB")

# Epoch and step counters recorded in the trainer's state
print(f"📈 Final Epoch: {trainer.state.epoch}")
print(f"📊 Total Training Steps: {trainer.state.global_step}")


#### Run this command in a terminal to view the training metrics in TensorBoard

`tensorboard --logdir=./logs`

In [None]:
# Write the model and then the tokenizer into the same output directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("../models/gpt2_finetuned")

print("Model saved successfully!")



In [None]:
def generate_text(prompt, max_length=50):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: Text to encode and feed to the model.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Encode the prompt as a tensor and move it to the model's device
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    prompt_ids = prompt_ids.to(device)

    # The EOS token id is passed as the pad token id for generation
    sequences = model.generate(
        prompt_ids,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Try the model on a few headline-style prompts
examples = ["Breaking news:", "Latest update:", "The president announced that"]
for text in examples:
    print(f"📝 Input: {text}")
    generated = generate_text(text)
    print(f"🔮 Output: {generated}\n")

