In [8]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
import os

import wandb

os.environ["WANDB_API_KEY"] = "e3facbc53138b2f4cbe5e555094d03d6a777d916"

In [9]:
#initialize wandb
# Initialize Weights & Biases
wandb.init(project="llm-forecaster-optimization", name="gpt2-finetuning-optimization")



[34m[1mwandb[0m: Currently logged in as: [33m24832901044[0m ([33m24832901044-gazi-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Config
MODEL_NAME = "gpt2"  # or "gpt2-medium" etc.
DATA_DIR = "../data/llm_preprocessed/"
OUTPUT_DIR = "../models/llm_forecaster_optimized/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [11]:
# Dataset class
class TimeSeriesPromptDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=256):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data.iloc[idx]['prompt']
        completion = self.data.iloc[idx]['completion']
        text = prompt + " " + completion
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()
        labels = input_ids.clone()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [12]:

# Device detection
device = torch.device("mps") if torch.backends.mps.is_available() else \
         torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Using device: mps


In [13]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # for padding
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
# Prepare datasets and dataloaders
train_dataset = TimeSeriesPromptDataset(f"{DATA_DIR}/train.csv", tokenizer)
val_dataset = TimeSeriesPromptDataset(f"{DATA_DIR}/val.csv", tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_loss:.4f}")

    # Log to Weights & Biases
    wandb.log({"epoch": epoch + 1, "loss": avg_loss})

    wandb.config.update({
        "model_name": MODEL_NAME,
        "epochs": epochs,
        "batch_size": 8,
        "learning_rate": 5e-5,
        "data_dir": DATA_DIR,
        "output_dir": OUTPUT_DIR
    })
    wandb.watch(model, log="all")

    # Optional: add evaluation on val set here

# Save the fine-tuned model and tokenizer
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ LLM fine-tuning complete. Model saved to {OUTPUT_DIR}")


Epoch 1/5 - Training loss: 0.5159
Epoch 2/5 - Training loss: 0.1902
Epoch 3/5 - Training loss: 0.1731
