In [1]:

import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 🔧 Config — the knobs a reader is most likely to change
# Directory holding the pre-split prompt/completion CSV files.
DATA_DIR = "../data/llm_preprocessed/"
# Directory that receives checkpoints, logs and the final fine-tuned model.
OUTPUT_DIR = "../models/llm_forecaster/"
# Checkpoint to fine-tune. Alternatives suggested by the author:
# "gpt2-medium", "TinyLlama", or "EleutherAI/gpt-neo-125M"
# (the loading cell must support the architecture of whatever is chosen here).
MODEL_NAME = "gpt2"


# ✅ Create the output directory up front; nothing happens if it already exists
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [4]:
# 📁 1. Dataset class using prompt + completion
class TimeSeriesPromptDataset(Dataset):
    """Causal-LM fine-tuning dataset backed by a CSV of prompt/completion pairs.

    The CSV must contain a ``prompt`` and a ``completion`` column. Indexing a
    row joins the two with a single space, tokenizes the result, and
    pads/truncates it to ``max_length`` tokens.

    Args:
        file_path: Path of the CSV file; loaded eagerly with ``pandas.read_csv``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")``. It must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors that carry a
            leading batch axis of size 1.
        max_length: Fixed token length of every returned example.
    """

    # Label value used for padding positions. -100 is the default
    # ``ignore_index`` of PyTorch's cross-entropy loss, so those positions do
    # not contribute to the training loss.
    IGNORE_INDEX = -100

    def __init__(self, file_path, tokenizer, max_length=256):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the CSV."""
        return len(self.data)

    @staticmethod
    def _as_text(value, column, idx):
        """Coerce one CSV cell to ``str``, rejecting missing values explicitly."""
        if isinstance(value, str):
            return value
        if pd.isna(value):
            raise ValueError(f"Row {idx} has a missing value in column '{column}'")
        # pandas infers numeric dtypes for purely numeric columns (common for
        # forecast targets); string concatenation below needs real text.
        return str(value)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return model-ready tensors.

        Returns:
            dict with 1-D tensors of length ``max_length``:
            ``input_ids``, ``attention_mask`` and ``labels``. ``labels`` is a
            copy of ``input_ids`` with every position where
            ``attention_mask == 0`` replaced by ``IGNORE_INDEX``.

        Raises:
            ValueError: if the prompt or completion cell of the row is missing.
            KeyError: if the CSV lacks a ``prompt`` or ``completion`` column.
        """
        row = self.data.iloc[idx]
        prompt = self._as_text(row["prompt"], "prompt", idx)
        completion = self._as_text(row["completion"], "completion", idx)
        full_text = prompt + " " + completion

        encodings = self.tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop only the batch axis; a bare squeeze() would also collapse the
        # sequence axis when max_length == 1.
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)

        # Mask by attention rather than by token id: the pad token may share
        # its id with a real token (e.g. when EOS is reused for padding).
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

In [5]:
# 📚 2. Load tokenizer and model
# The Auto* factories resolve the tokenizer/model classes from the checkpoint's
# own config, so MODEL_NAME is not restricted to GPT-2 checkpoints.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # GPT-2 ships without a pad token; reuse EOS so padding="max_length" works.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 

In [None]:
# 📊 3. Prepare datasets using pre-split LLM files
# os.path.join handles DATA_DIR with or without a trailing separator, so the
# resulting paths never contain a doubled "/".
train_dataset = TimeSeriesPromptDataset(os.path.join(DATA_DIR, "train.csv"), tokenizer)
val_dataset = TimeSeriesPromptDataset(os.path.join(DATA_DIR, "val.csv"), tokenizer)

In [None]:
# ⚙️ 4. Training Arguments (compatible with transformers 4.51.3 or lower)
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    report_to="none",  # no external reporting (W&B, Hub)
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    # --- batching ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- checkpointing and logging cadence ---
    logging_steps=500,
    save_steps=500,
    save_total_limit=2,
)

In [None]:
# 🧠 5. Trainer setup
# Wires the model, the run configuration and both data splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # mlm=False: causal (next-token) language modeling rather than masked LM
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# 🚀 6. Train, save, then evaluate
trainer.train()
# Persist the model before evaluating so a failure there cannot lose the weights.
trainer.save_model(OUTPUT_DIR)
print(f"✅ LLM Fine-tuning complete. Model saved to {OUTPUT_DIR}")

# The validation split is passed to the Trainer, but the training arguments
# set no evaluation schedule, so it is never scored unless we ask explicitly.
eval_metrics = trainer.evaluate()
print(f"📉 Validation metrics: {eval_metrics}")