In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# Use the Apple-silicon MPS backend when PyTorch reports it as available,
# otherwise run everything on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Directory the fine-tuned model and tokenizer are loaded from.
OUTPUT_DIR = "../models/llm_forecaster/"
# CSV holding the validation prompt/completion pairs.
VAL_FILE = "../data/llm_preprocessed/val.csv"


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Restore the fine-tuned tokenizer and causal language model saved in OUTPUT_DIR.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
model = model.to(device)  # move the weights onto the selected device

# Switch to inference mode. Being the last expression of the cell, this also
# displays the module summary as the cell output.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
# Read the validation split from disk.
df = pd.read_csv(VAL_FILE)

# Preview the first five rows.
print(df.head())

# Each completion is stored as text such as "[0.2485874626185448]":
# drop the surrounding brackets and parse what is left as a float.
target = [float(raw_completion.strip('[]')) for raw_completion in df['completion']]

# Sliding-window settings used when building evaluation prompts.
window_size = 10  # number of past values shown to the model
stride = 1        # step between the starts of consecutive windows

def create_prompt(input_series):
    """Build the text prompt asking the model for the next value.

    Args:
        input_series: The past values to show the model. It is rendered with
            its default string formatting (a list appears as ``[a, b, ...]``).

    Returns:
        The prompt string ``"Given past values: <series>, predict next 1 value(s):"``.
    """
    template = "Given past values: {}, predict next 1 value(s):"
    return template.format(input_series)

                                              prompt            completion
0  Given past values: [0.3047697082252015, 0.3117...  [0.2485874626185448]
1  Given past values: [0.5322937958941605, 0.5322...  [0.5421366699247255]
2  Given past values: [0.4985924416716777, 0.5322...  [0.5435342320660694]
3  Given past values: [0.7078683091284076, 0.7359...  [0.6924151941585102]
4  Given past values: [0.2345518831235917, 0.2275...  [0.2120709917394176]


In [6]:

# Slide a window over `target`, ask the model for the value that follows each
# window, and collect (prediction, ground truth) pairs for scoring.
#
# NOTE(review): the windows are built from the `completion` column in file
# order. The rows previewed from the validation CSV do not look like
# consecutive windows of a single series, so these windows may not be real
# sequences -- confirm, or evaluate directly on the stored `prompt` column.

MAX_NEW_TOKENS = 10  # generation budget per forecast (same budget as prompt length + 10)

preds = []
trues = []
fallback_count = 0  # forecasts that could not be parsed and used the fallback below

# The ground truth for window i sits at index i + window_size, so the last
# usable start is len(target) - window_size - 1; range() excludes its stop
# value, hence no extra "- 1" here (the old bound dropped the final window).
for i in range(0, len(target) - window_size, stride):
    input_series = target[i:i+window_size]
    true_value = target[i+window_size]

    prompt = create_prompt(input_series)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
            # Avoids the "Setting `pad_token_id` to `eos_token_id`" warning
            # that generate() prints for open-end generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens instead of slicing the decoded
    # text by len(prompt): this does not rely on decode() reproducing the
    # prompt character-for-character, and skip_special_tokens keeps an
    # end-of-text marker from being glued onto the number.
    completion_text = tokenizer.decode(
        output[0][prompt_token_count:], skip_special_tokens=True
    )

    try:
        prediction_text = completion_text.strip().split()[0]
        prediction_value = float(prediction_text.replace(",", "").replace("[", "").replace("]", ""))
        if not np.isfinite(prediction_value):
            # "nan"/"inf" parse as floats but would break the metrics later.
            raise ValueError(f"non-finite prediction: {prediction_text!r}")
    except (IndexError, ValueError):
        # Empty or non-numeric generation: fall back to repeating the last
        # observed value, and count it so the fallback rate is visible.
        prediction_value = input_series[-1]
        fallback_count += 1

    preds.append(prediction_value)
    trues.append(true_value)

    if i % 100 == 0:
        print(f"Processed {i}/{len(target)} points")

print(f"Unparseable generations replaced by last observed value: {fallback_count}/{len(preds)}")

Processed 0/3482 points
Processed 100/3482 points
Processed 200/3482 points
Processed 300/3482 points
Processed 400/3482 points
Processed 500/3482 points
Processed 600/3482 points
Processed 700/3482 points
Processed 800/3482 points
Processed 900/3482 points
Processed 1000/3482 points
Processed 1100/3482 points
Processed 1200/3482 points
Processed 1300/3482 points
Processed 1400/3482 points
Processed 1500/3482 points
Processed 1600/3482 points
Processed 1700/3482 points
Processed 1800/3482 points
Processed 1900/3482 points
Processed 2000/3482 points
Processed 2100/3482 points
Processed 2200/3482 points
Processed 2300/3482 points
Processed 2400/3482 points
Processed 2500/3482 points
Processed 2600/3482 points
Processed 2700/3482 points
Processed 2800/3482 points
Processed 2900/3482 points
Processed 3000/3482 points
Processed 3100/3482 points
Processed 3200/3482 points
Processed 3300/3482 points
Processed 3400/3482 points


In [7]:

# Score the forecasts (MSE, MAE, SMAPE) and save the numbers as a small
# text-only PNG.

# Convert once so every metric works on the same float arrays.
preds_arr = np.asarray(preds, dtype=float)
trues_arr = np.asarray(trues, dtype=float)

# Calculate metrics
mse = mean_squared_error(trues_arr, preds_arr)
mae = mean_absolute_error(trues_arr, preds_arr)

# SMAPE = mean(2 * |pred - true| / (|pred| + |true|)) * 100.
# When prediction and truth are both exactly 0 the denominator is 0; that term
# is a perfect forecast, so it contributes 0 instead of turning the mean into NaN.
abs_error = np.abs(preds_arr - trues_arr)
scale = np.abs(preds_arr) + np.abs(trues_arr)
smape_terms = np.divide(2 * abs_error, scale, out=np.zeros_like(scale), where=scale != 0)
smape = np.mean(smape_terms) * 100

# Create a figure for the evaluation results
fig, ax = plt.subplots(figsize=(6, 4))
ax.axis('off')  # Turn off the axes; the figure only carries text

# Add text with evaluation metrics
text = f"MSE: {mse:.6f}\nMAE: {mae:.6f}\nSMAPE: {smape:.2f}%"
ax.text(0.5, 0.5, text, fontsize=12, ha='center', va='center', wrap=True)

# Save the figure as a PNG file
# NOTE(review): this path goes up two directories ("../../") while the model
# and data paths in this notebook go up one ("../") -- confirm it is intended.
output_png_path = "../../outputs/llm_forecaster_evaluation_results.png"
fig.savefig(output_png_path, bbox_inches='tight')
plt.close(fig)

print(f"Evaluation results saved as PNG to {output_png_path}")

Evaluation results saved as PNG to ../../outputs/llm_forecaster_evaluation_results.png
