# Inference of baseline small LLM (Llama 3.1 8B) on stock price data

## 1. Environment Setup and Dependencies

In [1]:
# Install required packages (run once)
#!pip install -r ../requirements.txt

In [2]:
# Install Hugging Face packages (run once if using local Llama)
# !pip install transformers accelerate bitsandbytes torch

In [3]:
# Import libraries
import os
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Standard library
import time
import pickle

# Environment variables
from dotenv import load_dotenv

# HTTP requests for HF endpoint
import requests

# # Machine Learning
# from sklearn.svm import SVR
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
# from xgboost import XGBRegressor

# Deep Learning
import torch
import torch.nn as nn

# Reinforcement Learning
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Progress bar
from tqdm import tqdm

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)


print("All libraries imported successfully!")

All libraries imported successfully!


## 2. Hugging Face Dedicated Endpoint Configuration

In [4]:
# Load environment variables
load_dotenv('../.env')

# LLM Configuration
MAX_TOKENS = 1024
TEMPERATURE = 0.0

# Hugging Face Dedicated Endpoint
HF_ENDPOINT_URL = "https://o988k6zvcj6ifd2u.us-east-1.aws.endpoints.huggingface.cloud"

# Get HF token
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    raise ValueError("HF_TOKEN not found in .env file. Get token from: https://huggingface.co/settings/tokens")

print(f"‚úÖ Hugging Face Dedicated Endpoint configured!")
print(f"   Endpoint: {HF_ENDPOINT_URL}")
print(f"   Model: Llama 3.1 8B Instruct")
print(f"   Max Tokens: {MAX_TOKENS}")

print(f"   Temperature: {TEMPERATURE}")
print(f"   Rate limits: UNLIMITED! üéâ")

‚úÖ Hugging Face Dedicated Endpoint configured!
   Endpoint: https://o988k6zvcj6ifd2u.us-east-1.aws.endpoints.huggingface.cloud
   Model: Llama 3.1 8B Instruct
   Max Tokens: 1024
   Temperature: 0.0
   Rate limits: UNLIMITED! üéâ


## 3. Data Loading and Preprocessing

In [5]:
# Load datasets
def load_jsonl(filepath):
    """Load JSONL file"""
    data = []
    with open(filepath, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

# Load train, val, test data
train_data = load_jsonl('../finetune_paper/train.jsonl')
val_data = load_jsonl('../finetune_paper/val.jsonl')
test_data = load_jsonl('../finetune_paper/test.jsonl')

# Load supervised labels
all_labels = pd.read_csv('../finetune_paper/all_supervised_price_labels.csv')

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")
print(f"\nAll labels shape: {all_labels.shape}")
print(f"\nStocks in dataset: {all_labels['ticker'].unique()}")

Training samples: 8698
Validation samples: 1243
Test samples: 2477

All labels shape: (12418, 16)

Stocks in dataset: ['AAPL' 'HSBC' '0700.HK' 'PEP' '7203.T']


In [6]:
# Display sample data
print("Sample training data:")
print(f"Prompt (first 500 chars): {train_data[0]['prompt'][:500]}...")
print(f"\nResponse: {train_data[0]['response']}")

print("\n" + "="*80 + "\n")
print("Sample supervised labels:")
all_labels.head()

Sample training data:
Prompt (first 500 chars): You are a financial analyst with expertise in stock market forecasting.
Your task is to analyze market data and predict the next trading day stock price.
Use historical price trends, technical indicators, and sentiment analysis to provide an informed forecast.
Ensure that your predictions are well-justified, considering multiple financial factors.

‚Ä¢ Predicted Stock Price: The forecasted close price for the next trading day.
‚Ä¢ Price Movement Likelihood: The likelihood of the predicted stock pric...

Response: {"predicted_close": 27.18000030517578, "likelihood": 0.5, "justification": "n/a"}


Sample supervised labels:


Unnamed: 0,Date,SMA_20,SMA_50,EMA_12,EMA_26,RSI_14,MACD,MACD_signal,MACD_hist,BB_width_20_2,headline_count,sent_compound_mean,titles_joined,next_close,confidence_proxy,ticker
0,2015-01-16 00:00:00+00:00,,,27.159062,27.234398,13.536208,-0.075335,-0.01569,-0.059645,,4.0,-0.07955,,27.18,0.5,AAPL
1,2015-01-16 00:00:00+00:00,,,45.765558,46.231136,4.645025,-0.465578,-0.348537,-0.117041,,6.0,0.308567,Which London business pays the highest busines...,45.360001,0.9,HSBC
2,2015-01-16 00:00:00+00:00,,,113.078837,109.846862,68.406756,3.231975,2.607665,0.624309,,1.0,0.0,,113.388344,0.5,0700.HK
3,2015-01-16 00:00:00+00:00,,,96.059458,95.400737,36.54659,0.658721,0.41146,0.247261,,10.0,0.08298,"Audrey P. ""Pep"" Landry Obituary January 16, 20...",97.510002,0.5,PEP
4,2015-01-19 00:00:00+00:00,,,113.126453,110.109194,70.079261,3.017259,2.689584,0.327675,,1.0,0.3612,WeChat apologizes for showering Chinese users ...,114.402382,0.5,0700.HK


In [7]:
# Parse test data for evaluation
POSITIVE_JUSTIFICATION_KEYWORDS = {
    "increase", "growth", "upward", "bullish", "positive", "gain", "improve", "strength", "rally", "optimistic"
}
NEGATIVE_JUSTIFICATION_KEYWORDS = {
    "decrease", "decline", "downward", "bearish", "negative", "loss", "drop", "weakness", "sell", "pessimistic"
}
RISK_JUSTIFICATION_KEYWORDS = {
    "volatility", "volatile", "risk", "uncertain", "uncertainty", "caution", "concern", "warning", "downside"
}

def parse_prompt_data(prompt_text):
    """Extract key information from prompt"""
    lines = prompt_text.split('\n')
    data = {}
    
    for line in lines:
        if 'TICKER:' in line:
            data['ticker'] = line.split('TICKER:')[1].strip()
        elif 'DATE:' in line:
            data['date'] = line.split('DATE:')[1].strip()
        elif 'RECENT CLOSING PRICES' in line:
            prices_line = lines[lines.index(line) + 1]
            if prices_line.strip():
                data['recent_prices'] = [float(p.strip()) for p in prices_line.split(',') if p.strip()]
    
    return data

def safe_float(value, default=0.0) -> float:
    try:
        return float(value)
    except (TypeError, ValueError):
        return float(default)

def extract_justification_features(justification: str) -> Dict[str, float]:
    base = {
        "justification_pos_ratio": 0.0,
        "justification_neg_ratio": 0.0,
        "justification_risk_ratio": 0.0,
        "justification_polarity": 0.0,
        "justification_length": 0.0,
    }
    if not justification:
        return base.copy()
    tokens = re.findall(r"[a-zA-Z']+", justification.lower())
    token_count = max(len(tokens), 1)
    pos_hits = sum(token in POSITIVE_JUSTIFICATION_KEYWORDS for token in tokens)
    neg_hits = sum(token in NEGATIVE_JUSTIFICATION_KEYWORDS for token in tokens)
    risk_hits = sum(token in RISK_JUSTIFICATION_KEYWORDS for token in tokens)
    base.update({
        "justification_pos_ratio": float(pos_hits / token_count),
        "justification_neg_ratio": float(neg_hits / token_count),
        "justification_risk_ratio": float(risk_hits / token_count),
        "justification_polarity": float((pos_hits - neg_hits) / token_count),
        "justification_length": float(np.log1p(token_count)),
    })
    return base

# Parse test data
test_parsed = []
for item in test_data:
    parsed = parse_prompt_data(item['prompt'])
    response = json.loads(item['response'])
    parsed['predicted_close'] = response['predicted_close']
    parsed['likelihood'] = response['likelihood']
    test_parsed.append(parsed)

test_df = pd.DataFrame(test_parsed)
print(f"Parsed test data shape: {test_df.shape}")
test_df.head()


Parsed test data shape: (2477, 4)


Unnamed: 0,ticker,date,predicted_close,likelihood
0,HSBC,2023-01-03,32.68,0.9
1,0700.HK,2023-01-03,342.870056,0.5
2,PEP,2023-01-03,178.970001,0.9
3,AAPL,2023-01-03,126.360001,0.5
4,7203.T,2023-01-04,1807.5,0.7


## 4. Stage 1: LLM-Based Stock Price Prediction

In [23]:
def llm_predict_stock_price(prompt: str, retries: int = 3) -> Dict:
    """Use Hugging Face Dedicated Endpoint to predict stock price with retries."""
    for attempt in range(retries):
        try:
            headers = {
                "Accept": "application/json",
                "Authorization": f"Bearer {hf_token}",
                "Content-Type": "application/json"
            }
            
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": MAX_TOKENS,
                    "temperature": TEMPERATURE if TEMPERATURE > 0 else 0.1,
                    "return_full_text": False
                }
            }
            
            response = requests.post(
                HF_ENDPOINT_URL,
                headers=headers,
                json=payload,
                timeout=30
            )

            print(response.text)
            
            if response.status_code != 200:
                print(f"HF Endpoint Error (Attempt {attempt + 1}/{retries}): {response.status_code} - {response.text}")
                if attempt < retries - 1:
                    time.sleep(2) # Wait 2 seconds before retrying
                    continue
                else:
                    return {"predicted_close": None, "likelihood": 0.5, "justification": f"API Error: {response.status_code}"}
            
            result_data = response.json()
            print(result_data[0].get('generated_text', ''))
            
            # Extract generated text
            if isinstance(result_data, list) and len(result_data) > 0:
                content = result_data[0].get('generated_text', '')
            elif isinstance(result_data, dict):
                content = result_data.get('generated_text', result_data.get('text', ''))
            else:
                content = str(result_data)
            
            # Parse JSON response
            if '{' in content and '}' in content:
                json_start = content.index('{')
                json_end = content.rindex('}') + 1
                json_str = content[json_start:json_end]
                
                try:
                    result = json.loads(json_str)
                    
                    # Validate required fields
                    if 'predicted_close' not in result:
                        result['predicted_close'] = None
                    if 'likelihood' not in result:
                        result['likelihood'] = 0.5
                    if 'justification' not in result:
                        result['justification'] = ''
                        
                    return result # Success
                except json.JSONDecodeError as je:
                    print(f"JSON parse error (Attempt {attempt + 1}/{retries}), attempting manual extraction: {je}")
                    
                    # Try to extract values manually
                    pred_match = re.search(r'"predicted_close"\s*:\s*([0-9.]+)', json_str)
                    likelihood_match = re.search(r'"likelihood"\s*:\s*([0-9.]+)', json_str)
                    
                    if pred_match:
                        return {
                            "predicted_close": float(pred_match.group(1)),
                            "likelihood": float(likelihood_match.group(1)) if likelihood_match else 0.5,
                            "justification": "Manually extracted from malformed JSON"
                        }
                    else:
                        # Manual extraction failed, retry if possible
                        if attempt < retries - 1:
                            time.sleep(2)
                            continue
                        else:
                            return {"predicted_close": 0.0, "likelihood": 0.5, "justification": f"JSON parse error: {str(je)}"}
            else:
                # No JSON found, retry if possible
                if attempt < retries - 1:
                    time.sleep(2)
                    continue
                else:
                    return {"predicted_close": 0.0, "likelihood": 0.5, "justification": "No JSON found in response"}

        except Exception as e:
            print(f"Error in HF endpoint prediction (Attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                time.sleep(2) # Wait 2 seconds before retrying
                continue
            else:
                return {"predicted_close": 0.0, "likelihood": 0.5, "justification": str(e)}
    
    # This is the fallback if all retries fail
    return {"predicted_close": 0.0, "likelihood": 0.5, "justification": "All retry attempts failed."}


# Test HF Endpoint
print("üß™ Testing Hugging Face Dedicated Endpoint with a sample prediction...")
print("="*80)
print("Sample prompt (first 300 chars):")
sample_prompt = test_data[0]['prompt']
print(sample_prompt[:300] + "...\n")

print("‚è∞ Generating prediction...")
start_time = time.time()
llm_result = llm_predict_stock_price(sample_prompt)
elapsed = time.time() - start_time

print(f"\n‚è±Ô∏è Inference time: {elapsed:.2f} seconds")
print("\nHF Endpoint Prediction Result:")
print(json.dumps(llm_result, indent=2))

actual_response = json.loads(test_data[0]['response'])
print(f"\nActual Target Price: {actual_response['predicted_close']}")
print(f"\n‚úÖ HF Dedicated Endpoint is working!")
print(f"üí° Speed: ~{elapsed:.1f}s per prediction")
print(f"üí° No rate limits - run unlimited predictions!")
print("="*80)

üß™ Testing Hugging Face Dedicated Endpoint with a sample prediction...
Sample prompt (first 300 chars):
You are a financial analyst with expertise in stock market forecasting.
Your task is to analyze market data and predict the next trading day stock price.
Use historical price trends, technical indicators, and sentiment analysis to provide an informed forecast.
Ensure that your predictions are well-j...

‚è∞ Generating prediction...
[{"generated_text":" {\"predicted_close\": 31.6300, \"likelihood\": 0.8, \"justification\": \"Based on recent closing prices, technical indicators, and sentiment analysis, the predicted close price for HSBC on 2023-01-03 is 31.6300. The likelihood of this prediction is 0.8. The justification for this prediction is as follows: The recent closing prices indicate a bullish trend, with the stock price increasing over the past few days. The technical indicators, such as the SMA, EMA, RSI, MACD, and Bollinger Bands, also suggest a bullish trend. The sentiment 

### 4.1 Run LLM Inference on Training Data

We'll generate LLM predictions for the training dataset to use for PPO training later.

In [9]:
# Run LLM predictions on TRAINING data with checkpointing
checkpoint_file_train = '../results/llm_predictions_train_checkpoint.json'

# Load existing checkpoint if available
if os.path.exists(checkpoint_file_train):
    print(f"Loading existing training checkpoint from {checkpoint_file_train}")
    with open(checkpoint_file_train, 'r') as f:
        checkpoint = json.load(f)
    train_llm_predictions = checkpoint['predictions']
    train_actual_prices = checkpoint['actual_prices']
    train_llm_results = checkpoint.get('llm_results', [])  # Full LLM responses
    start_idx = checkpoint['last_idx'] + 1
    print(f"Resuming from index {start_idx}/{len(train_data)}")
else:
    train_llm_predictions = []
    train_actual_prices = []
    train_llm_results = []
    start_idx = 0
    print("Starting fresh LLM predictions on training data...")

# Run LLM predictions
print(f"\nüîÑ Generating LLM predictions for {len(train_data)} TRAINING samples...")
print("‚è∞ This will take considerable time. You can stop and resume later.")

for idx in tqdm(range(start_idx, len(train_data)), desc="Training LLM Inference"):
    item = train_data[idx]
    
    try:
        # Get LLM prediction
        llm_result = llm_predict_stock_price(item['prompt'])
        
        # Store full LLM result (including justification)
        train_llm_results.append(llm_result)
        
        if llm_result['predicted_close'] is not None and float(llm_result['predicted_close']) > 0:
            train_llm_predictions.append(llm_result['predicted_close'])
        else:
            # Fallback to 0.0 if prediction is invalid
            train_llm_predictions.append(0.0)
        
        response = json.loads(item['response'])
        train_actual_prices.append(response['predicted_close'])
        
        # # Delay to avoid rate limiting
        time.sleep(0.5)

        # Checkpoint every 50 samples
        if (idx + 1) % 50 == 0:
            checkpoint = {
                'predictions': train_llm_predictions,
                'actual_prices': train_actual_prices,
                'llm_results': train_llm_results,  # Full LLM responses with justification
                'last_idx': idx
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file_train, 'w') as f:
                json.dump(checkpoint, f, indent=2)
    
    except Exception as e:
        error_msg = str(e)
        
        if 'rate_limit' in error_msg.lower() or 'too many requests' in error_msg.lower():
            print(f"\n‚ùå RATE LIMIT HIT at index {idx}!")
            print(f"Saving checkpoint and stopping execution...")
            checkpoint = {
                'predictions': train_llm_predictions,
                'actual_prices': train_actual_prices,
                'llm_results': train_llm_results,
                'last_idx': idx - 1
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file_train, 'w') as f:
                json.dump(checkpoint, f, indent=2)
            print(f"‚úÖ Checkpoint saved to: {checkpoint_file_train}")
            print(f"üìä Progress: {idx}/{len(train_data)} samples completed")
            print(f"üí° Run this cell again to resume from where you left off.")
            break  # Stop execution
        else:
            print(f"\n‚ö†Ô∏è Error at index {idx}: {error_msg}")
            # Store error result
            error_result = {"predicted_close": None, "likelihood": 0.5, "justification": f"Error: {error_msg}"}
            train_llm_results.append(error_result)
            # Use 0.0 as fallback for prediction
            train_llm_predictions.append(0.0)
            # Still append the actual price for comparison
            response = json.loads(item['response'])
            train_actual_prices.append(response['predicted_close'])

# Final save
checkpoint = {
    'predictions': train_llm_predictions,
    'actual_prices': train_actual_prices,
    'llm_results': train_llm_results,
    'last_idx': len(train_llm_predictions) - 1,
    'completed': len(train_llm_predictions) == len(train_data)
}
with open(checkpoint_file_train, 'w') as f:
    json.dump(checkpoint, f, indent=2)

if len(train_llm_predictions) == len(train_data):
    print(f"\n‚úÖ Training LLM predictions completed: {len(train_llm_predictions)} samples")
else:
    print(f"\n‚ö†Ô∏è Partial completion: {len(train_llm_predictions)}/{len(train_data)} samples")
print(f"Checkpoint saved to: {checkpoint_file_train}")

Loading existing training checkpoint from ../results/llm_predictions_train_checkpoint.json
Resuming from index 8698/8698

üîÑ Generating LLM predictions for 8698 TRAINING samples...
‚è∞ This will take considerable time. You can stop and resume later.


Training LLM Inference: 0it [00:00, ?it/s]


‚úÖ Training LLM predictions completed: 8698 samples
Checkpoint saved to: ../results/llm_predictions_train_checkpoint.json





### 4.2 Run LLM Inference on Validation Data

Generate predictions for validation data (used for PPO training).

In [10]:
# Run LLM predictions on VALIDATION data with checkpointing
checkpoint_file_val = '../results/llm_predictions_val_checkpoint.json'

if os.path.exists(checkpoint_file_val):
    print(f"Loading existing validation checkpoint from {checkpoint_file_val}")
    with open(checkpoint_file_val, 'r') as f:
        checkpoint = json.load(f)
    val_llm_predictions = checkpoint['predictions']
    val_actual_prices = checkpoint['actual_prices']
    val_llm_results = checkpoint.get('llm_results', [])
    start_idx = checkpoint['last_idx'] + 1
    print(f"Resuming from index {start_idx}/{len(val_data)}")
else:
    val_llm_predictions = []
    val_actual_prices = []
    val_llm_results = []
    start_idx = 0
    print("Starting fresh LLM predictions on validation data...")

print(f"\nüîÑ Generating LLM predictions for {len(val_data)} VALIDATION samples...")

for idx in tqdm(range(start_idx, len(val_data)), desc="Validation LLM Inference"):
    item = val_data[idx]
    
    try:
        llm_result = llm_predict_stock_price(item['prompt'])
        
        # Store full LLM result
        val_llm_results.append(llm_result)
        
        if llm_result['predicted_close'] is not None and float(llm_result['predicted_close']) > 0:
            val_llm_predictions.append(llm_result['predicted_close'])
        else:
            # Fallback to 0.0 if prediction is invalid
            val_llm_predictions.append(0.0)
        
        response = json.loads(item['response'])
        val_actual_prices.append(response['predicted_close'])
        
        # time.sleep(0.5)
        
        if (idx + 1) % 50 == 0:
            checkpoint = {
                'predictions': val_llm_predictions,
                'actual_prices': val_actual_prices,
                'llm_results': val_llm_results,
                'last_idx': idx
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file_val, 'w') as f:
                json.dump(checkpoint, f, indent=2)
    
    except Exception as e:
        error_msg = str(e)
        
        if 'rate_limit' in error_msg.lower() or 'too many requests' in error_msg.lower():
            print(f"\n‚ùå RATE LIMIT HIT at index {idx}!")
            print(f"Saving checkpoint and stopping execution...")
            checkpoint = {
                'predictions': val_llm_predictions,
                'actual_prices': val_actual_prices,
                'llm_results': val_llm_results,
                'last_idx': idx - 1
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file_val, 'w') as f:
                json.dump(checkpoint, f, indent=2)
            print(f"‚úÖ Checkpoint saved to: {checkpoint_file_val}")
            print(f"üìä Progress: {idx}/{len(val_data)} samples completed")
            print(f"üí° Run this cell again to resume from where you left off.")
            break  # Stop execution
        else:
            print(f"\n‚ö†Ô∏è Error at index {idx}: {error_msg}")
            error_result = {"predicted_close": None, "likelihood": 0.5, "justification": f"Error: {error_msg}"}
            val_llm_results.append(error_result)
            # Use 0.0 as fallback for prediction
            val_llm_predictions.append(0.0)
            # Still append the actual price for comparison
            response = json.loads(item['response'])
            val_actual_prices.append(response['predicted_close'])

checkpoint = {
    'predictions': val_llm_predictions,
    'actual_prices': val_actual_prices,
    'llm_results': val_llm_results,
    'last_idx': len(val_llm_predictions) - 1,
    'completed': len(val_llm_predictions) == len(val_data)
}
with open(checkpoint_file_val, 'w') as f:
    json.dump(checkpoint, f, indent=2)

if len(val_llm_predictions) == len(val_data):
    print(f"\n‚úÖ Validation LLM predictions completed: {len(val_llm_predictions)} samples")
else:
    print(f"\n‚ö†Ô∏è Partial completion: {len(val_llm_predictions)}/{len(val_data)} samples")
print(f"Checkpoint saved to: {checkpoint_file_val}")

Loading existing validation checkpoint from ../results/llm_predictions_val_checkpoint.json
Resuming from index 1243/1243

üîÑ Generating LLM predictions for 1243 VALIDATION samples...


Validation LLM Inference: 0it [00:00, ?it/s]


‚úÖ Validation LLM predictions completed: 1243 samples
Checkpoint saved to: ../results/llm_predictions_val_checkpoint.json





### 4.3 Run LLM Inference on Test Data

Generate predictions for test data (used for final evaluation).

In [14]:
# Run LLM predictions on test data with checkpointing
import time

# Checkpoint file to save progress
checkpoint_file = '../results/llm_predictions_checkpoint.json'

# Load existing checkpoint if available
if os.path.exists(checkpoint_file):
    print(f"Loading existing checkpoint from {checkpoint_file}")
    with open(checkpoint_file, 'r') as f:
        checkpoint = json.load(f)
    llm_predictions = checkpoint['predictions']
    actual_prices = checkpoint['actual_prices']
    llm_results = checkpoint.get('llm_results', [])
    start_idx = checkpoint['last_idx'] + 1
    print(f"Resuming from index {start_idx}/{len(test_data)}")
else:
    llm_predictions = []
    actual_prices = []
    llm_results = []
    start_idx = 0
    print("Starting fresh LLM predictions...")

# Run LLM predictions with rate limiting and checkpointing
print(f"Generating LLM predictions for {len(test_data)} samples...")
print("This may take a while due to API rate limits...")

for idx in tqdm(range(start_idx, len(test_data)), desc="LLM Inference"):
    item = test_data[idx]
    
    try:
        # Get LLM prediction
        llm_result = llm_predict_stock_price(item['prompt'])
        
        # Store full LLM result
        llm_results.append(llm_result)
        
        # Extract prediction
        if llm_result['predicted_close'] is not None and float(llm_result['predicted_close']) > 0:
            llm_predictions.append(llm_result['predicted_close'])
        else:
            # Fallback: use 0.0 if LLM fails
            llm_predictions.append(0.0)
        
        # Get actual price from response
        response = json.loads(item['response'])
        actual_prices.append(response['predicted_close'])
        
        # Small delay to avoid rate limiting (adjust based on your API limits)
        #time.sleep(0.5)

        # Checkpoint every 50 samples
        if (idx + 1) % 50 == 0:
            checkpoint = {
                'predictions': llm_predictions,
                'actual_prices': actual_prices,
                'llm_results': llm_results,
                'last_idx': idx
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file, 'w') as f:
                json.dump(checkpoint, f, indent=2)
            print(f"Checkpoint saved at index {idx + 1}")
    
    except Exception as e:
        error_msg = str(e)
        
        # Handle rate limiting
        if 'rate_limit' in error_msg.lower() or 'too many requests' in error_msg.lower():
            print(f"‚ùå RATE LIMIT HIT at index {idx}!")
            print(f"Saving checkpoint and stopping execution...")
            
            # Save checkpoint
            checkpoint = {
                'predictions': llm_predictions,
                'actual_prices': actual_prices,
                'llm_results': llm_results,
                'last_idx': idx - 1
            }
            os.makedirs('../results', exist_ok=True)
            with open(checkpoint_file, 'w') as f:
                json.dump(checkpoint, f, indent=2)
            
            print(f"‚úÖ Checkpoint saved to: {checkpoint_file}")
            print(f"üìä Progress: {idx}/{len(test_data)} samples completed")
            print(f"üí° Run this cell again to resume from where you left off.")
            break  # Stop execution
        else:
            print(f"‚ö†Ô∏è Error at index {idx}: {error_msg}")
            # Store error result
            error_result = {"predicted_close": None, "likelihood": 0.5, "justification": f"Error: {error_msg}"}
            llm_results.append(error_result)
            # Use fallback
            # Use 0.0 as fallback for prediction
            llm_predictions.append(0.0)
            # Still append the actual price for comparison
            response = json.loads(item['response'])
            actual_prices.append(response['predicted_close'])

# Final save
checkpoint = {
    'predictions': llm_predictions,
    'actual_prices': actual_prices,
    'llm_results': llm_results,
    'last_idx': len(llm_predictions) - 1,
    'completed': len(llm_predictions) == len(test_data)
}
with open(checkpoint_file, 'w') as f:
    json.dump(checkpoint, f, indent=2)

# Merge with test_df
test_df['llm_prediction'] = llm_predictions
test_df['actual_price'] = actual_prices

if len(llm_results) == len(test_df):
    justifications = []
    likelihoods = []
    feature_rows = []
    for res in llm_results:
        res = res if isinstance(res, dict) else {}
        justification = res.get('justification', '')
        justifications.append(justification)
        likelihoods.append(safe_float(res.get('likelihood'), 0.5))
        feature_rows.append(extract_justification_features(justification))
else:
    justifications = [''] * len(test_df)
    likelihoods = [0.5] * len(test_df)
    feature_rows = [extract_justification_features('') for _ in range(len(test_df))]

if feature_rows:
    feature_keys = list(feature_rows[0].keys())
else:
    feature_keys = list(extract_justification_features('').keys())

test_df['llm_justification'] = justifications
test_df['llm_likelihood'] = likelihoods
for key in feature_keys:
    test_df[key] = [row[key] for row in feature_rows]

if len(llm_predictions) == len(test_data):
    print(f"‚úÖ LLM predictions completed: {len(llm_predictions)} samples")
else:
    print(f"‚ö†Ô∏è Partial completion: {len(llm_predictions)}/{len(test_data)} samples")
print(f"Checkpoint saved to: {checkpoint_file}")
print("Sample predictions:")


Loading existing checkpoint from ../results/llm_predictions_checkpoint.json
Resuming from index 2050/2477
Generating LLM predictions for 2477 samples...
This may take a while due to API rate limits...


LLM Inference:  12%|‚ñà‚ñè        | 50/427 [10:32<1:38:25, 15.66s/it]

Checkpoint saved at index 2100


LLM Inference:  15%|‚ñà‚ñç        | 62/427 [13:21<1:32:36, 15.22s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 241 (char 240)


LLM Inference:  23%|‚ñà‚ñà‚ñé       | 100/427 [22:14<1:18:06, 14.33s/it]

Checkpoint saved at index 2150


LLM Inference:  26%|‚ñà‚ñà‚ñã       | 113/427 [25:13<1:20:43, 15.43s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 237 (char 236)


LLM Inference:  28%|‚ñà‚ñà‚ñä       | 119/427 [26:55<1:25:45, 16.71s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Extra data: line 3 column 1 (char 66)


LLM Inference:  29%|‚ñà‚ñà‚ñâ       | 123/427 [27:45<1:15:03, 14.81s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 307 (char 306)


LLM Inference:  35%|‚ñà‚ñà‚ñà‚ñå      | 150/427 [34:07<51:10, 11.09s/it]  

Checkpoint saved at index 2200


LLM Inference:  41%|‚ñà‚ñà‚ñà‚ñà      | 175/427 [40:41<1:12:30, 17.26s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 192 (char 191)


LLM Inference:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 200/427 [46:16<1:04:08, 16.95s/it]

Checkpoint saved at index 2250


LLM Inference:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 250/427 [57:45<36:06, 12.24s/it]  

Checkpoint saved at index 2300


LLM Inference:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 285/427 [1:06:04<36:08, 15.27s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 237 (char 236)


LLM Inference:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 300/427 [1:09:04<24:32, 11.60s/it]

Checkpoint saved at index 2350


LLM Inference:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 339/427 [1:18:46<21:25, 14.60s/it]

JSON parse error (Attempt 1/3), attempting manual extraction: Invalid control character at: line 1 column 196 (char 195)


LLM Inference:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 350/427 [1:21:19<16:33, 12.90s/it]

Checkpoint saved at index 2400


LLM Inference:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 380/427 [1:28:28<10:58, 14.01s/it]

Error in HF endpoint prediction (Attempt 1/3): HTTPSConnectionPool(host='o988k6zvcj6ifd2u.us-east-1.aws.endpoints.huggingface.cloud', port=443): Read timed out. (read timeout=30)


LLM Inference:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 400/427 [1:33:33<06:13, 13.84s/it]

Checkpoint saved at index 2450


LLM Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 427/427 [1:39:30<00:00, 13.98s/it]

‚úÖ LLM predictions completed: 2477 samples
Checkpoint saved to: ../results/llm_predictions_checkpoint.json
Sample predictions:





### 4.4 Check for Failed Predictions in Checkpoints

Before training PPO, let's verify all predictions succeeded and fix any failures.

In [16]:
# Check for failed predictions in all checkpoint files
import json
import os

def check_failed_predictions(checkpoint_file, data_name):
    """Check for failed/None or zero predictions in checkpoint"""
    if not os.path.exists(checkpoint_file):
        print(f"‚ùå {data_name} checkpoint not found: {checkpoint_file}")
        return None
    
    with open(checkpoint_file, 'r') as f:
        checkpoint = json.load(f)
    
    predictions = checkpoint.get('predictions', [])
    llm_results = checkpoint.get('llm_results', [])
    
    # Find indices with failed predictions (None or 0)
    failed_indices = []
    for idx, (pred, result) in enumerate(zip(predictions, llm_results)):
        # A prediction is considered failed if it's None, 0, or the result object indicates failure.
        is_pred_zero = False
        try:
            if pred is not None and float(pred) == 0.0:
                is_pred_zero = True
        except (ValueError, TypeError):
            pass # Not a number, will be caught by `is None` check if problematic

        if pred is None or is_pred_zero or (isinstance(result, dict) and result.get('predicted_close') is None):
            failed_indices.append(idx)
    
    print(f"\n{'='*80}")
    print(f"üìä {data_name.upper()} CHECKPOINT ANALYSIS")
    print(f"{'='*80}")
    print(f"Total predictions: {len(predictions)}")
    print(f"Failed/Zero predictions: {len(failed_indices)}")
    
    if len(predictions) > 0:
        success_rate = ((len(predictions) - len(failed_indices)) / len(predictions) * 100)
        print(f"Success rate: {success_rate:.2f}%")
    
    if failed_indices:
        print(f"\n‚ö†Ô∏è Failed/Zero prediction indices (first 20): {failed_indices[:20]}")
        if len(failed_indices) > 20:
            print(f"   ... and {len(failed_indices) - 20} more")
    else:
        print(f"\n‚úÖ All predictions are valid (not None and not zero)!")
    
    return {
        'checkpoint_file': checkpoint_file,
        'total': len(predictions),
        'failed': len(failed_indices),
        'failed_indices': failed_indices,
        'checkpoint': checkpoint
    }

# Check all three checkpoints
print("üîç CHECKING ALL CHECKPOINT FILES FOR FAILED/ZERO PREDICTIONS")
print("="*80)

train_check = check_failed_predictions(
    '../results/llm_predictions_train_checkpoint.json', 
    'Training'
)

val_check = check_failed_predictions(
    '../results/llm_predictions_val_checkpoint.json', 
    'Validation'
)

test_check = check_failed_predictions(
    '../results/llm_predictions_checkpoint.json', 
    'Test'
)

# Summary
print(f"\n{'='*80}")
print(f"üìà OVERALL SUMMARY")
print(f"{'='*80}")
if train_check:
    print(f"Training:   {train_check['failed']}/{train_check['total']} failed or zero")
if val_check:
    print(f"Validation: {val_check['failed']}/{val_check['total']} failed or zero")
if test_check:
    print(f"Test:       {test_check['failed']}/{test_check['total']} failed or zero")

total_failed = 0
total_samples = 0
if train_check:
    total_failed += train_check['failed']
    total_samples += train_check['total']
if val_check:
    total_failed += val_check['failed']
    total_samples += val_check['total']
if test_check:
    total_failed += test_check['failed']
    total_samples += test_check['total']

if total_samples > 0:
    print(f"\nTotal failed or zero: {total_failed}/{total_samples} ({(total_failed/total_samples*100):.2f}%)")
print(f"\nüí° If any predictions failed, run the next cell to fix them.")


üîç CHECKING ALL CHECKPOINT FILES FOR FAILED/ZERO PREDICTIONS

üìä TRAINING CHECKPOINT ANALYSIS
Total predictions: 8698
Failed/Zero predictions: 0
Success rate: 100.00%

‚úÖ All predictions are valid (not None and not zero)!

üìä VALIDATION CHECKPOINT ANALYSIS
Total predictions: 1243
Failed/Zero predictions: 0
Success rate: 100.00%

‚úÖ All predictions are valid (not None and not zero)!

üìä TEST CHECKPOINT ANALYSIS
Total predictions: 2477
Failed/Zero predictions: 123
Success rate: 95.03%

‚ö†Ô∏è Failed/Zero prediction indices (first 20): [1, 8, 14, 20, 61, 80, 91, 95, 100, 105, 173, 181, 194, 233, 246, 250, 251, 257, 259, 277]
   ... and 103 more

üìà OVERALL SUMMARY
Training:   0/8698 failed or zero
Validation: 0/1243 failed or zero
Test:       123/2477 failed or zero

Total failed or zero: 123/12418 (0.99%)

üí° If any predictions failed, run the next cell to fix them.


In [24]:
# Re-run inference ONLY for failed predictions
def fix_failed_predictions(checkpoint_info, original_data, data_name):
    """Re-run inference for failed predictions only"""
    if not checkpoint_info or not checkpoint_info['failed_indices']:
        print(f"‚úÖ {data_name}: No failed predictions to fix!")
        return checkpoint_info['checkpoint']
    
    print(f"\n{'='*80}")
    print(f"üîÑ FIXING FAILED PREDICTIONS FOR {data_name.upper()}")
    print(f"{'='*80}")
    print(f"Failed predictions to fix: {len(checkpoint_info['failed_indices'])}")
    
    checkpoint = checkpoint_info['checkpoint']
    predictions = checkpoint['predictions']
    actual_prices = checkpoint['actual_prices']
    llm_results = checkpoint['llm_results']
    
    fixed_count = 0
    still_failed = []
    
    for idx in tqdm(checkpoint_info['failed_indices'], desc=f"Fixing {data_name}"):
        try:
            item = original_data[idx]
            
            # Re-run LLM prediction
            llm_result = llm_predict_stock_price(item['prompt'])
            
            # Update results
            llm_results[idx] = llm_result
            
            # Update prediction
            if llm_result['predicted_close'] is not None and float(llm_result['predicted_close']) > 0:
                predictions[idx] = llm_result['predicted_close']
                fixed_count += 1
            else:
                # Still failed, use fallback
                # response = json.loads(item['response'])
                # predictions[idx] = response['predicted_close']
                # still_failed.append(idx)
                predictions[idx] = 0.0
                still_failed.append(idx)
            
            # Small delay
            time.sleep(0.3)
            
        except Exception as e:
            print(f"\n‚ö†Ô∏è Error fixing index {idx}: {e}")
            still_failed.append(idx)
            try:
                response = json.loads(original_data[idx]['response'])
                predictions[idx] = response['predicted_close']
            except:
                pass
    
    # Save updated checkpoint
    checkpoint['predictions'] = predictions
    checkpoint['llm_results'] = llm_results
    checkpoint['last_idx'] = len(predictions) - 1
    checkpoint['completed'] = True
    with open(checkpoint_info['checkpoint_file'], 'w') as f:
        json.dump(checkpoint, f, indent=2)
    
    print(f"\n‚úÖ Fixed {fixed_count}/{len(checkpoint_info['failed_indices'])} predictions")
    if still_failed:
        print(f"‚ö†Ô∏è Still failed: {len(still_failed)} predictions (using fallback values)")
        print(f"   Indices: {still_failed[:10]}")
    print(f"üíæ Updated checkpoint saved to: {checkpoint_info['checkpoint_file']}")
    
    return checkpoint

# Fix training data
if train_check and train_check['failed'] > 0:
    print("\n" + "="*80)
    print("FIXING TRAINING DATA")
    print("="*80)
    train_checkpoint_fixed = fix_failed_predictions(train_check, train_data, "Training")
    # Update global variables
    train_llm_predictions = train_checkpoint_fixed['predictions']
    train_actual_prices = train_checkpoint_fixed['actual_prices']
    train_llm_results = train_checkpoint_fixed['llm_results']
    print(f"‚úÖ Training data updated: {len(train_llm_predictions)} predictions")

# Fix validation data
if val_check and val_check['failed'] > 0:
    print("\n" + "="*80)
    print("FIXING VALIDATION DATA")
    print("="*80)
    val_checkpoint_fixed = fix_failed_predictions(val_check, val_data, "Validation")
    # Update global variables
    val_llm_predictions = val_checkpoint_fixed['predictions']
    val_actual_prices = val_checkpoint_fixed['actual_prices']
    val_llm_results = val_checkpoint_fixed['llm_results']
    print(f"‚úÖ Validation data updated: {len(val_llm_predictions)} predictions")

# Fix test data
if test_check and test_check['failed'] > 0:
    print("\n" + "="*80)
    print("FIXING TEST DATA")
    print("="*80)
    test_checkpoint_fixed = fix_failed_predictions(test_check, test_data, "Test")
    # Update global variables
    llm_predictions = test_checkpoint_fixed['predictions']
    actual_prices = test_checkpoint_fixed['actual_prices']
    llm_results = test_checkpoint_fixed['llm_results']
    print(f"‚úÖ Test data updated: {len(llm_predictions)} predictions")
    
    test_df['actual_price'] = actual_prices
    test_df['llm_prediction'] = llm_predictions
    test_df['actual_price'] = actual_prices
    # Update test_df
    # Update justification features
    justifications = []
    likelihoods = []
    feature_rows = []
    for res in llm_results:
        res = res if isinstance(res, dict) else {}
        justification = res.get('justification', '')
        justifications.append(justification)
        likelihoods.append(safe_float(res.get('likelihood'), 0.5))
        feature_rows.append(extract_justification_features(justification))
    test_df['llm_likelihood'] = likelihoods
    test_df['llm_justification'] = justifications
    test_df['llm_likelihood'] = likelihoods
    
    feature_keys = list(feature_rows[0].keys()) if feature_rows else []
    for key in feature_keys:
        test_df[key] = [row[key] for row in feature_rows]

print("\n" + "="*80)
print("‚úÖ ALL FAILED PREDICTIONS HAVE BEEN PROCESSED!")

print("="*80)

print("You can now proceed with PPO training.")


FIXING TEST DATA

üîÑ FIXING FAILED PREDICTIONS FOR TEST
Failed predictions to fix: 123


Fixing Test:   0%|          | 0/123 [00:00<?, ?it/s]

[{"generated_text":" {\"predicted_close\": 0.0, \"likelihood\": 0.0, \"justification\": \"\"}\n\n"}]
 {"predicted_close": 0.0, "likelihood": 0.0, "justification": ""}




Fixing Test:   1%|          | 1/123 [00:02<06:02,  2.97s/it]

[{"generated_text":" {\"predicted_close\": 0.0, \"likelihood\": 0.0, \"justification\": \"\"}\n\n"}]
 {"predicted_close": 0.0, "likelihood": 0.0, "justification": ""}




Fixing Test:   2%|‚ñè         | 2/123 [00:05<05:55,  2.94s/it]

[{"generated_text":" {\"predicted_close\": 0.0, \"likelihood\": 0.0, \"justification\": \"\"}\n\n"}]
 {"predicted_close": 0.0, "likelihood": 0.0, "justification": ""}




Fixing Test:   2%|‚ñè         | 3/123 [00:08<05:54,  2.95s/it]

[{"generated_text":" {\"predicted_close\": 0.0, \"likelihood\": 0.0, \"justification\": \"\"}\n\n"}]
 {"predicted_close": 0.0, "likelihood": 0.0, "justification": ""}




Fixing Test:   3%|‚ñé         | 4/123 [00:13<06:29,  3.27s/it]



KeyboardInterrupt: 