In [2]:
# SECTION 1: NO RETRIEVAL BASELINE EXPERIMENT
# Pure LLM prediction without any historical context retrieval

import gc
import json
import logging
import os
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configure the root logger at INFO level so the progress and result messages
# emitted below are actually shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)

class NoRetrievalExperiment:
    """
    Baseline experiment: LLM prediction with only query context.

    Tests pure LLM financial reasoning without historical examples. Each test
    query is rendered into a short prompt that contains only the query stock's
    own recent dates and adjusted closes; the LLM answers 'rise' or 'fall' and
    the answer is scored against the ground-truth movement.

    Attributes:
        test_queries: Query records loaded from ``test_queries_file``.
        ground_truth: Ground-truth records loaded from ``ground_truth_file``.
        gt_lookup: Ground-truth records indexed by their ``query_id``.
        llm_configs: Short LLM name -> Hugging Face model configuration.
        current_llm, current_tokenizer, current_llm_name: The currently loaded
            model, its tokenizer and its short name; all ``None`` when no
            model is loaded.
    """

    # Column order of the per-query results table. Passing it to the
    # DataFrame constructor keeps an empty result set well-formed, so metric
    # code can still index columns such as 'reference'.
    RESULT_COLUMNS = (
        'llm_name', 'query_id', 'query_stock', 'query_date', 'method',
        'prompt', 'llm_response', 'prediction', 'reference', 'correct',
        'candidate_count', 'retrieval_context',
    )

    # First whole-word occurrence of a rise/fall word (with the inflections
    # the old substring test also accepted, plus "rising"). Group 1 is set
    # for a rise word, group 2 for a fall word.
    _MOVEMENT_PATTERN = re.compile(
        r"\b(?:(ris(?:e[sn]?|ing))|(fall(?:s|en|ing)?))\b"
    )

    def __init__(self, test_queries_file: str, ground_truth_file: str):
        """
        Load the query and ground-truth files and register the LLMs to test.

        Args:
            test_queries_file: Path to the JSON Lines file of test queries.
            ground_truth_file: Path to the JSON Lines file of ground truth.

        Raises:
            OSError: If either file cannot be opened.
            KeyError: If a ground-truth record has no ``query_id``.
        """
        self.test_queries_file = test_queries_file
        self.ground_truth_file = ground_truth_file
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load data
        self.test_queries = self._load_json_file(test_queries_file)
        self.ground_truth = self._load_json_file(ground_truth_file)
        self.gt_lookup = {gt['query_id']: gt for gt in self.ground_truth}

        logger.info(f"Loaded {len(self.test_queries)} test queries")
        logger.info(f"Loaded {len(self.ground_truth)} ground truth entries")

        # LLM configurations for testing
        self.llm_configs = {
            'StockLLM': {
                'model_name': 'ElsaShaw/StockLLM',
                'description': 'Specialized financial LLM'
            },
            'Llama3.2-3B': {
                'model_name': 'meta-llama/Llama-3.2-3B-Instruct',
                'description': 'Medium general-purpose LLM'
            },
            'Qwen2.5-1.5B': {
                'model_name': 'Qwen/Qwen2.5-1.5B-Instruct',
                'description': 'Qwen instruction-following model'
            },
            'Phi3-Mini': {
                'model_name': 'microsoft/Phi-3-mini-4k-instruct',
                'description': 'Microsoft compact LLM'
            }
        }

        # Current LLM state
        self.current_llm = None
        self.current_tokenizer = None
        self.current_llm_name = None

    def _load_json_file(self, file_path: str) -> List[Dict]:
        """
        Load a JSON Lines file (one JSON object per line).

        Blank lines are ignored. Lines that are not valid JSON, or that do not
        decode to a JSON object, are skipped and reported in one warning
        instead of being dropped silently.

        Args:
            file_path: Path of the file to read.

        Returns:
            The decoded records, in file order.
        """
        data = []
        skipped = 0
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                if isinstance(record, dict):
                    data.append(record)
                else:
                    skipped += 1
        if skipped:
            logger.warning(f"Skipped {skipped} malformed line(s) in {file_path}")
        return data

    def _unload_llm(self) -> None:
        """Drop the current model/tokenizer and release cached GPU memory."""
        # Rebind to None rather than `del`: deleting the attributes would make
        # every later `self.current_llm is None` check raise AttributeError
        # if the next load fails.
        self.current_llm = None
        self.current_tokenizer = None
        self.current_llm_name = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def load_llm(self, llm_name: str) -> None:
        """
        Load the specified LLM for testing, replacing any loaded model.

        The instance state is only updated after both tokenizer and model
        loaded successfully; on failure no model is left loaded.

        Args:
            llm_name: A key of ``self.llm_configs``.

        Raises:
            ValueError: If ``llm_name`` is not a configured LLM.
            Exception: Whatever the tokenizer/model loading raised (re-raised
                after logging).
        """
        if llm_name not in self.llm_configs:
            raise ValueError(f"Unknown LLM: {llm_name}")

        # Clear previous model from memory
        self._unload_llm()

        config = self.llm_configs[llm_name]
        model_name = config['model_name']

        logger.info(f"Loading {llm_name}: {model_name}")

        use_cuda = self.device.type == 'cuda'
        try:
            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            # Load model with appropriate settings
            # NOTE(review): trust_remote_code=True executes code shipped with
            # the checkpoint; only keep it for repositories you trust.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
                device_map="auto" if use_cuda else None,
                low_cpu_mem_usage=True,
                trust_remote_code=True  # For some models like Phi
            )
        except Exception as e:
            logger.error(f"❌ Failed to load {llm_name}: {e}")
            raise

        self.current_tokenizer = tokenizer
        self.current_llm = model
        self.current_llm_name = llm_name
        logger.info(f"✅ Successfully loaded {llm_name}")

    def generate_no_retrieval_prompt(self, query: Dict) -> str:
        """
        Generate minimal prompt with only query stock context.

        No historical examples or retrieved candidates are included.

        Args:
            query: Query record; reads ``query_stock``, ``query_date``,
                ``recent_date_list`` and ``adjusted_close_list`` (missing
                keys fall back to ``'Unknown'`` / empty lists).

        Returns:
            The prompt text asking the model to fill in 'rise' or 'fall'.
        """
        query_stock = query.get('query_stock', 'Unknown')
        query_date = query.get('query_date', 'Unknown')

        # Include only the essential query information
        context = {
            "query_stock": query_stock,
            "recent_date_list": query.get('recent_date_list', []),
            "adjusted_close_list": query.get('adjusted_close_list', [])
        }

        # Simple, direct prompt without historical context
        prompt = (
            f"Stock Context: {str(context)}\n\n"
            f"Based on the stock price history above, predict the stock movement by filling in the [blank] with 'rise' or 'fall'. "
            f"Just fill in the blank, do not explain.\n\n"
            f"Query: On {query_date}, the movement of ${query_stock} is [blank]."
        )

        return prompt

    def ask_llm(self, prompt: str) -> str:
        """
        Get prediction text from the current LLM using greedy decoding.

        Args:
            prompt: The user prompt produced by
                ``generate_no_retrieval_prompt``.

        Returns:
            The newly generated text (at most 10 tokens), stripped.

        Raises:
            RuntimeError: If no LLM is loaded.
        """
        if self.current_llm is None:
            raise RuntimeError("No LLM loaded")

        # Format prompt based on LLM architecture
        if 'Llama' in self.current_llm_name:
            messages = [
                {"role": "system", "content": "You are a financial analyst. Predict stock movements accurately using only the given information."},
                {"role": "user", "content": prompt}
            ]

            formatted_prompt = self.current_tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # The rendered chat template already contains the special tokens
            # (including BOS); adding them again would duplicate BOS.
            add_special_tokens = False
        else:
            # For other models, use simple system/user format
            formatted_prompt = f"System: You are a financial analyst.\nUser: {prompt}\nAssistant:"
            add_special_tokens = True

        # Tokenize
        # NOTE(review): truncation cuts from the tokenizer's truncation side
        # (right by default), which would drop the final "Query:" line for
        # prompts longer than 1024 tokens - confirm prompts stay below that.
        encoded = self.current_tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024,
            add_special_tokens=add_special_tokens,
        )
        input_ids = encoded["input_ids"].to(self.device)
        # Pass the mask explicitly: pad and EOS share an id here, so it
        # cannot be inferred reliably from the input ids.
        attention_mask = encoded["attention_mask"].to(self.device)

        # Generate with conservative settings. No temperature is passed:
        # it is ignored when do_sample=False and only triggers a warning.
        with torch.no_grad():
            outputs = self.current_llm.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,  # Short response expected
                do_sample=False,    # Deterministic
                pad_token_id=self.current_tokenizer.eos_token_id,
                eos_token_id=self.current_tokenizer.eos_token_id
            )

        # Decode only the tokens generated after the prompt
        generated_ids = outputs[0][input_ids.shape[1]:]
        response = self.current_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        return response

    def extract_prediction(self, response: str, reference: str) -> Tuple[str, bool]:
        """
        Extract rise/fall prediction from LLM response.

        The first whole-word rise/fall term in the response decides the
        prediction, so unrelated words such as "surprise" are not mistaken
        for "rise" and "fall, not rise" is read as 'fall'.

        Args:
            response: Raw text generated by the LLM.
            reference: Ground-truth movement label.

        Returns:
            ``(prediction, correct)`` where ``prediction`` is 'rise', 'fall'
            or 'freeze' (unclear response) and ``correct`` tells whether it
            equals the lower-cased, stripped reference.
        """
        response_clean = (response or '').lower()
        reference_clean = (reference or '').lower().strip()

        # Extract prediction
        match = self._MOVEMENT_PATTERN.search(response_clean)
        if match is None:
            prediction = 'freeze'  # Unclear response
        elif match.group(1):
            prediction = 'rise'
        else:
            prediction = 'fall'

        # Check correctness
        correct = (prediction == reference_clean)

        return prediction, correct

    @staticmethod
    def _compute_metrics(df: pd.DataFrame) -> Dict:
        """Return overall and per-class (by reference label) accuracy of a results table."""
        if len(df) == 0:
            return {
                'total': 0, 'accuracy': 0,
                'rise_accuracy': 0, 'fall_accuracy': 0,
                'rise_samples': 0, 'fall_samples': 0,
            }

        rise_df = df[df['reference'] == 'rise']
        fall_df = df[df['reference'] == 'fall']
        return {
            'total': len(df),
            'accuracy': float(df['correct'].mean()),
            'rise_accuracy': float(rise_df['correct'].mean()) if len(rise_df) > 0 else 0,
            'fall_accuracy': float(fall_df['correct'].mean()) if len(fall_df) > 0 else 0,
            'rise_samples': len(rise_df),
            'fall_samples': len(fall_df),
        }

    def run_single_llm_experiment(self, llm_name: str, output_dir: str) -> pd.DataFrame:
        """
        Run no-retrieval experiment for a single LLM.

        Queries without ground truth, without a movement label, or labelled
        'freeze' are skipped. A query whose prompt/generation fails is logged
        and skipped. Detailed results are written to
        ``<output_dir>/<llm_name>_no_retrieval_results.csv``.

        Args:
            llm_name: A key of ``self.llm_configs``.
            output_dir: Directory for the results CSV (created if missing).

        Returns:
            One row per scored query, with the columns in ``RESULT_COLUMNS``
            (also when no query could be scored).

        Raises:
            ValueError: If ``llm_name`` is unknown.
            Exception: If the model fails to load.
        """
        logger.info(f" Starting NO RETRIEVAL experiment with {llm_name}")

        # Load the specified LLM
        self.load_llm(llm_name)

        results = []
        processed_count = 0
        correct_count = 0
        total_queries = len(self.test_queries)

        for i, query in enumerate(self.test_queries):
            query_id = query.get('query_id')
            ground_truth = self.gt_lookup.get(query_id)

            # Skip if no ground truth
            if not ground_truth:
                continue

            # Normalise once so the freeze filter, scoring and the per-class
            # metrics all agree on the label's spelling.
            reference_answer = str(ground_truth.get('actual_movement') or '').lower().strip()

            # Skip unlabeled and freeze movements (focus on rise/fall)
            if not reference_answer or reference_answer == 'freeze':
                continue

            # Progress logging
            if (i + 1) % 50 == 0:
                accuracy = correct_count / processed_count if processed_count > 0 else 0
                logger.info(f"Progress: {i+1}/{total_queries} | Accuracy so far: {accuracy:.3f}")

            try:
                # Generate prompt (no retrieval context)
                prompt = self.generate_no_retrieval_prompt(query)

                # Get LLM prediction
                llm_response = self.ask_llm(prompt)

                # Extract and validate prediction
                prediction, correct = self.extract_prediction(llm_response, reference_answer)
            except Exception as e:
                logger.error(f"Error processing query {query_id}: {e}")
                continue

            # Track accuracy
            processed_count += 1
            if correct:
                correct_count += 1

            # Store result
            results.append({
                'llm_name': llm_name,
                'query_id': query_id,
                'query_stock': query.get('query_stock', ''),
                'query_date': query.get('query_date', ''),
                'method': 'no_retrieval',
                'prompt': prompt,
                'llm_response': llm_response,
                'prediction': prediction,
                'reference': reference_answer,
                'correct': correct,
                'candidate_count': 0,  # No candidates used
                'retrieval_context': 'none'
            })

        # Create results DataFrame (explicit columns keep an empty run valid)
        df = pd.DataFrame(results, columns=list(self.RESULT_COLUMNS))

        # Save detailed results
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f'{llm_name}_no_retrieval_results.csv')
        df.to_csv(output_file, index=False)

        # Calculate metrics
        metrics = self._compute_metrics(df)

        # Log results
        logger.info("="*60)
        logger.info(f"NO RETRIEVAL RESULTS - {llm_name}")
        logger.info("="*60)
        logger.info(f"Total Predictions: {metrics['total']}")
        logger.info(f"Overall Accuracy: {metrics['accuracy']:.4f}")
        logger.info(f"Rise Accuracy: {metrics['rise_accuracy']:.4f} ({metrics['rise_samples']} samples)")
        logger.info(f"Fall Accuracy: {metrics['fall_accuracy']:.4f} ({metrics['fall_samples']} samples)")
        logger.info(f"Results saved to: {output_file}")
        logger.info("="*60)

        return df

    def run_multi_llm_experiment(self, llm_list: List[str], output_dir: str) -> pd.DataFrame:
        """
        Run no-retrieval experiment across multiple LLMs.

        An LLM that fails (e.g. cannot be loaded) or yields no predictions is
        logged and left out; the remaining LLMs still run. The combined
        table is written to
        ``<output_dir>/all_llms_no_retrieval_combined.csv`` and a comparison
        report is generated.

        Args:
            llm_list: Keys of ``self.llm_configs`` to evaluate, in order.
            output_dir: Directory for all output files.

        Returns:
            The concatenated per-query results of all successful LLMs, or an
            empty DataFrame when none succeeded.
        """
        logger.info(f" Starting MULTI-LLM NO RETRIEVAL experiment")
        logger.info(f"Testing LLMs: {llm_list}")

        all_results = []

        for llm_name in llm_list:
            logger.info(f"\n{'='*60}")
            logger.info(f"TESTING LLM: {llm_name}")
            logger.info(f"{'='*60}")

            try:
                # Run experiment for this LLM
                llm_results = self.run_single_llm_experiment(llm_name, output_dir)
            except Exception as e:
                logger.exception(f" Failed {llm_name}: {e}")
                continue

            if llm_results.empty:
                # Nothing to combine or compare for this model.
                logger.warning(f" {llm_name} produced no predictions")
                continue

            all_results.append(llm_results)
            logger.info(f" Completed {llm_name}")

        # Combine all results
        if not all_results:
            logger.error(" No successful experiments!")
            return pd.DataFrame()

        combined_df = pd.concat(all_results, ignore_index=True)

        # Save combined results
        os.makedirs(output_dir, exist_ok=True)
        combined_file = os.path.join(output_dir, 'all_llms_no_retrieval_combined.csv')
        combined_df.to_csv(combined_file, index=False)

        # Generate comparison report
        self._generate_comparison_report(combined_df, output_dir)

        logger.info(f"\n Multi-LLM experiment completed!")
        logger.info(f"Combined results saved to: {combined_file}")

        return combined_df

    def _generate_comparison_report(self, combined_df: pd.DataFrame, output_dir: str):
        """
        Generate comparison report across LLMs.

        Writes one summary row per LLM to
        ``<output_dir>/no_retrieval_llm_comparison.csv`` and logs a summary.

        Args:
            combined_df: Per-query results of one or more LLMs.
            output_dir: Directory for the comparison CSV (created if missing).
        """
        if combined_df.empty:
            logger.warning("No results available for the comparison report")
            return

        comparison_results = []

        for llm_name in combined_df['llm_name'].unique():
            llm_df = combined_df[combined_df['llm_name'] == llm_name]

            # Calculate metrics
            metrics = self._compute_metrics(llm_df)

            # Prediction distribution
            pred_dist = llm_df['prediction'].value_counts()

            comparison_results.append({
                'llm_name': llm_name,
                'total_predictions': metrics['total'],
                'overall_accuracy': metrics['accuracy'],
                'rise_accuracy': metrics['rise_accuracy'],
                'fall_accuracy': metrics['fall_accuracy'],
                # Despite their names, these two count rows whose *reference*
                # is rise/fall; the column names are kept for compatibility
                # with existing result files.
                'rise_predictions': metrics['rise_samples'],
                'fall_predictions': metrics['fall_samples'],
                'predicted_rise': int(pred_dist.get('rise', 0)),
                'predicted_fall': int(pred_dist.get('fall', 0)),
                'predicted_freeze': int(pred_dist.get('freeze', 0)),
                'method': 'no_retrieval'
            })

        # Save comparison
        os.makedirs(output_dir, exist_ok=True)
        comparison_df = pd.DataFrame(comparison_results)
        comparison_file = os.path.join(output_dir, 'no_retrieval_llm_comparison.csv')
        comparison_df.to_csv(comparison_file, index=False)

        # Print summary
        logger.info("\n" + "="*80)
        logger.info("NO RETRIEVAL - LLM COMPARISON SUMMARY")
        logger.info("="*80)

        for result in comparison_results:
            logger.info(f"{result['llm_name']:15} | Accuracy: {result['overall_accuracy']:.4f} | "
                       f"Rise: {result['rise_accuracy']:.3f} | Fall: {result['fall_accuracy']:.3f}")

        logger.info(f"\nDetailed comparison saved to: {comparison_file}")

# USAGE EXAMPLE
def run_no_retrieval_experiment(
    test_queries_file: str = '/root/nfs/AJ FinRag/Evaluation Results/Test Queries/test_queries_rise_fall_only.json',
    ground_truth_file: str = '/root/nfs/AJ FinRag/Evaluation Results/Test Queries/ground_truth_rise_fall_only.json',
    output_dir: str = 'no_retrieval_experiments',
    test_llms: Optional[List[str]] = None,
) -> Optional[pd.DataFrame]:
    """
    Main function to run the no retrieval experiment.

    All arguments are optional; calling the function without arguments uses
    the original hard-coded configuration.

    Args:
        test_queries_file: Path to the test-queries file.
        ground_truth_file: Path to the ground-truth file.
        output_dir: Directory that receives the result CSV files.
        test_llms: Short names of the LLMs to evaluate; defaults to
            StockLLM, Qwen2.5-1.5B and Llama3.2-3B.

    Returns:
        The combined results table, or ``None`` if the experiment raised.
    """
    # LLMs to test (start with fewer for testing)
    if test_llms is None:
        test_llms = [
            'StockLLM',      # Specialized financial model
            'Qwen2.5-1.5B',
            'Llama3.2-3B'    # Medium general model
        ]

    try:
        # Initialize experiment
        experiment = NoRetrievalExperiment(test_queries_file, ground_truth_file)

        # Run multi-LLM experiment
        results = experiment.run_multi_llm_experiment(test_llms, output_dir)

        if len(results) > 0:
            # Count the LLMs that actually produced results, not the number
            # requested: failed models are absent from the combined table.
            llm_count = results['llm_name'].nunique()
            logger.info(" NO RETRIEVAL EXPERIMENT COMPLETED SUCCESSFULLY!")
            logger.info(f" Total results: {len(results)} predictions across {llm_count} LLMs")
        else:
            logger.error(" Experiment failed - no results generated")

        return results

    except Exception as e:
        # Top-level boundary: log the message together with the traceback.
        logger.exception(f" Experiment failed: {e}")
        return None

# Run the experiment only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    run_no_retrieval_experiment()

INFO:__main__:Loaded 2642 test queries
INFO:__main__:Loaded 2642 ground truth entries
INFO:__main__: Starting MULTI-LLM NO RETRIEVAL experiment
INFO:__main__:Testing LLMs: ['StockLLM', 'Qwen2.5-1.5B', 'Llama3.2-3B']
INFO:__main__:
INFO:__main__:TESTING LLM: StockLLM
INFO:__main__: Starting NO RETRIEVAL experiment with StockLLM
INFO:__main__:Loading StockLLM: ElsaShaw/StockLLM
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:✅ Successfully loaded StockLLM
INFO:__main__:Progress: 50/2642 | Accuracy so far: 0.510
INFO:__main__:Progress: 100/2642 | Accuracy so far: 0.455
INFO:__main__:Progress: 150/2642 | Accuracy so far: 0.470
INFO:__main__:Progress: 200/2642 | Accuracy so far: 0.472
INFO:__main__:Progress: 250/2642 | Accuracy so far: 0.486
INFO:__main__:Progress: 300/2642 | Accuracy so far: 0.468
INFO:__main__:Progress: 350/2642 | Accuracy so far: 0.481
INFO:__main__:Progress: 400/2642 | Accuracy so far: 0.489
INFO:__main__:Progress: 450/2642 | Accuracy so far: 0.494
INFO:__main__:Progress: 500/2642 | Accuracy so far: 0.505
INFO:__main__:Progress: 550/2642 | Accuracy so far: 0.501
INFO:__main__:Progress: 600/2642 | Accuracy so far: 0.496
INFO:__main__:Progress: 650/2642 | Accuracy so far: 0.495
INFO:__main__:Progress: 700/2642 | Accuracy so far: 0.496
INFO:__main__:Progress: 750/2642 | Accuracy so far: 0.497
INFO:__main__:Progress: 800/2642 | Accuracy so far: 0.498
INFO:__main__:Progress: 850/

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:✅ Successfully loaded Llama3.2-3B
INFO:__main__:Progress: 50/2642 | Accuracy so far: 0.531
INFO:__main__:Progress: 100/2642 | Accuracy so far: 0.535
INFO:__main__:Progress: 150/2642 | Accuracy so far: 0.497
INFO:__main__:Progress: 200/2642 | Accuracy so far: 0.508
INFO:__main__:Progress: 250/2642 | Accuracy so far: 0.534
INFO:__main__:Progress: 300/2642 | Accuracy so far: 0.545
INFO:__main__:Progress: 350/2642 | Accuracy so far: 0.527
INFO:__main__:Progress: 400/2642 | Accuracy so far: 0.526
INFO:__main__:Progress: 450/2642 | Accuracy so far: 0.526
INFO:__main__:Progress: 500/2642 | Accuracy so far: 0.527
INFO:__main__:Progress: 550/2642 | Accuracy so far: 0.521
INFO:__main__:Progress: 600/2642 | Accuracy so far: 0.523
INFO:__main__:Progress: 650/2642 | Accuracy so far: 0.515
INFO:__main__:Progress: 700/2642 | Accuracy so far: 0.515
INFO:__main__:Progress: 750/2642 | Accuracy so far: 0.518
INFO:__main__:Progress: 800/2642 | Accuracy so far: 0.518
INFO:__main__:Progress: 8