In [1]:
import os
import json
import logging
import numpy as np
from pathlib import Path

# Configure logging once for the whole script: INFO and above, with every
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

def load_json_files_from_folder(folder_path):
    """Load all JSON lines files from a folder

    Every entry of ``folder_path`` whose name ends with ``.json`` is read line
    by line, and each non-blank line is parsed as one JSON document. Files are
    visited in sorted name order so the order of the returned records is
    reproducible (``os.listdir`` makes no ordering guarantee).

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        List of parsed records, in file order then line order. The list is
        empty when ``folder_path`` is not a directory. Lines that are not
        valid JSON are skipped with a warning. A file that cannot be read is
        logged as an error; records parsed before the failure are kept.
    """
    json_data = []

    if not os.path.isdir(folder_path):
        logger.error("Error: Folder path does not exist.")
        return json_data

    files = sorted(os.listdir(folder_path))
    logger.info(f"Found {len(files)} files in {folder_path}")

    for file_name in files:
        if not file_name.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, file_name)
        logger.info(f"Loading {file_name}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Blank lines are padding, not malformed records, so
                        # they must not be reported as invalid JSON.
                        continue
                    try:
                        json_data.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        logger.warning(f"Skipping invalid JSON at line {line_num} in {file_name}: {e}")
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # load of the remaining files in the folder.
            logger.error(f"Error reading {file_name}: {e}")

    logger.info(f"Loaded {len(json_data)} total records")
    return json_data

def load_single_json_file(file_path):
    """Load a single JSON lines file

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path of the JSON lines file to read (opened as UTF-8).

    Returns:
        List of parsed records in line order. The list is empty when the path
        does not exist. Lines that are not valid JSON are skipped with a
        warning. If reading fails part-way, the error is logged and the
        records parsed so far are returned.
    """
    json_data = []

    if not os.path.exists(file_path):
        logger.error(f"Error: File {file_path} does not exist.")
        return json_data

    logger.info(f"Loading {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    # Blank lines are padding, not malformed records, so they
                    # must not be reported as invalid JSON.
                    continue
                try:
                    json_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back whatever
        # was parsed before it happened.
        logger.error(f"Error reading {file_path}: {e}")
        return json_data

    logger.info(f"Loaded {len(json_data)} records from {file_path}")
    return json_data

def allocate_pos_and_neg_from_probabilities(all_data, num_neg_samples=15, min_candidates=2, probability_threshold=0.5):
    """
    Allocate positive and negative candidates based on probability scores

    For each sequence the highest-probability candidate becomes the single
    positive example, and up to ``num_neg_samples`` of the lowest-probability
    candidates become the negatives (listed from higher to lower probability).
    Sequences that fail validation, or raise while being processed, are
    skipped and counted. Summary statistics are logged at the end.

    Args:
        all_data: List of sequences (dicts) carrying "candidates" and
            "candidate_probabilities", and optionally "candidate_indices"
        num_neg_samples: Target number of negative samples per query; capped
            at the number of candidates minus one (one is kept as positive)
        min_candidates: Minimum candidates required per query
        probability_threshold: Accepted for backward compatibility but not
            applied: no candidate is filtered by its probability

    Returns:
        List of dicts, one per processed sequence, holding the query, the
        "pos"/"neg" candidates with their indices, "teacher_scores"
        (positive probability followed by the negative probabilities) and
        per-sequence statistics. When "candidate_indices" is absent the
        indices are positions in "candidates", stored as plain ``int`` so the
        records stay JSON serializable.
    """
    all_json_list = []
    skipped_sequences = 0
    candidate_stats = []
    probability_stats = []

    logger.info(f"Processing {len(all_data)} sequences with probability-based allocation")

    for i, sequence in enumerate(all_data):
        try:
            # Extract required fields from NEW format
            candidate_list = sequence.get("candidates", [])
            candidate_index_list = sequence.get("candidate_indices", [])
            candidate_probabilities = sequence.get("candidate_probabilities", [])

            # Validate data integrity
            if not candidate_list or not candidate_probabilities:
                logger.warning(f"Skipping sequence {i}: Missing candidates or probabilities")
                skipped_sequences += 1
                continue

            if len(candidate_list) != len(candidate_probabilities):
                logger.warning(f"Skipping sequence {i}: Mismatch in candidates ({len(candidate_list)}) and probabilities ({len(candidate_probabilities)})")
                skipped_sequences += 1
                continue

            if candidate_index_list and len(candidate_index_list) != len(candidate_probabilities):
                logger.warning(f"Skipping sequence {i}: Mismatch in indices ({len(candidate_index_list)}) and probabilities ({len(candidate_probabilities)})")
                skipped_sequences += 1
                continue

            # Statistics cover every structurally valid sequence, including
            # the ones rejected just below for having too few candidates.
            num_candidates = len(candidate_probabilities)
            candidate_stats.append(num_candidates)
            probability_stats.extend(candidate_probabilities)

            # Skip if we don't have minimum candidates (need at least 1 pos + 1 neg)
            if num_candidates < min_candidates:
                logger.warning(f"Skipping sequence {i}: Not enough candidates ({num_candidates}) for minimum requirement ({min_candidates})")
                skipped_sequences += 1
                continue

            # Positions of the candidates ordered by descending probability.
            probs_array = np.array(candidate_probabilities)
            sorted_indices = np.argsort(probs_array)[::-1]

            # Reserve 1 candidate for the positive, the rest can be negative.
            actual_neg_samples = min(num_neg_samples, num_candidates - 1)

            if actual_neg_samples != num_neg_samples:
                logger.debug(f"Sequence {i}: Adjusting neg samples from {num_neg_samples} to {actual_neg_samples} (only {num_candidates} candidates available)")

            # argsort yields numpy integers; convert to plain int so the
            # fallback indices below can be serialized by the json module.
            pos_idx = int(sorted_indices[0])
            pos_candidate = candidate_list[pos_idx]
            pos_probability = float(probs_array[pos_idx])
            pos_index = candidate_index_list[pos_idx] if candidate_index_list else pos_idx

            # The tail of the descending order holds the lowest scores. The
            # slice starts at 1 or later, so it never includes the positive
            # and always yields exactly actual_neg_samples entries.
            neg_start_idx = max(1, num_candidates - actual_neg_samples)
            neg_positions = [int(idx) for idx in sorted_indices[neg_start_idx:]]

            neg_candidates = [candidate_list[idx] for idx in neg_positions]
            if candidate_index_list:
                neg_indices = [candidate_index_list[idx] for idx in neg_positions]
            else:
                neg_indices = list(neg_positions)
            neg_probabilities = [float(probs_array[idx]) for idx in neg_positions]

            avg_neg_probability = float(np.mean(neg_probabilities)) if neg_probabilities else 0.0

            # Create teacher score list: [positive_probability, negative_probabilities...]
            teacher_score_list = [pos_probability] + neg_probabilities

            # Create the new sequence with pos/neg allocation
            sequence_with_score = {
                "query_id": sequence.get("query_id", f"query_{i}"),
                "query": sequence.get("query", ""),
                "pos": [pos_candidate],
                "pos_index": [pos_index],
                "neg": neg_candidates,
                "neg_index": neg_indices,
                "teacher_scores": teacher_score_list,
                "answers": sequence.get("answers", [sequence.get("correct_answer", "")]),
                "task": sequence.get("task", "icl"),
                "num_candidates_available": num_candidates,
                "neg_samples_used": len(neg_candidates),
                "pos_probability": pos_probability,
                "avg_neg_probability": avg_neg_probability,
                "probability_separation": pos_probability - avg_neg_probability,
            }

            # Add optional fields if they exist
            for optional_field in ["query_date", "query_stock", "movement", "correct_answer"]:
                if optional_field in sequence:
                    sequence_with_score[optional_field] = sequence[optional_field]

            all_json_list.append(sequence_with_score)

        except Exception as e:
            # Deliberate best-effort: a malformed record is counted as
            # skipped instead of aborting the whole run.
            logger.error(f"Error processing sequence {i}: {e}")
            skipped_sequences += 1
            continue

    # Calculate and log statistics
    if candidate_stats:
        avg_candidates = np.mean(candidate_stats)
        min_candidates_found = min(candidate_stats)
        max_candidates_found = max(candidate_stats)

        logger.info("Candidate statistics:")
        logger.info(f"  - Average candidates per sequence: {avg_candidates:.1f}")
        logger.info(f"  - Min candidates found: {min_candidates_found}")
        logger.info(f"  - Max candidates found: {max_candidates_found}")
        logger.info(f"  - Sequences with < {num_neg_samples + 1} candidates: {sum(1 for x in candidate_stats if x < num_neg_samples + 1)}")

    if probability_stats:
        logger.info("Probability statistics:")
        logger.info(f"  - Average probability: {np.mean(probability_stats):.4f}")
        logger.info(f"  - Min probability: {min(probability_stats):.4f}")
        logger.info(f"  - Max probability: {max(probability_stats):.4f}")
        logger.info(f"  - Std deviation: {np.std(probability_stats):.4f}")

    # Calculate separation statistics for processed sequences
    if all_json_list:
        separations = [seq.get('probability_separation', 0) for seq in all_json_list]
        pos_probs = [seq.get('pos_probability', 0) for seq in all_json_list]
        avg_neg_probs = [seq.get('avg_neg_probability', 0) for seq in all_json_list]

        logger.info("Pos/Neg separation statistics:")
        logger.info(f"  - Average positive probability: {np.mean(pos_probs):.4f}")
        logger.info(f"  - Average negative probability: {np.mean(avg_neg_probs):.4f}")
        logger.info(f"  - Average separation (pos - neg): {np.mean(separations):.4f}")
        logger.info(f"  - Min separation: {min(separations):.4f}")
        logger.info(f"  - Max separation: {max(separations):.4f}")

    logger.info(f"Successfully processed: {len(all_json_list)} sequences")
    logger.info(f"Skipped: {skipped_sequences} sequences")

    return all_json_list

def save_json_lines(data, output_file):
    """Save data as JSON lines format

    Writes one JSON document per line, UTF-8 encoded, with non-ASCII
    characters written as-is (``ensure_ascii=False``). An existing file is
    overwritten.

    Args:
        data: Sized collection of JSON-serializable objects.
        output_file: Destination path. Missing parent directories are
            created; a bare file name is written to the current directory.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding='utf-8') as file:
        file.writelines(json.dumps(obj, ensure_ascii=False) + "\n" for obj in data)

    logger.info(f"Saved {len(data)} sequences to {output_file}")

def process_probability_based_dataset(input_file, output_file, num_neg_samples=15, min_candidates=2, 
                                     adaptive_neg_samples=True, probability_threshold=0.0):
    """
    Process LLM scored dataset with candidate probabilities

    Loads the scored JSON lines file, checks that the first record carries the
    required fields, allocates positives/negatives, writes the result as JSON
    lines and writes a small statistics JSON file next to it.

    Args:
        input_file: Path to scored data with candidate probabilities
        output_file: Path to save pos/neg allocated data
        num_neg_samples: Target number of negative samples per query
        min_candidates: Minimum candidates required per query
        adaptive_neg_samples: Accepted for interface compatibility; this
            function does not read it
        probability_threshold: Logged, recorded in the statistics and passed
            on to the allocation step

    Returns:
        Number of sequences written to ``output_file``; 0 when nothing could
        be loaded, the format check failed, or no sequence was processed (in
        all of these cases no file is written).
    """
    logger.info("=== Processing Probability-Based LLM Scored Dataset ===")
    logger.info(f"Input: {input_file}")
    logger.info(f"Output: {output_file}")
    logger.info(f"Target negative samples per query: {num_neg_samples}")
    logger.info(f"Minimum candidates required: {min_candidates}")
    logger.info(f"Probability threshold: {probability_threshold}")

    # Load the scored data
    data = load_single_json_file(input_file)

    if not data:
        logger.error("No data loaded, exiting")
        return 0

    logger.info(f"Loaded {len(data)} sequences from input file")

    # Validate data format on the first record only. A record that is not a
    # JSON object has no fields at all, so it fails the check instead of
    # raising.
    sample_item = data[0]
    sample_fields = list(sample_item.keys()) if isinstance(sample_item, dict) else []
    required_fields = ["candidates", "candidate_probabilities"]
    missing_fields = [field for field in required_fields if field not in sample_fields]

    if missing_fields:
        logger.error(f"Input data missing required fields: {missing_fields}")
        logger.error("Expected format: {'candidates': [...], 'candidate_probabilities': [...]}")
        logger.error(f"Found fields: {sample_fields}")
        return 0

    logger.info("Data format validation passed")

    # Allocate positive and negative examples based on probabilities
    processed_data = allocate_pos_and_neg_from_probabilities(
        data,
        num_neg_samples=num_neg_samples,
        min_candidates=min_candidates,
        probability_threshold=probability_threshold
    )

    if not processed_data:
        logger.error("No data processed successfully, exiting")
        return 0

    # Save the processed data
    save_json_lines(processed_data, output_file)

    # Generate statistics
    stats = {
        "input_file": input_file,
        "output_file": output_file,
        "original_sequences": len(data),
        "processed_sequences": len(processed_data),
        "processing_rate": len(processed_data) / len(data),
        "negative_samples_per_query": num_neg_samples,
        "probability_threshold": probability_threshold,
        "processing_method": "probability_based_allocation"
    }

    # Derive the statistics path from the file stem. A plain
    # str.replace('.json', ...) returns the output path unchanged when it has
    # no '.json' (the stats would then overwrite the data just saved) and
    # also rewrites '.json' inside directory names.
    stats_root, _ = os.path.splitext(output_file)
    stats_file = f"{stats_root}_processing_stats.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    logger.info("Processing complete!")
    logger.info(f"Statistics saved to: {stats_file}")

    return len(processed_data)

def analyze_probability_distribution(input_file):
    """Log summary statistics of the candidate probabilities in a scored file.

    Reads ``input_file`` with ``load_single_json_file`` and reports, through
    the module logger, overall statistics across all candidate probabilities
    plus the per-query averages of the highest and lowest probability.
    Records without a non-empty "candidate_probabilities" entry are ignored.
    Nothing is returned.
    """
    logger.info("=== Analyzing Probability Distribution ===")

    records = load_single_json_file(input_file)
    if not records:
        logger.error("No data to analyze")
        return

    # Keep only the queries that actually carry probabilities.
    per_query = [
        probs
        for probs in (record.get("candidate_probabilities", []) for record in records)
        if probs
    ]
    flat = [p for probs in per_query for p in probs]
    highest = [max(probs) for probs in per_query]
    lowest = [min(probs) for probs in per_query]

    if not flat:
        return

    logger.info("Overall probability statistics:")
    logger.info(f"  - Total candidates analyzed: {len(flat)}")

    mean_all = np.mean(flat)
    std_all = np.std(flat)
    q25, q50, q75 = (np.percentile(flat, q) for q in (25, 50, 75))

    logger.info(f"  - Mean probability: {mean_all:.4f}")
    logger.info(f"  - Std deviation: {std_all:.4f}")
    logger.info(f"  - Min probability: {min(flat):.4f}")
    logger.info(f"  - Max probability: {max(flat):.4f}")
    logger.info(f"  - 25th percentile: {q25:.4f}")
    logger.info(f"  - 50th percentile (median): {q50:.4f}")
    logger.info(f"  - 75th percentile: {q75:.4f}")

    logger.info("Per-query statistics:")
    mean_highest = np.mean(highest)
    mean_lowest = np.mean(lowest)
    logger.info(f"  - Average max probability per query: {mean_highest:.4f}")
    logger.info(f"  - Average min probability per query: {mean_lowest:.4f}")
    logger.info(f"  - Average separation per query: {mean_highest - mean_lowest:.4f}")

if __name__ == "__main__":
    # Script entry point: analyze the scored input, then build the pos/neg
    # training file from it.
    logger.info("=== Probability-Based Positive/Negative Allocation ===")

    # Configuration
    # NOTE: "processing_mode" is only logged; nothing in this block branches
    # on it.
    config = {
        "num_neg_samples": 15,  # Target number of negative samples per query
        "min_candidates": 2,    # Minimum candidates required (1 pos + 1 neg)
        "adaptive_neg_samples": True,  # Adapt neg samples when not enough candidates
        "probability_threshold": 0.0,  # Include all candidates (no threshold filtering)
        "processing_mode": "single_file"  # Options: "single_file", "multiple_files", "folder"
    }

    # File paths - UPDATE THESE TO MATCH YOUR NEW LLM SCORING OUTPUT
    input_file_path = "/root/nfs/AJ FinRag/LLM Scores/llm_data/all_companies_scored_candidates_probabilities.json"
    output_file_path = "/root/nfs/AJ FinRag/Training Data/llm_data/all_companies_train_pos_neg.json"

    logger.info(f"Input file: {input_file_path}")
    logger.info(f"Output file: {output_file_path}")
    logger.info(f"Configuration: {config}")

    # First, analyze the probability distribution
    logger.info("\n=== Analyzing Input Data ===")
    analyze_probability_distribution(input_file_path)

    # Process the probability-based dataset
    logger.info("\n=== Processing Probability-Based Data ===")
    total_processed = process_probability_based_dataset(
        input_file=input_file_path,
        output_file=output_file_path,
        num_neg_samples=config["num_neg_samples"],
        min_candidates=config["min_candidates"],
        adaptive_neg_samples=config["adaptive_neg_samples"],
        probability_threshold=config["probability_threshold"]
    )

    logger.info(f"\nProcessing complete! Total sequences processed: {total_processed}")

    if total_processed > 0:
        logger.info("\nOutput format:")
        logger.info("  - query: Query string")
        logger.info("  - pos: [highest_probability_candidate]")
        logger.info("  - neg: [lowest_probability_candidates...]")
        logger.info("  - teacher_scores: [pos_prob, neg_prob1, neg_prob2, ...]")
        logger.info("\nReady for FinSeer retriever training!")

        # Only report the output files when the run produced them: a result
        # of 0 means processing stopped before anything was written.
        logger.info("\nFiles generated:")
        logger.info(f"  - Training data: {output_file_path}")
        logger.info(f"  - Statistics: {output_file_path.replace('.json', '_processing_stats.json')}")
    else:
        logger.error("No sequences were processed. Please check your input data format.")

2025-08-25 20:29:39,552 - INFO - === Probability-Based Positive/Negative Allocation ===
2025-08-25 20:29:39,553 - INFO - Input file: /root/nfs/AJ FinRag/LLM Scores/llm_data/all_companies_scored_candidates_probabilities.json
2025-08-25 20:29:39,553 - INFO - Output file: /root/nfs/AJ FinRag/Training Data/llm_data/all_companies_train_pos_neg.json
2025-08-25 20:29:39,554 - INFO - Configuration: {'num_neg_samples': 15, 'min_candidates': 2, 'adaptive_neg_samples': True, 'probability_threshold': 0.0, 'processing_mode': 'single_file'}
2025-08-25 20:29:39,555 - INFO - 
=== Analyzing Input Data ===
2025-08-25 20:29:39,555 - INFO - === Analyzing Probability Distribution ===
2025-08-25 20:29:39,572 - INFO - Loading /root/nfs/AJ FinRag/LLM Scores/llm_data/all_companies_scored_candidates_probabilities.json
2025-08-25 20:29:40,058 - INFO - Loaded 13210 records from /root/nfs/AJ FinRag/LLM Scores/llm_data/all_companies_scored_candidates_probabilities.json
2025-08-25 20:29:40,077 - INFO - Overall proba