# Examine which words of each dataset sample GPT-4o mini selects

In [None]:
import os
import json
import openai
import numpy as np
import torch
from tqdm import tqdm
from datasets import load_dataset
import re

def _parse_indices(output, words, limit):
    """Turn the model's raw reply into validated word indices.

    Args:
        output: Raw text returned by the model.
        words: The whitespace-split words the indices refer to.
        limit: Maximum number of indices to keep.

    Returns:
        (indices, selected_words): the sorted, in-bounds indices truncated to
        ``limit`` entries, and the words found at those positions.

    Raises:
        ValueError: If a comma-separated field is not a single integer
            (e.g. two numbers separated only by whitespace).
    """
    # Drop everything except digits, commas and whitespace so that stray
    # prose such as "Indices:" does not break integer parsing.
    cleaned_output = re.sub(r'[^0-9,\s]', '', output)
    parsed = sorted(int(field.strip()) for field in cleaned_output.split(',') if field.strip())

    # Discard positions past the end of the text, then cap at the budget.
    in_bounds = [idx for idx in parsed if idx < len(words)][:limit]
    return in_bounds, [words[idx] for idx in in_bounds]


def test_gpt4o_mini_indices(num_samples=10, compression_ratio=0.1, max_length=120):
    """
    Test GPT-4o mini's ability to output indices correctly using raw data

    For each sample the model is asked to return the positions of the most
    important words of the sample's "answer" text. The reply is parsed into
    integer indices and the sample counts as a success when at least 80% of
    the requested number of in-bounds indices were returned. Detailed
    per-sample results are also written to
    ``gpt4o_mini_indices_test_results.json`` in the working directory.

    Args:
        num_samples: Number of examples to test
        compression_ratio: Ratio of tokens to select
        max_length: Maximum context length

    Returns:
        success_rate: Percentage of successful index extractions
            (0.0 when no samples were available)
        all_results: Detailed results for each test case

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is not set.
    """
    # Read the API key from the environment. Credentials must never be
    # committed to source; checking first also fails fast before any
    # dataset download.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Please set OPENAI_API_KEY environment variable")
    openai.api_key = api_key

    # Load raw dataset
    raw = load_dataset("raw", "main", split="test")

    # Select a subset of samples for testing
    samples = raw.select(range(min(num_samples, len(raw))))
    total = len(samples)

    # The token budget does not depend on the sample, so compute it once.
    num_of_compressed_tokens = int(max_length * compression_ratio)

    all_results = []
    success_count = 0

    for i, sample in enumerate(tqdm(samples, desc="Testing GPT-4o mini indices")):
        query = sample["question"]
        reasoning = sample["answer"]
        words = reasoning.split()

        # Reset per-sample state so a failure never reports values left over
        # from a previous iteration (or references an unbound name).
        output = None
        indices = []
        selected_words = []
        success = False
        error_msg = None

        # Test GPT-4o mini index generation
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "user",
                     # The worked example uses the same 0-based,
                     # whitespace-split positions the parser expects:
                     # 'Where'=0, 'is'=1, 'my'=2, 'pencil?'=3.
                     "content": f"You should select {num_of_compressed_tokens} important words from the following text. "
                                f"I need you to only output the words' indices increasingly, DO NOT OUTPUT ANYTHING ELSE. "
                                f"For example, for text 'Where is my pencil?', the important words are 'Where' and 'pencil' "
                                f"at position 0 and 3, so the output is '0, 3'. Now, do it for the following text: {reasoning}"}
                ],
                temperature=0.0,  # Use deterministic output for testing
                max_tokens=200
            )
            output = response.choices[0].message.content.strip()
        except Exception as e:
            # Best-effort evaluation loop: record the API failure for this
            # sample and carry on with the next one.
            error_msg = str(e)
        else:
            # Extract and validate indices
            try:
                indices, selected_words = _parse_indices(output, words, num_of_compressed_tokens)
            except ValueError as e:
                error_msg = str(e)
            else:
                # Allow 80% success threshold
                success = len(indices) >= num_of_compressed_tokens * 0.8

        if success:
            success_count += 1

        # Record results
        result = {
            "sample_id": i,
            "query": query,
            "reasoning": reasoning,
            "expected_token_count": num_of_compressed_tokens,
            "raw_model_output": output,
            "extracted_indices": indices,
            "selected_words": selected_words,
            "success": success
        }

        if error_msg is not None:
            result["error"] = error_msg

        all_results.append(result)

        # Print progress against the number of samples actually evaluated.
        print(f"Sample {i+1}/{total}: {'✓' if success else '✗'}")
        if not success:
            print(f"  Raw output: {output}")
            if indices:
                print(f"  Extracted {len(indices)} indices, expected {num_of_compressed_tokens}")

    # Calculate success rate (guard against an empty selection)
    success_rate = (success_count / total) * 100 if total else 0.0

    # Save detailed results to file
    with open("gpt4o_mini_indices_test_results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)

    return success_rate, all_results

def _names_out_of_range_index(result):
    """Return True if the sample's model reply named a word position past the end of its text."""
    word_count = len(result["reasoning"].split())
    # Look at the raw reply: the stored "extracted_indices" may already have
    # had out-of-bounds values removed, which would hide this failure mode.
    raw_output = result.get("raw_model_output") or ""
    candidates = [int(token) for token in re.findall(r"\d+", raw_output)]
    candidates.extend(result.get("extracted_indices", []))
    return any(idx >= word_count for idx in candidates)


def analyze_results(success_rate, all_results):
    """Analyze the test results and print summary statistics

    Args:
        success_rate: Percentage of successful samples; printed with two
            decimal places.
        all_results: List of per-sample result dicts. Keys read here are
            "success", "extracted_indices", "expected_token_count",
            "reasoning", "selected_words", and, when present,
            "raw_model_output" and "error".

    Returns:
        None. The report is written to standard output.
    """
    print("\n==== GPT-4o Mini Indices Test Results ====")
    print(f"Success Rate: {success_rate:.2f}%")

    # Calculate statistics
    failures = [r for r in all_results if not r["success"]]
    total_samples = len(all_results)
    failed_samples = len(failures)
    successful_samples = total_samples - failed_samples

    print(f"Total Samples: {total_samples}")
    print(f"Successful Samples: {successful_samples}")
    print(f"Failed Samples: {failed_samples}")

    # Common failure patterns
    if failures:
        print("\nCommon Failure Patterns:")

        # Failures where no index at all could be extracted
        non_numeric = sum(1 for r in failures if not r["extracted_indices"])
        if non_numeric > 0:
            print(f"- Non-numeric outputs: {non_numeric} samples")

        # Failures where some indices were extracted, but fewer than 80% of the budget
        insufficient = sum(
            1 for r in failures
            if r["extracted_indices"]
            and len(r["extracted_indices"]) < r["expected_token_count"] * 0.8
        )
        if insufficient > 0:
            print(f"- Insufficient indices count: {insufficient} samples")

        # Failures where the model pointed beyond the last word of the text
        out_of_range = sum(1 for r in failures if _names_out_of_range_index(r))
        if out_of_range > 0:
            print(f"- Out-of-range indices: {out_of_range} samples")

    # Sample successful and failed outputs
    if successful_samples > 0:
        print("\nSample Successful Output:")
        successful = next((r for r in all_results if r["success"]), None)
        if successful:
            print(f"Raw output: {successful.get('raw_model_output')}")
            print(f"Extracted indices: {successful['extracted_indices'][:10]}...")
            print(f"Selected words: {successful['selected_words'][:10]}...")

    if failures:
        print("\nSample Failed Output:")
        failed = failures[0]
        # .get(): a sample whose API call failed may carry no raw output.
        print(f"Raw output: {failed.get('raw_model_output')}")
        print(f"Reasoning: {failed['reasoning'][:100]}...")
        if failed.get("extracted_indices"):
            print(f"Extracted indices: {failed['extracted_indices']}")
        if "error" in failed:
            print(f"Error: {failed['error']}")

if __name__ == "__main__":
    # Script entry point. Evaluates up to 20 samples from the dataset's test
    # split; this calls the OpenAI API once per sample and writes the
    # per-sample details to gpt4o_mini_indices_test_results.json.
    print("Testing GPT-4o mini indices extraction on raw dataset...")
    success_rate, all_results = test_gpt4o_mini_indices(num_samples=20)

    # Print the summary report (counts, failure patterns, example outputs)
    analyze_results(success_rate, all_results)

  from .autonotebook import tqdm as notebook_tqdm


Testing GPT-4o mini indices extraction on GSM8K dataset...


Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 316830.42 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 385955.56 examples/s]
Testing GPT-4o mini indices:   5%|▌         | 1/20 [00:01<00:35,  1.89s/it]

Sample 1/20: ✓


Testing GPT-4o mini indices:  10%|█         | 2/20 [00:03<00:36,  2.01s/it]

Sample 2/20: ✓


Testing GPT-4o mini indices:  15%|█▌        | 3/20 [00:04<00:24,  1.45s/it]

Sample 3/20: ✓


Testing GPT-4o mini indices:  20%|██        | 4/20 [00:05<00:19,  1.25s/it]

Sample 4/20: ✓


Testing GPT-4o mini indices:  25%|██▌       | 5/20 [00:06<00:16,  1.08s/it]

Sample 5/20: ✓


Testing GPT-4o mini indices:  30%|███       | 6/20 [00:07<00:15,  1.08s/it]

Sample 6/20: ✓


Testing GPT-4o mini indices:  35%|███▌      | 7/20 [00:08<00:14,  1.14s/it]

Sample 7/20: ✓


Testing GPT-4o mini indices:  40%|████      | 8/20 [00:09<00:12,  1.07s/it]

Sample 8/20: ✓


Testing GPT-4o mini indices:  45%|████▌     | 9/20 [00:10<00:10,  1.03it/s]

Sample 9/20: ✓


Testing GPT-4o mini indices:  50%|█████     | 10/20 [00:11<00:09,  1.10it/s]

Sample 10/20: ✓


Testing GPT-4o mini indices:  55%|█████▌    | 11/20 [00:12<00:08,  1.09it/s]

Sample 11/20: ✓


Testing GPT-4o mini indices:  60%|██████    | 12/20 [00:12<00:06,  1.19it/s]

Sample 12/20: ✓


Testing GPT-4o mini indices:  65%|██████▌   | 13/20 [00:13<00:05,  1.18it/s]

Sample 13/20: ✓


Testing GPT-4o mini indices:  70%|███████   | 14/20 [00:16<00:08,  1.36s/it]

Sample 14/20: ✓


Testing GPT-4o mini indices:  75%|███████▌  | 15/20 [00:16<00:05,  1.15s/it]

Sample 15/20: ✓


Testing GPT-4o mini indices:  80%|████████  | 16/20 [00:17<00:04,  1.11s/it]

Sample 16/20: ✓


Testing GPT-4o mini indices:  85%|████████▌ | 17/20 [00:19<00:03,  1.11s/it]

Sample 17/20: ✓


Testing GPT-4o mini indices:  90%|█████████ | 18/20 [00:19<00:02,  1.02s/it]

Sample 18/20: ✓


Testing GPT-4o mini indices:  95%|█████████▌| 19/20 [00:20<00:00,  1.04it/s]

Sample 19/20: ✓


Testing GPT-4o mini indices: 100%|██████████| 20/20 [00:21<00:00,  1.08s/it]

Sample 20/20: ✓

==== GPT-4o Mini Indices Test Results ====
Success Rate: 100.00%
Total Samples: 20
Successful Samples: 20
Failed Samples: 0

Sample Successful Output:
Raw output: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
Extracted indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]...
Selected words: ['Janet', 'sells', '16', '-', '3', '-', '4', '=', '<<16-3-4=9>>9', 'duck']...



