In [45]:
import pandas as pd

base_df = pd.read_csv('../../../datasets/question answering/ai2_arc/ARC-Challenge-clean.csv')
base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2581 entries, 0 to 2580
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  2581 non-null   int64 
 1   Unnamed: 0    2581 non-null   int64 
 2   id            2581 non-null   object
 3   question      2581 non-null   object
 4   answerKey     2581 non-null   object
 5   choice_A      2581 non-null   object
 6   choice_B      2581 non-null   object
 7   choice_C      2581 non-null   object
 8   choice_D      2581 non-null   object
 9   split         2581 non-null   object
dtypes: int64(2), object(8)
memory usage: 201.8+ KB


# Setup & Configuration (Logging + API Key Rotation)

In [2]:
import cohere
import time
import logging
from datetime import datetime
from pathlib import Path

# === 1. Setup Logging ===
def setup_logging(log_file: str = "cohere_predict.log"):
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)
    
    log_path = log_dir / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{log_file}"
    
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
        handlers=[
            logging.FileHandler(log_path),
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized. Log file: {log_path}")
    return logger

logger = setup_logging()


2025-06-04 11:25:48,885 - INFO - setup_logging:23 - Logging initialized. Log file: logs\20250604_112548_cohere_predict.log


# Cohere Configuration & API Key Switching

In [5]:
# === 2. Configuration Class ===
class Config:
    COHERE_API_KEYS = [
    "Zqv68DB6S7CcEr0vmqfroN4Xwc6qy0Dm4knA8RGF",
    "FmTCH3v4g3VZMxnWvAP9jWjUqMfPd7Z5S9nuwWKe",
    "4i08JycbiUx9vgLA0Cka0bzVfOhqpixo0EpjuwY5"
]
    MODEL = "command-r-plus"
    MAX_RETRIES = 5
    REQUEST_DELAY = 7  # seconds
    TIMEOUT = 45
    MAX_REQUESTS_PER_KEY = 900

# === 3. API Key Management ===
key_index = 0
requests_made = 0
co = cohere.Client(Config.COHERE_API_KEYS[key_index])

def rotate_key():
    global key_index, requests_made, co
    key_index += 1
    if key_index >= len(Config.COHERE_API_KEYS):
        raise Exception("All Cohere API keys are exhausted.")
    co = cohere.Client(Config.COHERE_API_KEYS[key_index])
    requests_made = 0
    logger.warning(f"üîë Switched to Cohere API key #{key_index + 1}")


# Cohere Prompt Format & API Call

In [6]:
# === 4. Prompt Format (Same as GPT-4o) ===
predict_then_explain_prompt_context = '''
You are given a multiple-choice question. 

Step 1: Based on your knowledge and reasoning, select the most likely correct answer.  
Step 2: Justify your answer with clear reasoning and explanation.

Instructions:
- Use logical reasoning to determine the best answer.
- Do not reference the other answer options in your explanation.
- Keep the explanation concise but informative (2-4 sentences).
- Provide clear reasoning for your choice.

---

Question: {question}

Options:
A) {option_A}  
B) {option_B}  
C) {option_C}  
D) {option_D}

Respond in this format:

<prediction>[A/B/C/D]</prediction>  
<explanation>[Your reasoning and justification for the answer]</explanation>
'''

# === 5. API Call Function ===
def make_cohere_api_call(prompt: str, row_id=None) -> str:
    global requests_made
    for attempt in range(Config.MAX_RETRIES):
        if requests_made >= Config.MAX_REQUESTS_PER_KEY:
            rotate_key()
        try:
            time.sleep(Config.REQUEST_DELAY)
            response = co.chat(
                message=prompt,
                model=Config.MODEL,
                temperature=0.0
            )
            requests_made += 1
            return response.text
        except Exception as e:
            logger.warning(f"Retry {attempt+1}/{Config.MAX_RETRIES} failed for row {row_id}: {e}")
            time.sleep(3)
    logger.error(f"‚ùå Failed after {Config.MAX_RETRIES} attempts for row {row_id}")
    return None


# Prediction Logic (Cohere-based `cohere_predict` function)

In [7]:
import re
import time
from typing import Dict, Any, Optional

def extract_prediction_and_explanation(output: str) -> Dict[str, Optional[str]]:
    """Extract prediction and explanation using standard tag format"""
    try:
        pred_match = re.search(r"<prediction>\s*([A-D])\s*</prediction>", output, re.IGNORECASE)
        prediction = pred_match.group(1).upper() if pred_match else None
        
        expl_match = re.search(r"<explanation>\s*(.*?)\s*</explanation>", output, re.DOTALL | re.IGNORECASE)
        explanation = expl_match.group(1).strip() if expl_match else None
        
        return {"prediction": prediction, "explanation": explanation}
    except Exception as e:
        logger.error(f"Error extracting fields: {e}")
        return {"prediction": None, "explanation": None}

def validate_row(row: Dict[str, Any]) -> bool:
    required_fields = ['question', 'choice_A', 'choice_B', 'choice_C', 'choice_D']
    return all(field in row and str(row[field]).strip() for field in required_fields)

def cohere_predict(row: Dict[str, Any], row_id: Optional[int] = None) -> Dict[str, Any]:
    start_time = time.time()
    if not validate_row(row):
        logger.error(f"Invalid input for row {row_id}")
        return {
            "row_id": row_id,
            "question_id": row.get('id'),
            "prediction": None,
            "explanation": None,
            "raw_output": None,
            "actual_answer": row.get('answerKey'),
            "is_correct": None,
            "error": "Invalid input",
            "processing_time": 0
        }
    try:
        prompt = predict_then_explain_prompt_context.format(
            question=row['question'],
            option_A=row['choice_A'],
            option_B=row['choice_B'],
            option_C=row['choice_C'],
            option_D=row['choice_D']
        )

        raw_output = make_cohere_api_call(prompt, row_id=row_id)
        elapsed = time.time() - start_time

        if raw_output is None:
            return {
                "row_id": row_id,
                "question_id": row.get('id'),
                "prediction": None,
                "explanation": None,
                "raw_output": None,
                "actual_answer": row.get('answerKey'),
                "is_correct": None,
                "error": "Cohere API call failed",
                "processing_time": elapsed
            }

        extracted = extract_prediction_and_explanation(raw_output)
        actual = row.get('answerKey')
        pred = extracted['prediction']
        is_correct = pred == actual if pred else None

        return {
            "row_id": row_id,
            "question_id": row.get('id'),
            "prediction": pred,
            "explanation": extracted['explanation'],
            "raw_output": raw_output,
            "actual_answer": actual,
            "is_correct": is_correct,
            "split": row.get('split'),
            "error": None,
            "processing_time": elapsed
        }

    except Exception as e:
        logger.error(f"Error in cohere_predict for row {row_id}: {e}")
        return {
            "row_id": row_id,
            "question_id": row.get('id'),
            "prediction": None,
            "explanation": None,
            "raw_output": None,
            "actual_answer": row.get('answerKey'),
            "is_correct": None,
            "error": str(e),
            "processing_time": time.time() - start_time
        }


# DataFrame Processing (with progress bar + intermediate save)

In [8]:
from tqdm import tqdm
import pandas as pd
import json
from typing import List

def process_dataframe(df: pd.DataFrame, 
                      start_index=0, 
                      end_index=None, 
                      save_interval=50, 
                      output_file="cohere_results.json") -> List[Dict[str, Any]]:
    if end_index is None:
        end_index = len(df)
    subset_df = df.iloc[start_index:end_index]

    results = []
    failed_count = 0
    correct_count = 0

    pbar = tqdm(subset_df.iterrows(), total=len(subset_df), desc="Processing", unit="q")

    for i, row in pbar:
        result = cohere_predict(row.to_dict(), row_id=i)
        results.append(result)

        if result['prediction'] is None:
            failed_count += 1
        elif result['is_correct'] is True:
            correct_count += 1

        success_rate = ((len(results) - failed_count) / len(results)) * 100
        accuracy = (correct_count / (len(results) - failed_count)) * 100 if (len(results) - failed_count) > 0 else 0

        pbar.set_postfix({
            "Success": f"{success_rate:.1f}%",
            "Accuracy": f"{accuracy:.1f}%",
            "Last": result['prediction'] or "FAIL"
        })

        if len(results) % save_interval == 0:
            save_results(results, output_file)
            logger.info(f"Saved {len(results)} results to {output_file}")

    save_results(results, output_file)
    return results

def save_results(results: List[Dict[str, Any]], filename: str):
    Path("output").mkdir(exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    full_path = Path("output") / f"{timestamp}_{filename}"
    with open(full_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    logger.info(f"‚úÖ Results saved to {full_path}")


# Full Chunk-Based Dataset Loop (for 2581 samples)

In [9]:
import pandas as pd
from datetime import datetime

# === Load your dataset ===
base_df = pd.read_csv("../../../datasets/question answering/ai2_arc/ARC-Challenge-clean.csv")
base_df.dropna(inplace=True)

# Optional: Fix answers if needed
base_df['answerKey'] = base_df['answerKey'].replace({'1': 'A', '2': 'B', '3': 'C', '4': 'D'})
base_df = base_df[base_df['answerKey'].isin(['A', 'B', 'C', 'D'])]

# === Metadata ===
logger.info(f"Starting full dataset processing with Cohere")
logger.info(f"Dataset size: {len(base_df)} questions")
logger.info(f"Processing start time (UTC): {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}")

print("üöÄ Starting full dataset processing...")
print(f"üìä Total questions: {len(base_df)}")
print(f"‚è∞ Start time: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}")

# === Chunking Strategy ===
chunk_size = 500
total_chunks = (len(base_df) + chunk_size - 1) // chunk_size
print(f"üì¶ Processing in {total_chunks} chunks of {chunk_size} questions each")

all_results = []

for chunk_num in range(total_chunks):
    start_idx = chunk_num * chunk_size
    end_idx = min(start_idx + chunk_size, len(base_df))

    print(f"\n{'='*60}")
    print(f"üîÑ Processing Chunk {chunk_num + 1}/{total_chunks}")
    print(f"üìç Rows {start_idx} to {end_idx - 1} ({end_idx - start_idx} questions)")
    print(f"{'='*60}")

    chunk_results = process_dataframe(
        base_df,
        start_index=start_idx,
        end_index=end_idx,
        save_interval=50,
        output_file=f"chunk_{chunk_num + 1}_cohere_predictions.json"
    )

    all_results.extend(chunk_results)

    # Stats
    chunk_successful = sum(1 for r in chunk_results if r['prediction'] is not None)
    chunk_accuracy = sum(1 for r in chunk_results if r['is_correct'] is True) / chunk_successful * 100 if chunk_successful > 0 else 0

    print(f"‚úÖ Chunk {chunk_num + 1} completed:")
    print(f"   üìà Success rate: {chunk_successful}/{len(chunk_results)} ({chunk_successful/len(chunk_results)*100:.1f}%)")
    print(f"   üéØ Accuracy: {chunk_accuracy:.1f}%")

    if chunk_num < total_chunks - 1:
        print("‚è∏Ô∏è  Sleeping 2s between chunks...")
        time.sleep(2)


2025-06-04 11:27:29,143 - INFO - <module>:13 - Starting full dataset processing with Cohere
2025-06-04 11:27:29,144 - INFO - <module>:14 - Dataset size: 2581 questions
2025-06-04 11:27:29,145 - INFO - <module>:15 - Processing start time (UTC): 2025-06-04 07:57:29


üöÄ Starting full dataset processing...
üìä Total questions: 2581
‚è∞ Start time: 2025-06-04 07:57:29
üì¶ Processing in 6 chunks of 500 questions each

üîÑ Processing Chunk 1/6
üìç Rows 0 to 499 (500 questions)


Processing:   0%|          | 0/500 [00:00<?, ?q/s]2025-06-04 11:27:37,978 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 1/500 [00:08<1:13:18,  8.82s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 11:27:46,958 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 2/500 [00:17<1:13:56,  8.91s/q, Success=100.0%, Accuracy=100.0%, Last=B]2025-06-04 11:27:55,804 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 3/500 [00:26<1:13:34,  8.88s/q, Success=100.0%, Accuracy=100.0%, Last=B]2025-06-04 11:28:05,171 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 4/500 [00:36<1:15:01,  9.07s/q, Success=100.0%, Accuracy=100.0%, Last=D]2025-06-04 11:28:20,353 - INFO

‚úÖ Chunk 1 completed:
   üìà Success rate: 493/500 (98.6%)
   üéØ Accuracy: 89.0%
‚è∏Ô∏è  Sleeping 2s between chunks...

üîÑ Processing Chunk 2/6
üìç Rows 500 to 999 (500 questions)


Processing:   0%|          | 0/500 [00:00<?, ?q/s]2025-06-04 12:45:51,031 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 1/500 [00:08<1:12:02,  8.66s/q, Success=100.0%, Accuracy=100.0%, Last=D]2025-06-04 12:46:00,044 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 2/500 [00:17<1:13:33,  8.86s/q, Success=100.0%, Accuracy=100.0%, Last=C]2025-06-04 12:46:08,777 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 3/500 [00:26<1:12:55,  8.80s/q, Success=100.0%, Accuracy=100.0%, Last=C]2025-06-04 12:46:19,841 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 4/500 [00:37<1:20:10,  9.70s/q, Success=100.0%, Accuracy=100.0%, Last=C]2025-06-04 12:46:28,592 - INFO

‚úÖ Chunk 2 completed:
   üìà Success rate: 495/500 (99.0%)
   üéØ Accuracy: 90.9%
‚è∏Ô∏è  Sleeping 2s between chunks...

üîÑ Processing Chunk 3/6
üìç Rows 1000 to 1499 (500 questions)


Processing:   0%|          | 0/500 [00:00<?, ?q/s]2025-06-04 14:03:31,554 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 1/500 [00:11<1:37:19, 11.70s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 14:03:40,325 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 2/500 [00:20<1:22:47,  9.97s/q, Success=100.0%, Accuracy=100.0%, Last=B]2025-06-04 14:03:49,109 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 3/500 [00:29<1:18:06,  9.43s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 14:03:57,657 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 4/500 [00:37<1:15:06,  9.09s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 14:04:07,536 - INFO

‚úÖ Chunk 3 completed:
   üìà Success rate: 496/500 (99.2%)
   üéØ Accuracy: 85.3%
‚è∏Ô∏è  Sleeping 2s between chunks...

üîÑ Processing Chunk 4/6
üìç Rows 1500 to 1999 (500 questions)


Processing:   0%|          | 0/500 [00:00<?, ?q/s]2025-06-04 15:21:53,537 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 1/500 [00:09<1:17:27,  9.31s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 15:22:02,076 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 2/500 [00:17<1:13:30,  8.86s/q, Success=50.0%, Accuracy=100.0%, Last=FAIL]2025-06-04 15:22:10,431 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 3/500 [00:26<1:11:29,  8.63s/q, Success=33.3%, Accuracy=100.0%, Last=FAIL]2025-06-04 15:22:19,044 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 4/500 [00:34<1:11:17,  8.62s/q, Success=50.0%, Accuracy=100.0%, Last=D]   2025-06-04 15:22:28,259 

‚úÖ Chunk 4 completed:
   üìà Success rate: 492/500 (98.4%)
   üéØ Accuracy: 85.6%
‚è∏Ô∏è  Sleeping 2s between chunks...

üîÑ Processing Chunk 5/6
üìç Rows 2000 to 2499 (500 questions)


Processing:   0%|          | 0/500 [00:00<?, ?q/s]2025-06-04 16:39:04,355 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 1/500 [00:09<1:15:28,  9.08s/q, Success=100.0%, Accuracy=100.0%, Last=B]2025-06-04 16:39:12,989 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   0%|          | 2/500 [00:17<1:13:14,  8.83s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 16:39:22,149 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 3/500 [00:26<1:14:20,  8.97s/q, Success=100.0%, Accuracy=100.0%, Last=C]2025-06-04 16:39:31,073 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 4/500 [00:35<1:14:01,  8.96s/q, Success=100.0%, Accuracy=100.0%, Last=C]2025-06-04 16:39:40,024 - INFO

‚úÖ Chunk 5 completed:
   üìà Success rate: 495/500 (99.0%)
   üéØ Accuracy: 84.8%
‚è∏Ô∏è  Sleeping 2s between chunks...

üîÑ Processing Chunk 6/6
üìç Rows 2500 to 2580 (81 questions)


Processing:   0%|          | 0/81 [00:00<?, ?q/s]2025-06-04 18:01:23,366 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   1%|          | 1/81 [00:08<11:38,  8.74s/q, Success=100.0%, Accuracy=100.0%, Last=B]2025-06-04 18:01:33,137 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   2%|‚ñè         | 2/81 [00:18<12:17,  9.34s/q, Success=100.0%, Accuracy=100.0%, Last=A]2025-06-04 18:01:43,441 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   4%|‚ñé         | 3/81 [00:28<12:43,  9.78s/q, Success=100.0%, Accuracy=100.0%, Last=D]2025-06-04 18:01:53,761 - INFO - _send_single_request:1025 - HTTP Request: POST https://api.cohere.com/v1/chat "HTTP/1.1 200 OK"
Processing:   5%|‚ñç         | 4/81 [00:39<12:49,  9.99s/q, Success=100.0%, Accuracy=100.0%, Last=D]2025-06-04 18:02:02,716 - INFO - _sen

‚úÖ Chunk 6 completed:
   üìà Success rate: 80/81 (98.8%)
   üéØ Accuracy: 87.5%


# Final Analysis and Answer Distribution Summary

In [32]:
base_df

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,question,answerKey,choice_A,choice_B,choice_C,choice_D,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mercury_SC_415702,0,0,George wants to warm his hands quickly by rubb...,A,dry palms,wet palms,palms covered with oil,palms covered with lotion,train
MCAS_2009_5_6516,1,1,Which of the following statements best explain...,B,The refrigerator door is smooth.,The refrigerator door contains iron.,The refrigerator door is a good conductor.,The refrigerator door has electric wires in it.,train
Mercury_7233695,2,2,A fold observed in layers of sedimentary rock ...,B,cooling of flowing magma.,converging of crustal plates.,deposition of river sediments.,solution of carbonate minerals.,train
Mercury_7041615,3,3,Which of these do scientists offer as the most...,D,worldwide disease,global mountain building,rise of mammals that preyed upon plants and an...,impact of an asteroid created dust that blocke...,train
Mercury_7041860,4,4,A boat is acted on by a river current flowing ...,B,west,east,north,south,train
...,...,...,...,...,...,...,...,...,...
Mercury_7090598,2585,2585,Which of these processes involves the transfer...,C,erosion,sedimentation,subduction,cementation,validation
OHAT_2007_5_24,2586,2586,"In a forest, how do decomposers help other org...",B,They release oxygen into the air that animals ...,They put nutrients into the soil that plants u...,They provide shelter in forests where animals ...,They use sunlight to make food for plants and ...,validation
Mercury_SC_402239,2587,2587,What is the best way to conserve natural resou...,C,Throw all glass in the trash.,Use paper towels to clean up spills.,Shorten the time spent taking a shower.,Water the lawn every day.,validation
Mercury_7245088,2588,2588,Which describes the composition of carbohydrates?,D,lipids bonding to form phospholipids,monomers bonding to form polymers,amino acids bonding to form polypeptides,saccharides bonding to form polysaccharides,validation


In [46]:
print("base_df columns:", base_df.columns)
print("df_results columns:", df_results.columns)


base_df columns: Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'question', 'answerKey', 'choice_A',
       'choice_B', 'choice_C', 'choice_D', 'split'],
      dtype='object')
df_results columns: Index(['question_id', 'prediction', 'explanation', 'raw_output',
       'actual_answer', 'is_correct', 'split', 'error', 'processing_time'],
      dtype='object')


In [47]:
from pathlib import Path
import json
import pandas as pd
from datetime import datetime

# === Save complete results ===
print(f"\n{'='*60}")
print("üíæ Saving final combined results with full context...")

# Step 1: Convert predictions to DataFrame
df_results = pd.DataFrame(all_results)

# Step 2: Merge on matching IDs
df_merged = pd.merge(
    base_df,
    df_results,
    how="left",
    left_on="id",
    right_on="question_id"
)

# Step 3: Ensure 'question_id' is in final output
# (it's already in df_results, so no renaming needed)

# === Order columns for clarity ===
columns_order = [
    "question_id", "question", "choice_A", "choice_B", "choice_C", "choice_D",
    "answerKey", "prediction", "is_correct", "explanation", "raw_output",
    "processing_time", "split", "error"
]
df_merged = df_merged[[col for col in columns_order if col in df_merged.columns]]

# === Save to file ===
final_name = "results_compelete_cohere_command-r-plus_arc_challenge"
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

df_merged.to_csv(output_dir / f"{final_name}.csv", index=False)
df_merged.to_json(output_dir / f"{final_name}.json", orient="records", indent=2, force_ascii=False)

print(f"üìÅ Results saved as:")
print(f" - output/{final_name}.csv")
print(f" - output/{final_name}.json")

# === Final Stats ===
total_time = df_merged["processing_time"].sum()
avg_time = total_time / len(df_merged)

print(f"\n‚è±Ô∏è PERFORMANCE METRICS:")
print(f"   Total processing time: {total_time:.1f} seconds ({total_time / 60:.1f} minutes)")
print(f"   Average time per question: {avg_time:.2f} seconds")

# === Answer Distribution Comparison ===
actual_dist = df_merged["answerKey"].value_counts().sort_index()
predicted_dist = df_merged[df_merged["prediction"].notna()]["prediction"].value_counts().sort_index()

print(f"\nüéØ ANSWER DISTRIBUTION COMPARISON:")
for letter in ['A', 'B', 'C', 'D']:
    actual_count = actual_dist.get(letter, 0)
    pred_count = predicted_dist.get(letter, 0)
    actual_pct = actual_count / len(df_merged) * 100
    pred_pct = pred_count / len(predicted_dist) * 100 if len(predicted_dist) > 0 else 0
    print(f"   {letter}: Actual {actual_count} ({actual_pct:.1f}%) ‚Üí Predicted {pred_count} ({pred_pct:.1f}%)")

# === Final Score ===
successful = df_merged["prediction"].notna().sum()
correct = df_merged[df_merged["is_correct"] == True].shape[0]
accuracy = correct / successful * 100 if successful > 0 else 0

print(f"\nüèÜ FINAL RESULTS SUMMARY:")
print(f"   Total Questions: {len(df_merged)}")
print(f"   ‚úÖ Successful Predictions: {successful} ({successful / len(df_merged) * 100:.1f}%)")
print(f"   üéØ Overall Accuracy: {accuracy:.1f}%")

# === Top Errors Summary ===
error_df = df_merged[df_merged["error"].notna()]
if not error_df.empty:
    print(f"\n‚ùå TOP ERRORS:")
    top_errors = error_df["error"].value_counts().head(5)
    for err, count in top_errors.items():
        print(f"   {err}: {count} occurrences")



üíæ Saving final combined results with full context...
üìÅ Results saved as:
 - output/results_compelete_cohere_command-r-plus_arc_challenge.csv
 - output/results_compelete_cohere_command-r-plus_arc_challenge.json

‚è±Ô∏è PERFORMANCE METRICS:
   Total processing time: 24390.3 seconds (406.5 minutes)
   Average time per question: 9.45 seconds

üéØ ANSWER DISTRIBUTION COMPARISON:
   A: Actual 565 (21.9%) ‚Üí Predicted 611 (15275.0%)
   B: Actual 677 (26.2%) ‚Üí Predicted 653 (16325.0%)
   C: Actual 678 (26.3%) ‚Üí Predicted 677 (16925.0%)
   D: Actual 661 (25.6%) ‚Üí Predicted 610 (15250.0%)

üèÜ FINAL RESULTS SUMMARY:
   Total Questions: 2581
   ‚úÖ Successful Predictions: 2551 (98.8%)
   üéØ Overall Accuracy: 87.1%


In [51]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2581 entries, 0 to 2580
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   question_id      2581 non-null   object 
 1   question         2581 non-null   object 
 2   choice_A         2581 non-null   object 
 3   choice_B         2581 non-null   object 
 4   choice_C         2581 non-null   object 
 5   choice_D         2581 non-null   object 
 6   answerKey        2581 non-null   object 
 7   prediction       2551 non-null   object 
 8   is_correct       2551 non-null   object 
 9   explanation      2564 non-null   object 
 10  raw_output       2581 non-null   object 
 11  processing_time  2581 non-null   float64
 12  error            0 non-null      object 
dtypes: float64(1), object(12)
memory usage: 262.3+ KB


In [50]:
import pandas as pd
import re
import logging

# Setup logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define extraction function
def extract_prediction_and_explanation(text):
    try:
        pred_match = re.search(r"<prediction>\s*([A-D])\s*</prediction>", text)
        expl_match = re.search(r"<explanation>(.*?)</explanation>", text, re.DOTALL)

        prediction = pred_match.group(1).strip() if pred_match else None
        explanation = expl_match.group(1).strip() if expl_match else None
        return prediction, explanation
    except Exception as e:
        logger.warning(f"Regex extraction error: {e}")
        return None, None

# Track fixes
fixed_pred, fixed_expl = 0, 0

# Apply corrections
for idx, row in df_merged.iterrows():
    if pd.isna(row['prediction']) or pd.isna(row['explanation']) or row['prediction'] not in ['A','B','C','D']:
        pred, expl = extract_prediction_and_explanation(row['raw_output'])

        if pd.isna(row['prediction']) or row['prediction'] not in ['A','B','C','D']:
            if pred:
                df_merged.at[idx, 'prediction'] = pred
                fixed_pred += 1

        if pd.isna(row['explanation']) or row['explanation'].strip() == "":
            if expl:
                df_merged.at[idx, 'explanation'] = expl
                fixed_expl += 1

logger.info(f"Fixed {fixed_pred} predictions and {fixed_expl} explanations.")


2025-06-05 00:19:10,984 - INFO - <module>:41 - Fixed 0 predictions and 0 explanations.


In [64]:
df = pd.read_csv('results_compelete_gpt_4o_mini_arc_challenge.csv')
df.info()

FileNotFoundError: [Errno 2] No such file or directory: 'results_compelete_gpt_4o_mini_arc_challenge.csv'