In [None]:
import json
import pandas as pd
import os
import time
from datetime import datetime
import numpy as np
import re
import requests
from google.colab import drive
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

class LlamaNvidiaEvaluator:
    """
    Multiple-choice evaluator for nvidia/Llama-3.1-Nemotron-70B-Instruct-HF,
    queried through Together's /v1/chat/completions endpoint.

    Typical use: construct with a questions JSON file, call
    ``evaluate_model(api_key)``; per-question records accumulate in
    ``self.results`` and are written to disk by ``save_results``.
    """

    # Single source of truth for the model id (request payload + result records).
    MODEL_NAME = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"

    # Column order of every result record; also lets save_results build a
    # well-formed (empty) DataFrame when no question was evaluated.
    RESULT_COLUMNS = (
        'question_id', 'model', 'question_input', 'model_raw_output', 'answer',
        'correct_answer', 'correct', 'time', 'tokens', 'error',
    )

    def __init__(self, questions_file: str, max_consecutive_errors: int = 5):
        """
        Args:
            questions_file: Path to a JSON file holding the list of questions.
            max_consecutive_errors: In test runs, abort after this many
                consecutive failed/invalid answers.
        """
        self.max_consecutive_errors = max_consecutive_errors
        self.load_questions(questions_file)
        self.results = []
        # We'll target the chat completions endpoint on Together
        self.api_endpoint = "https://api.together.xyz/v1/chat/completions"

    def load_questions(self, file_path: str):
        """Load the questions JSON from ``file_path`` into ``self.questions``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            self.questions = json.load(f)
        print(f"Loaded {len(self.questions)} questions")

    def format_messages(self, question):
        """
        Build the system + user messages array, instructing the model
        to respond with exactly one multiple-choice letter (A/B/C/D).

        Args:
            question: Mapping with the keys 'question', 'option_a',
                'option_b', 'option_c' and 'option_d'.

        Returns:
            A two-element list of chat messages (system, then user).
        """
        system_prompt = (
            "You are a precise question-answering system. "
            "You must respond with only one letter: A, B, C, or D. No other text is allowed."
        )
        user_prompt = (
            f"Question: {question['question']}\n\n"
            f"A) {question['option_a']}\n"
            f"B) {question['option_b']}\n"
            f"C) {question['option_c']}\n"
            f"D) {question['option_d']}\n\n"
            "Remember: Respond with ONLY ONE LETTER (A, B, C, or D). No other text."
        )
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

    def extract_answer(self, raw_answer):
        """
        Locate the chosen letter (A, B, C, or D) in the model's output.

        Rules are tried from most to least explicit. Lowercase letters are
        only accepted when the whole reply is the letter itself; everywhere
        else only uppercase counts, so ordinary words such as the article
        "a" or the "d" in "don't" are never mistaken for an answer.

        Args:
            raw_answer: Text returned by the model (may be empty or None).

        Returns:
            'A', 'B', 'C' or 'D', or None when no unambiguous letter is found.
        """
        if not raw_answer:
            return None
        text = raw_answer.strip()

        # 1. The whole reply is the letter, optionally wrapped in punctuation
        #    or markdown: "b", "(C)", "**D**."
        match = re.fullmatch(r"[\W_]*([A-Da-d])[\W_]*", text)
        if match:
            return match.group(1).upper()

        # 2. Explicit statement: "The answer is B", "Answer: (C)".
        match = re.search(
            r"(?i:answer)\b[^A-Za-z]*(?:(?i:is)\b[^A-Za-z]*)?([A-D])\b", text
        )
        if match:
            return match.group(1)

        # 3. Reply starts with an option label: "B) Paris", "(C) ...", "D. ...".
        #    The lookahead rejects abbreviations such as "D.C.".
        match = re.match(r"\(?([A-D])\s*[).:](?=\s|$)", text)
        if match:
            return match.group(1)

        # 4. Exactly one distinct standalone uppercase letter anywhere in the text.
        standalone = set(re.findall(r"\b[A-D]\b", text))
        if len(standalone) == 1:
            return standalone.pop()

        return None

    # reraise=True surfaces the underlying exception once retries are exhausted,
    # so recorded error messages show the real cause instead of a RetryError wrapper.
    @retry(stop=stop_after_attempt(5),
           wait=wait_exponential(multiplier=2, min=4, max=60),
           retry=retry_if_exception_type((requests.exceptions.RequestException, ValueError, KeyError)),
           reraise=True)
    def get_model_response(self, api_key: str, messages: list):
        """
        Send one chat completion request to Together's endpoint and return
        the text of the first choice.

        The call is retried (up to 5 attempts, exponential back-off) on
        request errors, ValueError and KeyError.

        Args:
            api_key: Together API key, sent as a Bearer token.
            messages: Chat messages as produced by ``format_messages``.

        Returns:
            The content string of the first choice.

        Raises:
            requests.exceptions.RequestException: Network failure or 4xx/5xx status.
            ValueError: The JSON body carries an "error" key or has no choices.
            KeyError: The first choice lacks the expected message/content keys.
        """
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.MODEL_NAME,
            "messages": messages,
            "max_tokens": 50,
            "temperature": 0.7,
            "top_p": 0.7,
            "top_k": 50,
            "repetition_penalty": 1,
            "stop": ["<|eot_id|>", "<|eom_id|>"],  # from your snippet
            "stream": False
        }

        response = requests.post(
            self.api_endpoint,
            headers=headers,
            json=payload,
            timeout=30
        )
        response.raise_for_status()  # Raise for 4xx/5xx
        data = response.json()

        # If the JSON includes an error key, raise ValueError so tenacity can retry
        if "error" in data:
            raise ValueError(f"API Error: {data['error']}")

        # Typically returns: {"choices": [{ "message": {"content": "..."} }]}
        if "choices" in data and len(data["choices"]) > 0:
            return data["choices"][0]["message"]["content"]
        raise ValueError(f"Unexpected response format: {data}")

    def _query_with_fallback(self, api_key: str, messages: list):
        """Call get_model_response; on failure wait 10 s and try one more round."""
        try:
            return self.get_model_response(api_key, messages).strip()
        except Exception as e:
            # If initial attempt fails, wait and retry
            print(f"Initial attempt failed: {str(e)}")
            print("Retrying with longer wait...")
            time.sleep(10)
            return self.get_model_response(api_key, messages).strip()

    def _build_result(self, question, raw_answer=None, answer=None, elapsed=None, error=None):
        """Assemble one result record; ``answer`` is a valid letter or None."""
        return {
            'question_id': question['id'],
            'model': self.MODEL_NAME,
            'question_input': question['question'],
            'model_raw_output': raw_answer,
            'answer': answer,
            'correct_answer': question['correct_answer'],
            'correct': answer is not None and answer == question['correct_answer'],
            'time': elapsed,
            'tokens': None,  # Typically not in the Together chat response
            'error': error
        }

    def evaluate_model(self, api_key: str, test_run: bool = False):
        """
        Iterate through the questions, query the model, extract the letter
        (A/B/C/D), check correctness, and record one result per question.

        Args:
            api_key: Together API key.
            test_run: When True, only the first 5 questions are evaluated and
                the run aborts after ``max_consecutive_errors`` consecutive
                failures; a full run never aborts early.

        Returns:
            The metrics dictionary produced by ``save_results``.
        """
        consecutive_errors = 0
        questions_to_evaluate = self.questions[:5] if test_run else self.questions
        total = len(questions_to_evaluate)

        for i, question in enumerate(questions_to_evaluate):
            print(f"\nProcessing question {i+1}/{total}")
            try:
                start_time = time.time()
                messages = self.format_messages(question)
                raw_answer = self._query_with_fallback(api_key, messages)
                elapsed = time.time() - start_time

                model_answer = self.extract_answer(raw_answer)
                if model_answer is None:
                    print(f"Warning: Could not extract letter from: {raw_answer}")

                self.results.append(self._build_result(
                    question,
                    raw_answer=raw_answer,
                    answer=model_answer,
                    elapsed=elapsed,
                    error=None if model_answer else f"Invalid format: {raw_answer}",
                ))

                # Reset or increment error counters based on whether the answer was valid
                consecutive_errors = 0 if model_answer else consecutive_errors + 1

                print(f"Raw answer: {raw_answer}")
                print(f"Extracted answer: {model_answer or 'invalid'}")
                print(f"Correct answer: {question['correct_answer']}")
                print(f"Time: {elapsed:.2f}s")
                print("-" * 40)

                # Delay to avoid rapid-fire calls (between 2 s and 10 s)
                time.sleep(max(2, min(10, elapsed * 2)))

            except Exception as e:
                error_msg = str(e)
                print(f"Error on question {i+1}: {error_msg}")

                self.results.append(self._build_result(question, error=error_msg))
                consecutive_errors += 1
                time.sleep(10)

            # If we're in test mode and have too many consecutive errors, stop
            if test_run and consecutive_errors >= self.max_consecutive_errors:
                print(f"\nAborting test run: {consecutive_errors} consecutive errors")
                break

        return self.save_results()

    def save_results(self, output_dir: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/evaluations'):
        """
        Save the results to CSV + JSON in ``output_dir``, then print summary metrics.

        Args:
            output_dir: Destination directory; created if missing.

        Returns:
            Metrics dictionary. 'accuracy' is computed over all recorded
            questions; 'accuracy_valid' only over those with a valid letter.
        """
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Explicit columns keep the frame usable even when no result was recorded.
        df = pd.DataFrame(self.results, columns=list(self.RESULT_COLUMNS))
        valid = df[df['answer'].notna()]

        metrics = {
            'model': self.MODEL_NAME,
            'total_questions': len(df),
            'completed': df['error'].isna().sum(),
            'errors': df['error'].notna().sum(),
            'correct': df['correct'].sum(),
            'accuracy': df['correct'].mean() if len(df) else 0,
            'accuracy_valid': valid['correct'].mean() if len(valid) else 0,
            'avg_time': df['time'].mean() if df['time'].notna().any() else 0,
            'invalid_formats': len(df) - len(valid),
            'valid_responses': len(valid)
        }

        # Convert NumPy data types for JSON serialization
        for k, v in metrics.items():
            if isinstance(v, (np.integer, np.floating)):
                metrics[k] = v.item()

        base_path = f"{output_dir}/eval_NvidiaLlama_70B_{timestamp}"
        df.to_csv(f"{base_path}_results.csv", index=False)
        with open(f"{base_path}_summary.json", 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)

        # Print summary
        print("\nEvaluation Summary:")
        print(f"Model: {metrics['model']}")
        print(f"Total Questions: {metrics['total_questions']}")
        print(f"Valid Responses: {metrics['valid_responses']}")
        print(f"Invalid Formats: {metrics['invalid_formats']}")
        print(f"Correct Answers: {metrics['correct']}")
        print(f"Accuracy (overall): {metrics['accuracy']:.2%}")
        print(f"Accuracy (of valid): {metrics['accuracy_valid']:.2%}")
        print(f"Avg Time: {metrics['avg_time']:.2f}s")
        return metrics

# -- Install dependencies
# NOTE(review): `!pip` is IPython/Colab shell syntax, not plain Python. This line
# sits below the `import requests` / `from tenacity import ...` statements at the
# top of the cell, so it cannot help those imports on a runtime where the packages
# are missing; consider running the install in an earlier cell.
!pip install -q requests tenacity

def run_test_evaluation(
    api_key: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """
    Evaluate the first 5 questions to check correctness and formatting.

    Args:
        api_key: Together API key passed through to the evaluator.
        questions_file: Path to the questions JSON; defaults to the
            structured_questions.json file on the mounted Google Drive.

    Returns:
        Whatever ``LlamaNvidiaEvaluator.evaluate_model`` returns for a test run.
    """
    evaluator = LlamaNvidiaEvaluator(questions_file)
    return evaluator.evaluate_model(api_key, test_run=True)

def run_full_evaluation(
    api_key: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """
    Evaluate all questions in the questions JSON file.

    Args:
        api_key: Together API key passed through to the evaluator.
        questions_file: Path to the questions JSON; defaults to the
            structured_questions.json file on the mounted Google Drive.

    Returns:
        Whatever ``LlamaNvidiaEvaluator.evaluate_model`` returns for a full run.
    """
    evaluator = LlamaNvidiaEvaluator(questions_file)
    return evaluator.evaluate_model(api_key)

# -- Mount Google Drive
drive.mount('/content/drive')

# -- Prompt for the Together API key
# getpass hides the typed key, so the secret is not echoed into the cell output
# (and therefore not saved with the notebook), unlike input().
from getpass import getpass

print("Get your Together API key from: https://api.together.xyz/settings/api-keys")
api_key = getpass("Enter your Together API key: ").strip()
if not api_key:
    # Fail fast: an empty key would only produce a long series of retried auth errors.
    raise ValueError("No Together API key was entered.")

# -- Test run with 5 questions
print("\nRunning test evaluation with 5 questions...")
test_metrics = run_test_evaluation(api_key)

# -- Optionally do full evaluation
if input("\nContinue with full evaluation? (y/n): ").strip().lower() in ('y', 'yes'):
    print("\nRunning full evaluation...")
    full_metrics = run_full_evaluation(api_key)


Mounted at /content/drive
Get your Together API key from: https://api.together.xyz/settings/api-keys
Enter your Together API key: [REDACTED]

Running test evaluation with 5 questions...
Loaded 174 questions

Processing question 1/5
Raw answer: B
Extracted answer: B
Correct answer: B
Time: 0.44s
----------------------------------------

Processing question 2/5
Raw answer: C
Extracted answer: C
Correct answer: C
Time: 0.33s
----------------------------------------

Processing question 3/5
Raw answer: D
Extracted answer: D
Correct answer: A
Time: 0.77s
----------------------------------------

Processing question 4/5
Raw answer: B
Extracted answer: B
Correct answer: B
Time: 0.36s
----------------------------------------

Processing question 5/5
Raw answer: C
Extracted answer: C
Correct answer: C
Time: 0.23s
----------------------------------------

Evaluation Summary:
Model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
Total Questions: 5
