# Environment Setup

In [1]:
!pip install rdflib
!pip install openai

Collecting rdflib
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/564.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/564.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m563.2/564.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib
Successfully installed rdflib-7.1.3


In [2]:
pip install transformers accelerate huggingface_hub

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [3]:
!pip install requests



In [4]:
import rdflib
import json
import pandas as pd
import numpy as np
from scipy.stats import zscore
from openai import OpenAI
import os

# Load Models

In [6]:
from google.colab import userdata
# API credentials are looked up by name through google.colab's userdata helper
# instead of being written into the notebook.
API_KEY = userdata.get('OPENAI_API_KEY')
HUGGIN_API_KEY = userdata.get('HF_TOKEN')
DEEPSEEK_API_KEY = userdata.get('DEEPSEEK_API_KEY')

# Shared OpenAI client used by call_openai_model.
client = OpenAI(api_key=API_KEY)

# Hugging Face hosted-inference endpoint queried by call_falcon_api.
FALCON_API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"

# Hugging Face hosted-inference endpoint queried by call_mistral_api.
MISTRAL_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import requests

### MULTI-MODEL LLM QUERY ###
def call_openai_model(query, model="gpt-4o"):
    """
    Sends `query` as a single user turn to an OpenAI chat-completions model.

    Uses the module-level `client`. The request always carries the same fixed
    system message, max_tokens=700 and temperature=0.7.

    Returns:
        The text content of the first choice, or a string of the form
        "Error calling OpenAI (<model>): <reason>" if anything raises.
    """
    system_turn = {"role": "system", "content": "You analyze EEG, HRV, and Pose data."}
    user_turn = {"role": "user", "content": query}
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[system_turn, user_turn],
            max_tokens=700,
            temperature=0.7,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        # Best-effort: failures are reported as text so a multi-model run keeps going.
        return f"Error calling OpenAI ({model}): {str(err)}"

### LLM Query via Hugging Face ###
def call_hf_model(query, model_name):
    """
    Generates text for `query` with a Hugging Face causal LM via a transformers pipeline.

    The tokenizer and model named by `model_name` are loaded on every call.
    Sampling uses temperature=0.7 with up to 700 new tokens.

    Returns:
        The "generated_text" of the first pipeline result, or a string of the
        form "Error calling <model_name>: <reason>" if anything raises.
    """
    try:
        tok = AutoTokenizer.from_pretrained(model_name)
        lm = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
        text_gen = pipeline("text-generation", model=lm, tokenizer=tok)

        outputs = text_gen(query, max_new_tokens=700, temperature=0.7, do_sample=True)
        first_output = outputs[0]
        return first_output["generated_text"]
    except Exception as err:
        return f"Error calling {model_name}: {str(err)}"

def call_mistral_api(query, timeout=120):
    """
    Calls Hugging Face's hosted Mistral endpoint (MISTRAL_API_URL) with `query`.

    The prompt is trimmed to at most MAX_TOKENS - HEADROOM whitespace-separated
    words before sending; splitting on whitespace is only a rough stand-in for
    a real token count.

    Args:
        query: Prompt text sent as the "inputs" field.
        timeout: Seconds to wait for the HTTP request (passed to requests.post).
            Without a timeout a stalled endpoint would block the notebook
            indefinitely.

    Returns:
        The "generated_text" of the first item when the API answers with a
        list, the decoded JSON itself for any other non-error shape, or an
        error string ("Error from API: ..." / "Error calling Mistral API: ...").
    """
    MAX_TOKENS = 32000  # Slightly below Mistral's max limit
    HEADROOM = 500  # Leave space for the generated response

    headers = {"Authorization": f"Bearer {HUGGIN_API_KEY}"}

    # Trim the query if it is too long (rough count: whitespace-separated words)
    word_budget = MAX_TOKENS - HEADROOM
    words = query.split()
    if len(words) > word_budget:
        query = " ".join(words[:word_budget])

    payload = {
        "inputs": query,  # Same prompt as GPT-4o
        "parameters": {
            "max_new_tokens": 300,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True
        }
    }

    try:
        response = requests.post(MISTRAL_API_URL, headers=headers, json=payload, timeout=timeout)
        response_json = response.json()

        if isinstance(response_json, dict) and "error" in response_json:
            return f"Error from API: {response_json['error']}"

        return response_json[0]["generated_text"] if isinstance(response_json, list) else response_json
    except Exception as e:
        # Includes timeouts and non-JSON bodies; reported as text so other models still run.
        return f"Error calling Mistral API: {str(e)}"

def call_falcon_api(query, timeout=120):
    """
    Calls Hugging Face's hosted Falcon-7B endpoint (FALCON_API_URL) with `query`.

    The prompt is cut to its first 8192 - 300 characters before sending. The
    budget is counted in characters, not tokens, and anything past it
    (i.e. the end of the prompt) is dropped.

    Args:
        query: Prompt text sent as the "inputs" field.
        timeout: Seconds to wait for the HTTP request (passed to requests.post).
            Without a timeout a stalled endpoint would block the notebook
            indefinitely.

    Returns:
        The "generated_text" of the first item when the API answers with a
        list, the decoded JSON itself for any other non-error shape, or an
        error string ("Error from API: ..." / "Error calling Falcon API: ...").
    """
    headers = {"Authorization": f"Bearer {HUGGIN_API_KEY}"}

    # Simple character-based truncation; 300 is reserved for the generated output
    max_input_chars = 8192 - 300
    truncated_query = query[:max_input_chars]

    payload = {
        "inputs": truncated_query,
        "parameters": {
            "max_new_tokens": 300,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True
        }
    }

    try:
        response = requests.post(FALCON_API_URL, headers=headers, json=payload, timeout=timeout)
        response_json = response.json()

        if isinstance(response_json, dict) and "error" in response_json:
            return f"Error from API: {response_json['error']}"

        return response_json[0]["generated_text"] if isinstance(response_json, list) else response_json
    except Exception as e:
        # Includes timeouts and non-JSON bodies; reported as text so other models still run.
        return f"Error calling Falcon API: {str(e)}"

def call_deepseek(query, model="deepseek-chat", timeout=120):
    """
    Calls the DeepSeek chat-completions HTTP API with a given query.

    Args:
        query: User message content.
        model: DeepSeek model name (deepseek-chat or deepseek-reasoner).
        timeout: Seconds to wait for the HTTP request (passed to requests.post).
            Without a timeout a stalled endpoint would block the notebook
            indefinitely.

    Returns:
        The content of the first choice's message, or an error string
        ("API Key not found! ...", "API Error: ..." or "Request failed: ...").
    """
    API_KEY = DEEPSEEK_API_KEY
    if not API_KEY:
        return "API Key not found! Set DEEPSEEK_API_KEY environment variable."

    url = "https://api.deepseek.com/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }
    payload = {
        "model": model,  # deepseek-chat or deepseek-reasoner
        "messages": [
            {"role": "system", "content": "You analyze EEG, HRV, and Pose data."},
            {"role": "user", "content": query}
        ],
        "max_tokens": 700,
        "temperature": 0.7,
        "stream": False
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response_json = response.json()

        if "error" in response_json:
            return f"API Error: {response_json['error']}"

        return response_json["choices"][0]["message"]["content"]
    except Exception as e:
        # Includes timeouts and non-JSON bodies; reported as text so other models still run.
        return f"Request failed: {str(e)}"

def compare_llms(query, models=["gpt-4o", "mistral", "falcon", "deepseek"]):
    """
    Sends one query to each requested LLM backend and collects the answers.

    Args:
        query: Prompt text handed unchanged to every backend.
        models: Backend names to run; recognised names are "gpt-4o",
            "mistral", "falcon" and "deepseek".

    Returns:
        dict mapping each requested name to that backend's return value, or to
        "Model <name> not supported." for unrecognised names.
    """
    # Lambdas keep the backend lookup lazy: a helper is only touched when its model is requested.
    backends = {
        "gpt-4o": lambda prompt: call_openai_model(prompt, model="gpt-4o"),
        "mistral": lambda prompt: call_mistral_api(prompt),
        "falcon": lambda prompt: call_falcon_api(prompt),
        "deepseek": lambda prompt: call_deepseek(prompt),
    }

    results = {}
    for name in models:
        backend = backends.get(name)
        if backend is None:
            results[name] = f"Model {name} not supported."
        else:
            results[name] = backend(query)
    return results

# Load Ontology

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
def load_ontology(file_path="/content/drive/MyDrive/NeuroSyncAI/ontology.ttl"):
    """Parses the Turtle file at `file_path` into a fresh rdflib Graph and returns that graph."""
    graph = rdflib.Graph()
    graph.parse(file_path, format="turtle")
    return graph

def convert_to_jsonld(ontology_graph):
    """
    Serialises the graph's triples as a JSON array of subject/predicate/object records.

    Each term is shortened to the text after its last "/". Despite the
    function name, the output is a plain JSON list, not a JSON-LD document.

    Returns:
        A JSON string indented by 4 spaces.
    """
    def last_segment(term):
        # Everything after the final "/" (the whole term when there is none).
        return term.rsplit("/", 1)[-1]

    records = []
    for subj, pred, obj in ontology_graph:
        records.append({
            "subject": last_segment(subj),
            "predicate": last_segment(pred),
            "object": last_segment(obj),
        })
    return json.dumps(records, indent=4)

# Parse the ontology once when this cell runs; ontology_json is later embedded in the prompt built by analyze_data.
ontology_graph = load_ontology()
ontology_json = convert_to_jsonld(ontology_graph)

# Anomalies Detection

In [11]:
### EEG, HRV, and POSE ANOMALY DETECTION ###
def detect_eeg_anomalies(eeg_df):
    """
    Flags EEG samples lying more than 3 standard deviations from their channel mean.

    Every numeric column other than 'Timestamp' is treated as a channel.
    Rows are matched to timestamps by position, so the frame's index labels
    do not matter (a duplicated or non-default index is fine).

    Args:
        eeg_df: DataFrame with a 'Timestamp' column and numeric channel columns.

    Returns:
        A list of (label, timestamp, timestamp) tuples, ordered by channel and
        then by row. Timestamps are native Python scalars so the result can be
        passed to json.dumps. When nothing is flagged, the single placeholder
        tuple ("No EEG Anomalies Detected", "N/A", "N/A") is returned.

    Raises:
        ValueError: if 'Timestamp' is not a column of `eeg_df`.
    """
    if 'Timestamp' not in eeg_df.columns:
        raise ValueError("Expected 'Timestamp' column in EEG data.")

    numeric_columns = eeg_df.select_dtypes(include=['number']).columns.difference(['Timestamp'])

    # .tolist() converts NumPy scalars to Python ones; np.int64 is not JSON serialisable.
    timestamps = eeg_df['Timestamp'].tolist()

    anomalies = []
    for channel in numeric_columns:
        signal = eeg_df[channel]
        mean_val = signal.mean()
        std_val = signal.std()
        threshold_high = mean_val + 3 * std_val  # Upper threshold (outliers)
        threshold_low = mean_val - 3 * std_val  # Lower threshold (outliers)

        # Missing values compare as "not an outlier".
        is_peak = (signal > threshold_high).fillna(False).to_numpy(dtype=bool)
        is_dip = (signal < threshold_low).fillna(False).to_numpy(dtype=bool)

        for pos in np.flatnonzero(is_peak | is_dip):
            kind = "Peak" if is_peak[pos] else "Dip"
            timestamp = timestamps[pos]
            anomalies.append((f"EEG {channel} - {kind} Detected", timestamp, timestamp))

    return anomalies if anomalies else [("No EEG Anomalies Detected", "N/A", "N/A")]

def detect_hrv_anomalies(hrv_df):
    """
    Flags extreme HRV values and abrupt sample-to-sample jumps.

    For every numeric column other than 'Timestamp':
      * values beyond mean +/- 2.5 standard deviations are flagged as
        unusually high / unusually low;
      * a row whose absolute change from the previous row exceeds
        2 standard deviations is flagged as a sudden fluctuation.

    The previous row is taken by position (Series.diff), so the frame does
    not need a 0..n-1 index.

    Args:
        hrv_df: DataFrame with a 'Timestamp' column and numeric HRV columns.

    Returns:
        A list of (label, timestamp, timestamp) tuples, ordered by column and
        then by row; for one row the high/low entry precedes the fluctuation
        entry. Timestamps are native Python scalars so the result can be
        passed to json.dumps. When nothing is flagged, the single placeholder
        tuple ("No HRV Anomalies Detected", "N/A", "N/A") is returned.

    Raises:
        ValueError: if 'Timestamp' is not a column of `hrv_df`.
    """
    if 'Timestamp' not in hrv_df.columns:
        raise ValueError("Expected 'Timestamp' column in HRV data.")

    numeric_columns = hrv_df.select_dtypes(include=['number']).columns.difference(['Timestamp'])

    # .tolist() converts NumPy scalars to Python ones; np.int64 is not JSON serialisable.
    timestamps = hrv_df['Timestamp'].tolist()

    anomalies = []
    for channel in numeric_columns:
        series = hrv_df[channel]
        mean_val = series.mean()
        std_val = series.std()
        threshold_high = mean_val + 2.5 * std_val  # Relaxation (parasympathetic dominance)
        threshold_low = mean_val - 2.5 * std_val  # Stress (sympathetic dominance)

        # Missing values (including the first row's diff) compare as "not anomalous".
        is_high = (series > threshold_high).fillna(False).to_numpy(dtype=bool)
        is_low = (series < threshold_low).fillna(False).to_numpy(dtype=bool)
        is_jump = (series.diff().abs() > std_val * 2).fillna(False).to_numpy(dtype=bool)

        for pos in np.flatnonzero(is_high | is_low | is_jump):
            timestamp = timestamps[pos]

            if is_high[pos]:
                anomalies.append((f"HRV {channel} - Unusually High HRV (Relaxation)", timestamp, timestamp))
            elif is_low[pos]:
                anomalies.append((f"HRV {channel} - Unusually Low HRV (Stress)", timestamp, timestamp))

            # Instability (fluctuation) is reported in addition to any high/low flag
            if is_jump[pos]:
                anomalies.append((f"HRV {channel} - Sudden Fluctuation Detected", timestamp, timestamp))

    return anomalies if anomalies else [("No HRV Anomalies Detected", "N/A", "N/A")]

def detect_pose_anomalies_windowed(pose_data, window_size=3):
    """
    Detects gait and movement anomalies window by window.

    Rows are bucketed by `Timestamp // window_size`. Within each bucket up to
    four checks run, each only if both of its joint columns are present:
      * stride:      mean |Left Hip - Right Hip| < 10
      * arm swing:   mean |Left Shoulder - Right Shoulder| > 15
      * instability: mean absolute step-to-step change of the averaged ankle
                     signal > 5 (needs at least two rows in the window)
      * step phase:  mean |Left Knee - Right Knee| > pi/6

    Missing values are replaced with 0 on a copy; the caller's frame is not
    modified.

    Args:
        pose_data: DataFrame with a numeric 'Timestamp' column and joint columns.
        window_size: Width of a window, in the same unit as 'Timestamp'.

    Returns:
        A list of (label, t_start, t_end) tuples where t_start/t_end are the
        first and last timestamp of the window, as native Python scalars so
        the result can be passed to json.dumps. If 'Timestamp' is missing a
        single "Invalid Data" placeholder tuple is returned; if nothing is
        flagged, a single "No Significant Pose Anomalies Detected" tuple.
    """

    if 'Timestamp' not in pose_data.columns:
        return [("Invalid Data: No Timestamp Found", "N/A", "N/A")]

    pose_data = pose_data.fillna(0)  # Handle missing values (returns a new frame)

    # Segment rows into windows
    pose_data["Window"] = (pose_data["Timestamp"] // window_size).astype(int)

    anomalies = []

    # Key joints for analysis
    key_joints = {
        "stride": ["Left Hip", "Right Hip"],
        "arm_swing": ["Left Shoulder", "Right Shoulder"],
        "instability": ["Left Ankle", "Right Ankle"],
        "step_phase": ["Left Knee", "Right Knee"],
    }

    def _native(value):
        # NumPy scalars expose .item(); np.int64 would otherwise break json.dumps.
        return value.item() if hasattr(value, "item") else value

    # Process each window separately
    for _, group in pose_data.groupby("Window"):
        first_ts, last_ts = group["Timestamp"].iloc[0], group["Timestamp"].iloc[-1]
        span = f"{first_ts}-{last_ts}s"  # formatted from the raw values to keep label text unchanged
        t_start, t_end = _native(first_ts), _native(last_ts)

        # 1. Stride length variability
        if all(j in group.columns for j in key_joints["stride"]):
            stride_diff = np.abs(group["Left Hip"] - group["Right Hip"])
            if stride_diff.mean() < 10:  # Shortened stride
                anomalies.append((f"{span} - Reduced stride length detected", t_start, t_end))

        # 2. Arm swing asymmetry
        if all(j in group.columns for j in key_joints["arm_swing"]):
            arm_diff = np.abs(group["Left Shoulder"] - group["Right Shoulder"])
            if arm_diff.mean() > 15:  # Uneven arm swings
                anomalies.append((f"{span} - Arm swing asymmetry detected", t_start, t_end))

        # 3. Instability detection (jitter in ankles)
        # A one-row window has no step-to-step change; skipping it avoids
        # NumPy's "Mean of empty slice" RuntimeWarning (the result was NaN anyway).
        if len(group) > 1 and all(j in group.columns for j in key_joints["instability"]):
            ankle_movement = ((group["Left Ankle"] + group["Right Ankle"]) / 2).to_numpy()
            instability_score = np.abs(np.diff(ankle_movement)).mean()
            if instability_score > 5:  # Detect erratic movement
                anomalies.append((f"{span} - Instability detected", t_start, t_end))

        # 4. Step phase shift detection
        if all(j in group.columns for j in key_joints["step_phase"]):
            knee_phase_diff = np.abs(group["Left Knee"] - group["Right Knee"])
            if knee_phase_diff.mean() > np.pi/6:  # Delayed stepping phase shift
                anomalies.append((f"{span} - Step phase shift detected", t_start, t_end))

    return anomalies if anomalies else [("No Significant Pose Anomalies Detected", "N/A", "N/A")]


### STRUCTURED DATA PREPARATION ###
def generate_anomaly_summary(eeg_data, hrv_data, pose_data):
    """
    Runs the three anomaly detectors and packs their results into one JSON string.

    An empty DataFrame skips its detector and contributes an empty list.
    Pose detection uses 3-unit windows.

    Returns:
        JSON text (indent=4) shaped as
        {"EEG": {"EEG Anomalies": [...]}, "HRV": {"HRV Anomalies": [...]},
         "Pose": {"Pose Anomalies": [...]}}.
    """
    eeg_found = [] if eeg_data.empty else detect_eeg_anomalies(eeg_data)
    hrv_found = [] if hrv_data.empty else detect_hrv_anomalies(hrv_data)
    pose_found = [] if pose_data.empty else detect_pose_anomalies_windowed(pose_data, 3)

    summary = {}
    summary["EEG"] = {"EEG Anomalies": eeg_found}
    summary["HRV"] = {"HRV Anomalies": hrv_found}
    summary["Pose"] = {"Pose Anomalies": pose_found}
    return json.dumps(summary, indent=4)

# Data Analysis Setup

In [12]:
def analyze_data(eeg_data, hrv_data, pose_data, user_query, models=["gpt-4o", "deepseek", "mistral", "falcon"]):
    """
    Builds one prompt from the ontology, the detected anomalies and the user's
    question, then sends it to every requested model.

    All three detectors always run (pose with 3-unit windows); their results
    are embedded as indented JSON. The module-level `ontology_json` is
    embedded verbatim.

    Returns:
        Whatever compare_llms returns for the assembled prompt and `models`.
    """
    detected = {
        "EEG": detect_eeg_anomalies(eeg_data),
        "HRV": detect_hrv_anomalies(hrv_data),
        "Pose": detect_pose_anomalies_windowed(pose_data, 3),
    }
    anomaly_summary = json.dumps(detected, indent=4)

    prompt_text = f"""
    You are an AI analyzing EEG, HRV, and Pose data.

    **Ontology Information**:
    {ontology_json}

    **Data Summary**:
    {anomaly_summary}

    **User Query**:
    {user_query}
    """

    llm_outputs = compare_llms(prompt_text, models=models)
    return llm_outputs

In [13]:
# Recording files live under /content/drive, i.e. they need the Google Drive mount from the earlier cell.
eeg_file_path = "/content/drive/MyDrive/NeuroSyncAI/eeg_data_Motor Task_MCI_20250307_221718.csv"
hrv_file_path = "/content/drive/MyDrive/NeuroSyncAI/hrv_data_Motor Task_MCI_20250307_221718.csv"
pose_file_path = "/content/drive/MyDrive/NeuroSyncAI/walking_pose_Motor Task_MCI_trial1_20250307_221718_590611.csv"

# Each CSV is expected to carry a 'Timestamp' column (the EEG/HRV detectors raise without it).
eeg_data = pd.read_csv(eeg_file_path)
hrv_data = pd.read_csv(hrv_file_path)
pose_data = pd.read_csv(pose_file_path)

# Ask every default model the same question; the result maps model name -> response text.
query = "What is the most significant anomaly in EEG, HRV, and Pose correlation, and what are its possible causes?"
llm_comparisons = analyze_data(eeg_data, hrv_data, pose_data, query)

  instability_score = np.abs(np.diff(ankle_movement)).mean()
  ret = ret.dtype.type(ret / rcount)


# Model Comparison

In [None]:
# Print each model's answer under its own header (error strings from failed calls show up here too).
for model, response in llm_comparisons.items():
    print(f"\n--- {model} Response ---\n{response}\n")


--- gpt-4o Response ---
To identify the most significant anomaly in the correlation between EEG, HRV, and Pose data, we need to look at patterns and events that are notable across these different modalities and consider the ontology information provided.

### Analysis:

1. **EEG Data:**
   - There are numerous peaks and dips across different EEG channels, indicating fluctuations in brainwave activity.
   - Notable patterns include frequent peaks in channels like AF3, AF4, C3, F3, and F8. Peaks in EEG data can often indicate bursts of neural activity, which depending on the frequency band, might correlate with states of stress, focus, or relaxation.

2. **HRV Data:**
   - The HRV data shows both "Unusually Low HRV (Stress)" and "Unusually High HRV (Relaxation)" events, as well as several "Sudden Fluctuation Detected" events.
   - Low HRV is generally associated with stress, while high HRV is linked with relaxation and parasympathetic dominance.

3. **Pose Data:**
   - The Pose data fre

# Model Scoring (LLM as a judge)

## GPT-4o as a Judge

In [None]:
import openai
import json
import pprint

# Re-create the OpenAI client for the judging section (rebinds the module-level `client`).
client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))  # key is fetched via userdata.get; nothing is hard-coded here

def judge_responses_with_gpt4o(llm_comparisons):
    """
    Uses GPT-4o to evaluate different model responses based on predefined criteria.

    Args:
        llm_comparisons: dict mapping model name -> response text; it is
            embedded in the prompt as indented JSON.

    Returns:
        The judge's reply parsed with json.loads (expected: one entry per
        model holding scores and a justification). If the API call or the
        JSON parsing fails, a string "Error calling GPT-4o: <reason>" is
        returned instead, so callers should check the type.

    Side effects:
        Prints the raw reply for troubleshooting.
    """
    import re

    evaluation_prompt = f"""
    Your task is to **score each model's response** based on the following criteria (out of 10):

    1. **Relevance** - How well does it address the key aspects of EEG, HRV, and Pose correlations?
    2. **Depth of Analysis** - Does it provide detailed, well-explained insights?
    3. **Clarity & Coherence** - Is the response structured and easy to understand?
    4. **Actionability** - Does it offer useful interpretations or next steps?
    5. **Overall Score** - A weighted combination of the above.

    Provide scores **in JSON format**, including a short justification for each score.

    ### Model Responses:
    {json.dumps(llm_comparisons, indent=2)}

    ### Expected JSON Output Format:
    {{
        "GPT-4o": {{
            "Relevance": X,
            "Depth of Analysis": X,
            "Clarity & Coherence": X,
            "Actionability": X,
            "Overall Score": X,
            "Justification": "..."
        }},
        "DeepSeek": {{ ... }},
        "Falcon": {{ ... }}
    }}
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert AI judge evaluating responses from different models analyzing EEG, HRV, and Pose data."},
                {"role": "user", "content": evaluation_prompt}
            ],
            max_tokens=700,
            temperature=0.7
        )

        # Extract the raw response text
        gpt4o_judgment = response.choices[0].message.content

        # DEBUG: Print raw response for troubleshooting
        print("Raw GPT-4o Response:\n", gpt4o_judgment)

        # Remove a surrounding Markdown fence (``` or ```json). str.strip("```json")
        # would strip a *set of characters*, and only worked when the reply began
        # exactly with the fence and had nothing after the closing one.
        payload = gpt4o_judgment.strip()
        payload = re.sub(r"^```(?:json)?\s*", "", payload, flags=re.IGNORECASE)
        payload = re.sub(r"\s*```$", "", payload)

        # Parse and return as JSON
        return json.loads(payload)

    except Exception as e:
        return f"Error calling GPT-4o: {str(e)}"

# Run the evaluation
# NOTE: the result is a dict on success but an "Error calling GPT-4o: ..." string on failure.
evaluation_results = judge_responses_with_gpt4o(llm_comparisons)

# Display the results
pprint.pprint(evaluation_results)

Raw GPT-4o Response:
 ```json
{
    "GPT-4o": {
        "Relevance": 9,
        "Depth of Analysis": 9,
        "Clarity & Coherence": 9,
        "Actionability": 9,
        "Overall Score": 9,
        "Justification": "The response from GPT-4o is highly relevant, addressing the key aspects of EEG, HRV, and Pose data correlations effectively. It provides a detailed analysis of the data, identifying significant anomalies and possible causes, which are well-supported by the ontology information. The structure is clear and easy to follow, and the response offers actionable insights into identifying and addressing cognitive stress and task-related issues. Overall, it presents a comprehensive and coherent analysis."
    },
    "DeepSeek": {
        "Relevance": 8,
        "Depth of Analysis": 8,
        "Clarity & Coherence": 7,
        "Actionability": 8,
        "Overall Score": 7.8,
        "Justification": "DeepSeek offers a relevant analysis focusing on EEG, HRV, and Pose data, identif

In [None]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [None]:
import pandas as pd
# `display` and `Markdown` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, Markdown

# judge_responses_with_gpt4o returns an error *string* on failure; only a dict
# of per-model scores can be turned into a table.
if isinstance(evaluation_results, dict):
    # Convert evaluation results into a DataFrame (one row per model)
    df = pd.DataFrame.from_dict(evaluation_results, orient="index")

    # Convert DataFrame to Markdown table format
    md_table = df.to_markdown()

    # Display the markdown table
    display(Markdown(md_table))
else:
    print(evaluation_results)

|          |   Relevance |   Depth of Analysis |   Clarity & Coherence |   Actionability |   Overall Score | Justification                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
|:---------|------------:|--------------------:|----------------------:|----------------:|----------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GPT-4o   |           9 |                   9 |                     9 |               9 |             9   | The response from GPT-4o is highly relevant, addressing the key aspects of EEG, HRV, and Pose data correlations effectively. It provides a detailed analysis of the data, identifying significant anomalies and possible causes, which are well-supported by the ontology information. The structure is clear and easy to follow, and the response offers actionable insights into identifying and addressing cognitive stress and task-related issues. Overall, it presents a comprehensive and coherent analysis. |
| DeepSeek |           8 |                   8 |                     7 |               8 |             7.8 | DeepSeek offers a relevant analysis focusing on EEG, HRV, and Pose data, identifying significant anomalies and potential causes. While the analysis is detailed, some parts are less structured, affecting clarity. The response is actionable, suggesting links between cognitive stress and physical irregularities, but it could benefit from more specific recommendations or next steps. Overall, a solid analysis but with room for improved clarity.                                                         |
| Falcon   |           5 |                   4 |                     3 |               4 |             4   | Falcon's response lacks relevance and depth due to its focus on ontology information without adequately linking it to the data analysis. The structure is disjointed and hard to follow, which hampers coherence and clarity. The actionability is limited as it does not provide clear interpretations or next steps based on the data. Overall, the response is insufficiently detailed and lacks practical insights.                                                                                             |

## Blind GPT-4o Judge

In [21]:
import random

# Shuffle and rename models as "Model A", "Model B", etc.
# No seed is set here, so the letter assignment can differ between runs.
model_names = list(llm_comparisons.keys())
random.shuffle(model_names)  # Shuffle to ensure no inherent bias
# chr(65 + i) yields "A", "B", "C", ... in shuffled order.
anonymized_mapping = {name: f"Model {chr(65 + i)}" for i, name in enumerate(model_names)}

# Apply new names to the responses
# (iterates llm_comparisons in its original order, so the dict is not sorted by letter)
anonymized_responses = {anonymized_mapping[key]: value for key, value in llm_comparisons.items()}

# Print mapping for reference (DO NOT include this when submitting to the LLM)
print("Anonymized Mapping (for reference only):", anonymized_mapping)


Anonymized Mapping (for reference only): {'deepseek': 'Model A', 'mistral': 'Model B', 'falcon': 'Model C', 'gpt-4o': 'Model D'}


In [22]:
import openai

def blind_judge_responses(anonymized_responses):
    """
    Uses GPT-4o to evaluate different model responses without knowing which model is which.

    Args:
        anonymized_responses: dict mapping an anonymous label (e.g. "Model A")
            to response text; it is embedded in the prompt as indented JSON.

    Returns:
        The judge's reply parsed with json.loads (expected: one entry per
        label holding scores and a justification). If the API call or the
        JSON parsing fails, a string "Error calling GPT-4o: <reason>" is
        returned instead, so callers should check the type.

    Side effects:
        Prints the raw reply for troubleshooting.
    """
    import re

    evaluation_prompt = f"""
    Your task is to **evaluate and score each model's response** based on the following criteria (out of 10):

    1. **Relevance** - How well does it address the key aspects of EEG, HRV, and Pose correlations?
    2. **Depth of Analysis** - Does it provide detailed, well-explained insights?
    3. **Clarity & Coherence** - Is the response structured and easy to understand?
    4. **Actionability** - Does it offer useful interpretations or next steps?
    5. **Overall Score** - A weighted combination of the above.

    Provide scores **in JSON format**, including a short justification for each score.

    ### Anonymized Model Responses:
    {json.dumps(anonymized_responses, indent=2)}

    ### Expected JSON Output Format:
    {{
        "Model A": {{
            "Relevance": X,
            "Depth of Analysis": X,
            "Clarity & Coherence": X,
            "Actionability": X,
            "Overall Score": X,
            "Justification": "..."
        }},
        "Model B": {{ ... }},
        "Model C": {{ ... }}
    }}
    """

    system_message = {
        "role": "system",
        "content": "You are an impartial AI judge evaluating anonymized model responses analyzing EEG, HRV, and Pose data."
    }

    user_message = {"role": "user", "content": evaluation_prompt}

    try:
        # Send the blind-judge messages built above; previously they were
        # constructed but a copy of the non-blind system prompt was sent instead.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[system_message, user_message],
            max_tokens=700,
            temperature=0.7
        )

        # Extract the raw response text
        blind_judgment = response.choices[0].message.content

        # DEBUG: Print raw response for troubleshooting
        print("Raw GPT-4o Response:\n", blind_judgment)

        # Remove a surrounding Markdown fence (``` or ```json). str.strip("```json")
        # would strip a *set of characters*, and only worked when the reply began
        # exactly with the fence and had nothing after the closing one.
        payload = blind_judgment.strip()
        payload = re.sub(r"^```(?:json)?\s*", "", payload, flags=re.IGNORECASE)
        payload = re.sub(r"\s*```$", "", payload)

        return json.loads(payload)

    except Exception as e:
        return f"Error calling GPT-4o: {str(e)}"

# Run the blind evaluation
# NOTE: the result is a dict on success but an "Error calling GPT-4o: ..." string on failure.
blind_evaluation_results = blind_judge_responses(anonymized_responses)

# Print results (keys are the anonymous labels; see anonymized_mapping to map them back)
import pprint
pprint.pprint(blind_evaluation_results)


Raw GPT-4o Response:
 ```json
{
    "Model A": {
        "Relevance": 9,
        "Depth of Analysis": 8,
        "Clarity & Coherence": 9,
        "Actionability": 8,
        "Overall Score": 8.6,
        "Justification": "Model A provides a comprehensive breakdown of anomalies in EEG, HRV, and Pose data, addressing key aspects of their correlations. It identifies specific EEG channels and wave patterns, discusses HRV fluctuations, and explains gait irregularities in Pose data. The response is well-structured and easy to understand. While it offers possible causes and correlations, the depth could be slightly improved by integrating more specific data points or examples. Overall, it provides actionable insights into stress and fatigue indicators."
    },
    "Model B": {
        "Relevance": 0,
        "Depth of Analysis": 0,
        "Clarity & Coherence": 0,
        "Actionability": 0,
        "Overall Score": 0,
        "Justification": "Model B did not provide a response due to an A

In [23]:
import pandas as pd
# `display` and `Markdown` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, Markdown

# blind_judge_responses returns an error *string* on failure; only a dict of
# per-label scores can be turned into a table.
if isinstance(blind_evaluation_results, dict):
    # Convert evaluation results into a DataFrame (one row per anonymous label)
    df = pd.DataFrame.from_dict(blind_evaluation_results, orient="index")

    # Convert DataFrame to Markdown table format
    md_table = df.to_markdown()

    # Display the markdown table
    display(Markdown(md_table))
else:
    print(blind_evaluation_results)

|         |   Relevance |   Depth of Analysis |   Clarity & Coherence |   Actionability |   Overall Score | Justification                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
|:--------|------------:|--------------------:|----------------------:|----------------:|----------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Model A |           9 |                   8 |                     9 |               8 |             8.6 | Model A provides a comprehensive breakdown of anomalies in EEG, HRV, and Pose data, addressing key aspects of their correlations. It identifies specific EEG channels and wave patterns, discusses HRV fluctuations, and explains gait irregularities in Pose data. The response is well-structured and easy to understand. While it offers possible causes and correlations, the depth could be slightly improved by integrating more specific data points or examples. Overall, it provides actionable insights into stress and fatigue indicators. |
| Model B |           0 |                   0 |                     0 |               0 |             0   | Model B did not provide a response due to an API error, resulting in no analysis or insights on the data. Consequently, it scores 0 across all criteria.                                                                                                                                                                                                                                                                                                                                                                                              |
| Model C |           4 |                   3 |                     2 |               2 |             2.8 | Model C's response primarily consists of ontology information and a fragmented data summary, lacking a coherent analysis of EEG, HRV, and Pose data correlations. The organization is poor, making it difficult to extract meaningful insights or actionable steps. It fails to clearly address the task of identifying significant anomalies, resulting in low scores across all criteria.                                                                                                                                                           |
| Model D |           9 |                   9 |                     9 |               9 |             9   | Model D provides a detailed and structured analysis of EEG, HRV, and Pose data, effectively identifying significant anomalies and their correlations. It offers a nuanced understanding of stress and cognitive load impacts on physiology and motor functions, with clear explanations and potential causes. The response is coherent and actionable, suggesting a holistic approach to address underlying stressors. This comprehensive approach earns high scores in all categories.                                                               |

In [24]:
# Scores exactly as the judge reported them, indexed by anonymized label
blind_df = pd.DataFrame.from_dict(blind_evaluation_results, orient="index")

# Re-key the same score records by real model name: each entry of
# anonymized_mapping supplies the new key and the anonymized label to look up.
revealed_results = dict(
    (real_name, blind_evaluation_results[alias])
    for real_name, alias in anonymized_mapping.items()
)
revealed_df = pd.DataFrame.from_dict(revealed_results, orient="index")

# Show the anonymized table first, then the de-anonymized one
print("\nBlind Evaluation Scores (Anonymized):")
display(blind_df)

print("\nRevealed Evaluation Scores (With Model Names):")
display(revealed_df)



Blind Evaluation Scores (Anonymized):


Unnamed: 0,Relevance,Depth of Analysis,Clarity & Coherence,Actionability,Overall Score,Justification
Model A,9,8,9,8,8.6,Model A provides a comprehensive breakdown of ...
Model B,0,0,0,0,0.0,Model B did not provide a response due to an A...
Model C,4,3,2,2,2.8,Model C's response primarily consists of ontol...
Model D,9,9,9,9,9.0,Model D provides a detailed and structured ana...



Revealed Evaluation Scores (With Model Names):


Unnamed: 0,Relevance,Depth of Analysis,Clarity & Coherence,Actionability,Overall Score,Justification
deepseek,9,8,9,8,8.6,Model A provides a comprehensive breakdown of ...
mistral,0,0,0,0,0.0,Model B did not provide a response due to an A...
falcon,4,3,2,2,2.8,Model C's response primarily consists of ontol...
gpt-4o,9,9,9,9,9.0,Model D provides a detailed and structured ana...


## Blind DeepSeek as a Judge

In [None]:
import random
import json
import openai  # Ensure you have OpenAI installed or replace with DeepSeek's API call

# Build a fresh anonymization (new shuffle) for the DeepSeek judge
def anonymize_and_shuffle_responses(responses, seed=None):
    """
    Shuffle model responses and hide their names behind "Model A", "Model B", ...

    Args:
        responses: Mapping of real model name -> response text.
        seed: Optional seed for the shuffle. When given, a private
            ``random.Random(seed)`` is used so the label assignment is
            reproducible and the global ``random`` state is left untouched.
            When ``None`` (default) the module-level ``random.shuffle`` is
            used, exactly as before.

    Returns:
        tuple: ``(model_mapping, original_mapping)`` where ``model_mapping``
        maps each anonymized label to the response text and
        ``original_mapping`` maps the same label back to the real model name.
        Both dicts share the same key order.
    """
    shuffled_items = list(responses.items())
    if seed is None:
        random.shuffle(shuffled_items)  # Shuffle model order (non-reproducible)
    else:
        random.Random(seed).shuffle(shuffled_items)  # Reproducible order

    model_mapping = {}
    original_mapping = {}
    # Labels are consecutive letters starting at "A" (chr(65) == "A")
    for i, (name, resp) in enumerate(shuffled_items):
        label = f"Model {chr(65 + i)}"
        model_mapping[label] = resp
        original_mapping[label] = name
    return model_mapping, original_mapping

# Run anonymization.
# blind_responses: "Model A"/"Model B"/... -> response text (what the judge sees).
# original_names:  same labels -> real model name (kept aside for the reveal step).
blind_responses, original_names = anonymize_and_shuffle_responses(llm_comparisons)

# Create the evaluation prompt for DeepSeek.
# This is an f-string: the only interpolated value is the JSON dump of
# blind_responses; every doubled brace ({{ / }}) is emitted as a literal brace.
# NOTE(review): the output template lists exactly Model A-D, so it presumably
# assumes llm_comparisons holds four models - confirm if that changes.
deepseek_evaluation_prompt = f"""
Your task is to **score each model's response** based on the following criteria (out of 10):

1. **Relevance** - How well does it address EEG, HRV, and Pose correlations?
2. **Depth of Analysis** - Does it provide detailed, well-explained insights?
3. **Clarity & Coherence** - Is the response structured and easy to understand?
4. **Actionability** - Does it offer useful interpretations or next steps?
5. **Overall Score** - A weighted combination of the above.

Provide scores **in JSON format**, including a short justification for each score.

### Anonymized Model Responses:
{json.dumps(blind_responses, indent=2)}

### Expected JSON Output Format:
{{
    "Model A": {{
        "Relevance": X,
        "Depth of Analysis": X,
        "Clarity & Coherence": X,
        "Actionability": X,
        "Overall Score": X,
        "Justification": "..."
    }},
    "Model B": {{ ... }},
    "Model C": {{ ... }},
    "Model D": {{ ... }}
}}
"""

# Call DeepSeek as the judge
def judge_with_deepseek(query, timeout=120):
    """
    Calls the DeepSeek chat-completions API to evaluate responses.

    Args:
        query: Prompt text sent as the user message.
        timeout: Seconds to wait for the HTTP request before giving up
            (without it, a stalled connection would block the cell forever).

    Returns:
        str: The assistant message content on success. On any failure
        (missing key, API-reported error, network or parsing exception) a
        human-readable error string is returned instead of raising, so
        callers should inspect the text before parsing it as JSON.
    """
    import os  # local import keeps this cell self-contained

    # Prefer the notebook-level DEEPSEEK_API_KEY variable when it exists and
    # fall back to the environment variable the error message refers to.
    # Looking the name up via globals() avoids a NameError when it is undefined.
    API_KEY = globals().get("DEEPSEEK_API_KEY") or os.environ.get("DEEPSEEK_API_KEY")
    if not API_KEY:
        return "API Key not found! Set DEEPSEEK_API_KEY environment variable."

    url = "https://api.deepseek.com/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }
    payload = {
        "model": "deepseek-chat",  # deepseek-chat or deepseek-reasoner
        "messages": [
            {"role": "system", "content": "You analyze EEG, HRV, and Pose data."},
            {"role": "user", "content": query}
        ],
        "max_tokens": 700,
        "temperature": 0.7,
        "stream": False
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response_json = response.json()

        # The API reports failures inside the JSON body under an "error" key
        if "error" in response_json:
            return f"API Error: {response_json['error']}"

        return response_json["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberate best-effort: surface the failure as text so the notebook
        # keeps running; callers see a string starting with "Error calling".
        return f"Error calling DeepSeek: {str(e)}"

In [None]:
# Run DeepSeek judgment on whatever `deepseek_evaluation_prompt` currently holds.
# judge_with_deepseek returns the model's reply on success and an error-message
# string on failure, so the value is raw text that still has to be checked and
# parsed by the following cells.
# NOTE: a later cell reassigns `deepseek_evaluation_prompt` to the non-blind
# prompt, so re-running this cell out of order would judge the wrong prompt.
blind_deepseek_evaluation = judge_with_deepseek(deepseek_evaluation_prompt)

In [18]:
# Unwrap a Markdown code fence (``` or ```json) around the judge's reply.
# str.strip("```json") is avoided on purpose: strip() treats its argument as a
# set of characters to trim from both ends, not as a literal prefix/suffix.
# The isinstance guard keeps the cell re-runnable after the reply has already
# been parsed into a dict by a later cell.
if isinstance(blind_deepseek_evaluation, str):
    _reply = blind_deepseek_evaluation.strip()
    if _reply.startswith("```"):
        _reply = _reply[3:]            # opening fence
        if _reply.startswith("json"):
            _reply = _reply[4:]        # optional language tag
        if _reply.endswith("```"):
            _reply = _reply[:-3]       # closing fence
        blind_deepseek_evaluation = _reply.strip()

print(blind_deepseek_evaluation)


{
    "Model A": {
        "Relevance": 0,
        "Depth of Analysis": 0,
        "Clarity & Coherence": 0,
        "Actionability": 0,
        "Overall Score": 0,
        "Justification": "The response is an error message and does not address the task. It fails to provide any analysis or insights related to EEG, HRV, and Pose data."
    },
    "Model B": {
        "Relevance": 9,
        "Depth of Analysis": 8,
        "Clarity & Coherence": 9,
        "Actionability": 8,
        "Overall Score": 8.5,
        "Justification": "The response is highly relevant, providing a detailed analysis of EEG, HRV, and Pose data correlations. It is well-structured and easy to follow, offering actionable insights into stress, relaxation, and motion anomalies. However, it could delve deeper into specific EEG states and their implications."
    },
    "Model C": {
        "Relevance": 6,
        "Depth of Analysis": 5,
        "Clarity & Coherence": 4,
        "Actionability": 5,
        "Overall Sc

In [20]:
import pandas as pd

# The judge's reply is raw JSON text the first time this cell runs; once it
# has been parsed into a dict, a re-run skips the conversion.
if isinstance(blind_deepseek_evaluation, str):
    blind_deepseek_evaluation = json.loads(blind_deepseek_evaluation)

# Swap each anonymized label for the real model name recorded at shuffle time
revealed_results = dict(
    (original_names[anon_name], scores)
    for anon_name, scores in blind_deepseek_evaluation.items()
)

# Same score records, once keyed by label and once keyed by real name
df_blind_deepseek = pd.DataFrame.from_dict(blind_deepseek_evaluation, orient='index')
df_revealed_deepseek = pd.DataFrame.from_dict(revealed_results, orient='index')

# Show both tables one after the other for comparison
print("\n🔹 **Blind Evaluation Scores (DeepSeek Judge, Anonymized):**")
display(df_blind_deepseek)

print("\n🔹 **Revealed Evaluation Scores (DeepSeek Judge, With Model Names):**")
display(df_revealed_deepseek)


🔹 **Blind Evaluation Scores (DeepSeek Judge, Anonymized):**


Unnamed: 0,Relevance,Depth of Analysis,Clarity & Coherence,Actionability,Overall Score,Justification
Model A,0,0,0,0,0.0,The response is an error message and does not ...
Model B,9,8,9,8,8.5,"The response is highly relevant, providing a d..."
Model C,6,5,4,5,5.0,The response includes ontology information and...
Model D,9,9,9,9,9.0,The response is highly relevant and provides a...



🔹 **Revealed Evaluation Scores (DeepSeek Judge, With Model Names):**


Unnamed: 0,Relevance,Depth of Analysis,Clarity & Coherence,Actionability,Overall Score,Justification
mistral,0,0,0,0,0.0,The response is an error message and does not ...
gpt-4o,9,8,9,8,8.5,"The response is highly relevant, providing a d..."
falcon,6,5,4,5,5.0,The response includes ontology information and...
deepseek,9,9,9,9,9.0,The response is highly relevant and provides a...


## DeepSeek as a Judge (Non-Blind)

In [25]:
# Create the evaluation prompt for DeepSeek without anonymization.
# This reassigns `deepseek_evaluation_prompt`, replacing the blind prompt built
# in the earlier section; from here on the name refers to the non-blind prompt.
# It is an f-string: the only interpolated value is the JSON dump of
# llm_comparisons (real model names included); doubled braces are literal braces.
# NOTE(review): the template below spells the keys "GPT-4o"/"DeepSeek"/"Falcon"/
# "Mistral", while the revealed tables earlier show lowercase names
# (deepseek, mistral, falcon, gpt-4o) - confirm which casing the judge should echo.
deepseek_evaluation_prompt = f"""
Your task is to **score each model's response** based on the following criteria (out of 10):

1. **Relevance** - How well does it address EEG, HRV, and Pose correlations?
2. **Depth of Analysis** - Does it provide detailed, well-explained insights?
3. **Clarity & Coherence** - Is the response structured and easy to understand?
4. **Actionability** - Does it offer useful interpretations or next steps?
5. **Overall Score** - A weighted combination of the above.

Provide scores **in JSON format**, including a short justification for each score.

### Model Responses:
{json.dumps(llm_comparisons, indent=2)}

### Expected JSON Output Format:
{{
    "GPT-4o": {{
        "Relevance": X,
        "Depth of Analysis": X,
        "Clarity & Coherence": X,
        "Actionability": X,
        "Overall Score": X,
        "Justification": "..."
    }},
    "DeepSeek": {{ ... }},
    "Falcon": {{ ... }},
    "Mistral": {{ ... }}
}}
"""

# Get evaluation results (raw text: the model's reply, or an error message)
non_blind_deepseek_evaluation = judge_with_deepseek(deepseek_evaluation_prompt)

# Unwrap a Markdown code fence (``` or ```json) if the reply is wrapped in one,
# then parse the JSON. str.strip("```json") is avoided on purpose: strip()
# treats its argument as a set of characters to trim, not as a literal prefix.
if isinstance(non_blind_deepseek_evaluation, str):
    _reply = non_blind_deepseek_evaluation.strip()
    if _reply.startswith("```"):
        _reply = _reply[3:]            # opening fence
        if _reply.startswith("json"):
            _reply = _reply[4:]        # optional language tag
        if _reply.endswith("```"):
            _reply = _reply[:-3]       # closing fence
        _reply = _reply.strip()
    # Convert from string to dictionary; raises json.JSONDecodeError when the
    # judge returned an error message instead of JSON.
    non_blind_deepseek_evaluation = json.loads(_reply)

# Convert results to DataFrame for better readability (one row per model)
df_non_blind = pd.DataFrame.from_dict(non_blind_deepseek_evaluation, orient='index')

# Display results
print("\n🔹 **Non-Blinded Evaluation Scores (DeepSeek Judge, With Model Names):**")
display(df_non_blind)



🔹 **Non-Blinded Evaluation Scores (DeepSeek Judge, With Model Names):**


Unnamed: 0,Relevance,Depth of Analysis,Clarity & Coherence,Actionability,Overall Score,Justification
GPT-4o,9,9,9,8,8.8,"The response is highly relevant, addressing EE..."
DeepSeek,8,8,8,7,7.8,The response is relevant and provides a good b...
Falcon,5,4,5,3,4.3,The response is less relevant as it primarily ...
Mistral,1,1,1,1,1.0,The response is an error message and does not ...
