In [3]:

import json
import pandas as pd
from llama_cpp import Llama
from IPython.display import display, Markdown

# Metric name -> CSV file holding that metric (adjust the paths for your setup).
# Keys are spelled the way str.capitalize() renders them, because lookups
# normalise the requested metric name with capitalize() first.
METRIC_FILES = dict(
    Rmsd="RMSD_data.csv",
    Skil="Skil_data.csv",
    Bias="bias_data.csv",
)

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

In [None]:
# Data and processing functions

def get_metric_interval(start_date: str,
                        end_date: str,
                        min_lead: int,
                        max_lead: int,
                        metric: str) -> pd.DataFrame:
    """
    Returns a DataFrame containing `metric` between `start_date` and `end_date`
    (inclusive) for all lead times between `min_lead` and `max_lead` (inclusive).

    Parameters
    ----------
    start_date, end_date : anything `pd.to_datetime` accepts; both bounds are
        inclusive.
    min_lead, max_lead : inclusive bounds on the lead time, compared against
        the integer value of each non-'Date' column name.
    metric : metric name; matched against the keys of METRIC_FILES after
        stripping whitespace and applying `str.capitalize()`.

    Returns
    -------
    DataFrame with the 'Date' column followed by the selected lead-time
    columns in ascending lead order.

    Raises
    ------
    ValueError : `metric` is not a key of METRIC_FILES.
    KeyError : no rows fall in the date window, or no lead-time column falls
        in the lead window.
    """
    metric = metric.strip().capitalize()
    if metric not in METRIC_FILES:
        raise ValueError(f"Unknown metric '{metric}'. Choose from {list(METRIC_FILES)}.")

    df = pd.read_csv(METRIC_FILES[metric])
    df['Date'] = pd.to_datetime(df['Date'])
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    df = df[(df['Date'] >= start) & (df['Date'] <= end)]
    if df.empty:
        raise KeyError(f"No rows found between {start_date} and {end_date}.")

    # Map each usable column to its integer lead time. Columns whose name is
    # not an integer (e.g. an "Unnamed: 0" index column written by to_csv)
    # are skipped instead of aborting the whole query with a ValueError.
    lead_by_col = {}
    for col in df.columns:
        if col == 'Date':
            continue
        try:
            lead = int(col)
        except (TypeError, ValueError):
            continue
        if min_lead <= lead <= max_lead:
            lead_by_col[col] = lead
    if not lead_by_col:
        raise KeyError(f"No lead‐time columns between {min_lead} and {max_lead}.")

    return df[['Date'] + sorted(lead_by_col, key=lead_by_col.get)]

def get_metric_stats(start_date: str,
                     end_date: str,
                     min_lead: int,
                     max_lead: int,
                     metric: str) -> dict:
    """
    Returns a dict with the overall min and max of `metric` in the specified
    date and lead-time window, plus the Date/LeadTime where each occurs.

    The window is selected by `get_metric_interval`, so its ValueError /
    KeyError conditions propagate unchanged.

    Returns
    -------
    dict with keys 'min_value', 'min_date', 'min_lead', 'max_value',
    'max_date', 'max_lead'. Dates are formatted as 'YYYY-MM-DD'; values and
    leads are plain Python numbers so the dict can be passed to `json.dumps`.

    Raises
    ------
    KeyError : every value in the selected window is missing (NaN).
    """
    def _as_builtin(value):
        # numpy scalars expose .item(); plain Python numbers pass through.
        return value.item() if hasattr(value, 'item') else value

    df_int = get_metric_interval(start_date, end_date, min_lead, max_lead, metric)
    df_long = df_int.melt(id_vars='Date', var_name='LeadTime', value_name='Value')
    df_long['LeadTime'] = df_long['LeadTime'].astype(int)

    # Missing values cannot be a min or max; drop them up front so an all-NaN
    # window produces a clear error instead of an opaque pandas failure.
    df_long = df_long.dropna(subset=['Value'])
    if df_long.empty:
        raise KeyError(
            f"No non-missing values found between {start_date} and {end_date} "
            f"for lead times {min_lead} to {max_lead}."
        )

    min_row = df_long.loc[df_long['Value'].idxmin()]
    max_row = df_long.loc[df_long['Value'].idxmax()]

    return {
        'min_value': _as_builtin(min_row['Value']),
        'min_date': min_row['Date'].strftime('%Y-%m-%d'),
        'min_lead': int(min_row['LeadTime']),
        'max_value': _as_builtin(max_row['Value']),
        'max_date': max_row['Date'].strftime('%Y-%m-%d'),
        'max_lead': int(max_row['LeadTime'])
    }

def parse_args_with_defaults(args):
    """
    Fill in and normalise the lead-time bounds of an LLM-produced args dict.

    `min_lead` defaults to 0 and `max_lead` to 150 when the key is absent or
    explicitly None (a JSON `null`). Any other value is coerced with `int()`
    so that numeric strings such as "30" compare correctly against lead times.

    The dict is updated in place and the same object is returned. A value
    that `int()` cannot convert raises ValueError/TypeError.
    """
    for key, default in (('min_lead', 0), ('max_lead', 150)):
        value = args.get(key)
        args[key] = default if value is None else int(value)
    return args


To clarify the request: you are asking about potential errors in the provided Jupyter notebook code, but no explicit error is shown. The cells below identify potential issues and add a check for them.



The notebook has a few potential issues that should be checked:

1. Model path verification is needed - the hardcoded path might not exist
2. CSV files existence check is missing
3. No error handling for file loading



Let's add a diagnostic cell to check these potential issues:



In [None]:
import os

# Expected location of the GGUF model on disk
model_path = "/Users/aryanharooni/models/Meta-Llama-3-8B-Instruct-v2.Q4_K_M.gguf"
model_exists = os.path.exists(model_path)

# For every configured metric CSV, record whether the path is present
files_status = dict(
    (csv_path, os.path.exists(csv_path)) for csv_path in METRIC_FILES.values()
)

# Report both checks
print("Files status:", files_status)
print("Model exists:", model_exists)



Let's also add a safer version of file loading with error handling:



In [None]:
def safe_load_metric_file(file_path: str) -> pd.DataFrame:
    """
    Read a metric CSV into a DataFrame, reporting failures with the file path.

    Raises
    ------
    FileNotFoundError : `file_path` does not exist.
    RuntimeError : `pd.read_csv` failed for any other reason; the original
        exception is attached as `__cause__`.

    Both are `Exception` subclasses, and both messages start with
    "Error loading <file_path>: ".
    """
    # Check existence outside the try block so the specific exception type
    # reaches the caller instead of being re-wrapped as a generic error.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Error loading {file_path}: File not found: {file_path}")
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Chain explicitly so the underlying parser error stays in the traceback.
        raise RuntimeError(f"Error loading {file_path}: {str(e)}") from e

In [None]:
# Test one of the files: load the RMSD CSV through safe_load_metric_file and
# list its columns. Any failure is printed rather than raised so the cell
# always completes.
# NOTE: this rebinds the notebook-level name `df`.
try:
    df = safe_load_metric_file(METRIC_FILES["Rmsd"])
    print("Successfully loaded RMSD data")
    print("Columns:", df.columns.tolist())
except Exception as e:
    # Covers both loader errors and anything raised by the prints above.
    print("Error:", str(e))


In [None]:
# Llama model and prompts

# Instantiate llama.cpp (edit path to your own model file if needed).
# NOTE(review): this absolute, user-specific path is also hardcoded in the
# diagnostic cell's `model_path`; keep the two in sync or share one constant.
llm = Llama(model_path="/Users/aryanharooni/models/Meta-Llama-3-8B-Instruct-v2.Q4_K_M.gguf")

# System prompt for the function-calling step: it instructs the model to
# answer with a single JSON object naming `get_metric_stats` and its
# arguments. The lead-time defaults stated here (0 and 150) match the ones
# applied by parse_args_with_defaults.
system_prompt = """
You are a function-calling assistant. When the user asks a question about RMSD, Skil, or Bias metrics, reply ONLY with JSON in this exact format:
{"function": "get_metric_stats", "args": {"start_date": "<YYYY-MM-DD>", "end_date": "<YYYY-MM-DD>", "min_lead": <int>, "max_lead": <int>, "metric": "<metric>"}}

- Only use the function "get_metric_stats" to answer all queries.
- If min_lead or max_lead is not specified, use min_lead=0 and max_lead=150.
- Do NOT include explanations or any extra text, just the JSON.
"""

def call_llm(user_prompt):
    """
    Ask the model to translate `user_prompt` into a function-call request.

    Builds a prompt from `system_prompt` and `user_prompt`, runs `llm` on it,
    and returns the first JSON object found in the completion as a dict.
    Text before or after the object (e.g. "Sure! {...} Hope that helps.") is
    ignored.

    Raises
    ------
    json.JSONDecodeError : the completion contains no parseable JSON object.
        The raw completion is printed first to aid debugging.
    """
    # NOTE(review): these role markers are written by hand; confirm they
    # match the chat template the loaded model expects.
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_prompt}\n<|assistant|>\n"
    output = llm(
        prompt=prompt,
        stop=["<|user|>", "<|system|>"],
        max_tokens=256,
        temperature=0.2,
    )["choices"][0]["text"].strip()

    # Try every '{' as a candidate start. raw_decode stops at the end of the
    # first complete value, so trailing chatter does not break parsing.
    decoder = json.JSONDecoder()
    last_error = None
    start = output.find('{')
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(output, start)
            return parsed
        except json.JSONDecodeError as err:
            last_error = err
            start = output.find('{', start + 1)

    print(f"Could not parse LLM output: {output}")
    if last_error is not None:
        raise last_error
    raise json.JSONDecodeError("No JSON object found in LLM output", output, 0)

def generate_human_answer(user_prompt, stats, args):
    """
    Have the model phrase `stats` as a short answer to `user_prompt`.

    `stats` is embedded in the prompt as JSON; the date range, lead-time range
    and metric are read from `args` with `.get`, so missing keys render as
    `None`. Returns the completion text with surrounding whitespace removed.
    """
    stats_json = json.dumps(stats)
    summary_instructions = f"""
You are an assistant that summarizes results for metrics queries.
- Given a user question and a dictionary of statistics, answer concisely and clearly.
- Only mention information relevant to the user's question.
- If the user only asks for the max, do not mention the min, and vice versa.

User question:
{user_prompt}

Statistics dictionary:
{stats_json}

Date range: {args.get('start_date')} to {args.get('end_date')}
Lead times: {args.get('min_lead')} to {args.get('max_lead')}
Metric: {args.get('metric')}
"""
    completion = llm(
        prompt="<|system|>\n" + summary_instructions + "\n<|assistant|>\n",
        stop=["<|user|>", "<|system|>"],
        max_tokens=192,
        temperature=0.2,
    )
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()


In [None]:
# Run a query

def ask_llm_metric_question(user_question):
    """
    Answer a natural-language metrics question end to end.

    Steps: ask the model for a function-call request, fill in default lead
    times, run `get_metric_stats`, show the raw stats, then show the model's
    plain-language summary. Any exception along the way is rendered as a
    Markdown error message instead of propagating. Returns None.
    """
    def show(markdown_text):
        display(Markdown(markdown_text))

    try:
        request = call_llm(user_question)
        requested_fn = request["function"]
        call_args = parse_args_with_defaults(request["args"])

        # Only one function is supported; bail out early on anything else.
        if requested_fn != "get_metric_stats":
            show("Unknown function.")
            return

        stats = get_metric_stats(**call_args)
        show(f"**Raw stats:** `{stats}`")
        answer = generate_human_answer(user_question, stats, call_args)
        show(f"**LLM Answer:** {answer}")
    except Exception as ex:
        show(f"**Error:** {ex}")

# Example interactive call (uncomment to use)
# ask_llm_metric_question("What is the maximum RMSD between June 2024 and July 2024?")


In [None]:
# Interactive input

# To use interactively in the notebook: prompt for a question on stdin and
# run it through the full pipeline.
# NOTE: input() blocks until the user answers, so this cell pauses a
# "Run All" and cannot run in a non-interactive execution.
user_question = input("Ask your metrics question: ")
ask_llm_metric_question(user_question)
