# Imports

In [72]:
# standard library imports
import os
import random
from typing import Callable

# related third party imports
import dotenv
import pandas as pd
import numpy as np
import structlog
from langchain_chroma import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.output_parsers import PydanticOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field, ValidationError
from yacs.config import CfgNode
from sklearn.metrics import accuracy_score

# local application/library specific imports
from example_selector.example_selector import (
    RandomExampleSelector,
    StudentIDRandomExampleSelector,
)
from data_loader.data_loader import DataLoader
from tools.constants import SILVER_DIR, TRAIN, VALIDATION, TEST, MODEL_STRUCTURED_OUTPUT
from prompt.few_shot_prompt import (
    df_to_listdict,
)
from model.build import build_model
from example_formatter.build import build_example_formatter

logger = structlog.get_logger()

# Reload the variables in your '.env' file (override the existing variables)
dotenv.load_dotenv("../.env", override=True)

True

In [None]:
### INPUTS ###
MODEL_NAME = "llama3"  # "olmo2:7b"  # "gpt-4o-mini"  # "llama3.2"
MODEL_PROVIDER = "ollama"  # "openai"  # 
SUPPORTS_STRUCTURED_OUTPUT = MODEL_STRUCTURED_OUTPUT[MODEL_NAME]

In [74]:
model_cfg = CfgNode(
    {
        "NAME": MODEL_NAME,
        "PROVIDER": MODEL_PROVIDER,
        "TEMPERATURE": 0.5,
        "MAX_TOKENS": None,
        "TIMEOUT": None,
        "MAX_RETRIES": None,
    }
)
example_formatter_cfg = CfgNode(
    {
        "NAME": "A"
    }
)

# Data

In [75]:
# load data
data_loader = DataLoader(read_dir=SILVER_DIR, dataset_name="dbe_kt22", join_key="question_id")
datasets = data_loader.split_data(train_size=0.6, test_size=0.25, seed=42)


[2m2025-03-26 16:59:18[0m [[32m[1minfo     [0m] [1mSet seed (42)                 [0m
[2m2025-03-26 16:59:18[0m [[32m[1minfo     [0m] [1mCreating train split          [0m [36mnum_interactions[0m=[35m1967[0m
[2m2025-03-26 16:59:18[0m [[32m[1minfo     [0m] [1mCreating validation split     [0m [36mnum_interactions[0m=[35m492[0m
[2m2025-03-26 16:59:18[0m [[32m[1minfo     [0m] [1mCreating test split           [0m [36mnum_interactions[0m=[35m820[0m


In [76]:

# # dataframes
# df_train = apply_prompt_fmt(
#     df=dataset[TRAIN], input_fmt=human_format_input, output_fmt=human_format_output
# )
# df_val = apply_prompt_fmt(
#     df=dataset[VALIDATION], input_fmt=human_format_input, output_fmt=human_format_output
# )
# df_test = apply_prompt_fmt(
#     df=dataset[TEST], input_fmt=human_format_input, output_fmt=human_format_output
# )

# # list of dicts
# list_train = df_to_listdict(df_train)
# list_val = df_to_listdict(df_val)
# list_test = df_to_listdict(df_test)

In [77]:
# dataframes
datasets_fmt = build_example_formatter(
    example_formatter_cfg=example_formatter_cfg,
    datasets=datasets,
)

# list of dicts
list_train = df_to_listdict(datasets_fmt[TRAIN])
list_val = df_to_listdict(datasets_fmt[VALIDATION])
list_test = df_to_listdict(datasets_fmt[TEST])  # noqa

[2m2025-03-26 16:59:18[0m [[32m[1minfo     [0m] [1mBuilding example formatter    [0m [36mname[0m=[35mA[0m [36msplits[0m=[35m['train', 'validation', 'test'][0m


In [78]:
datasets_fmt[VALIDATION].head()

Unnamed: 0,input,output,student_id,question_id,interact_id
6,Question:\nWhat is the Cartesian product of A ...,Student answer: 0,86,4,787
10,"Question:\nIf A × B = {(p, x), (p, y), (q, x),...",Student answer: 1,86,8,791
11,"Question:\nIf A = {2, 3, 4, 5}, B = {4, 5, 6, ...",Student answer: 1,86,9,792
15,"Question:\nIf A = {2, 3, 4, 5}, B = {4, 5, 6, ...",Student answer: 3,86,13,796
21,Question:\nConsider a database that stores nam...,Student answer: 1,31,95,7873


# Dynamic few-shot prompting

## Create example selector

NOTE: I need OpenAI credits to use the OpenAI embeddings.

In [None]:
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_ollama import OllamaEmbeddings
from langchain_pinecone import PineconeVectorStore


index_name = "llama3"  # change if desired

# Embedding model that produces the vectors stored in (and queried from) the
# index. The index dimension is looked up from THIS name rather than from the
# chat model in `model_cfg`, so the index always matches the vectors written
# to it even when a different chat model is selected.
EMBEDDING_MODEL = "llama3"  # TODO: make dynamic
EMBEDDINGS_DIM = {"llama3": 4096}

# Upper bound (in seconds) on waiting for a freshly created index to be ready.
INDEX_READY_TIMEOUT_S = 300

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Create the index only when it is missing, so re-running the cell is safe.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=EMBEDDINGS_DIM[EMBEDDING_MODEL],
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index reports ready, but give up after the timeout
    # instead of blocking the kernel forever.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index {index_name} was not ready after {INDEX_READY_TIMEOUT_S} seconds."
            )
        time.sleep(1)

index = pc.Index(index_name)

embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

vector_store = PineconeVectorStore(index=index, embedding=embeddings, namespace="dbe_kt22")

In [80]:
vector_input_df = datasets_fmt[TRAIN].head(100)

In [81]:
vector_input_df

Unnamed: 0,input,output,student_id,question_id,interact_id
1,Question:\nThe set that consists of all odd po...,Student answer: 0,76,3,751
2,Question:\nWhat is the Cartesian product of A ...,Student answer: 0,76,4,752
3,Question:\nThe Cartesian product B x A is alwa...,Student answer: 1,76,5,753
4,Question:\nA __________ is a collection of dis...,Student answer: 3,86,2,785
5,Question:\nThe set that consists of all odd po...,Student answer: 0,86,3,786
...,...,...,...,...,...
163,Question:\nWhat operation(s) can a transaction...,Student answer: 1,411,203,30621
165,Question:\nWhich of the following problems occ...,Student answer: 2,411,205,30638
167,Question:\nWhich properties are ensured by the...,Student answer: 2,411,207,30663
169,Question:\nConsider two transactions <img src=...,Student answer: 2,411,209,30689


In [82]:
from langchain_core.documents import Document

# Turn every selected training interaction into a Document: the formatted
# prompt text is the content that gets embedded, while the identifiers and the
# formatted student answer are carried along as metadata.
vector_input_doc = []
for _, interaction in vector_input_df.iterrows():
    doc_metadata = {
        "student_id": interaction["student_id"],
        "question_id": interaction["question_id"],
        "output": interaction["output"],
    }
    vector_input_doc.append(
        Document(page_content=interaction["input"], metadata=doc_metadata)
    )

# One string id per document, taken from the interaction id column.
vector_input_id = vector_input_df["interact_id"].astype(str).tolist()

In [83]:
len(vector_input_doc)

100

In [87]:
# _ = vector_store.add_documents(documents=vector_input_doc, ids=vector_input_id)

In [88]:
# "llama-text-embed-v2"

In [89]:
# vector_store.delete(delete_all=True)

In [119]:
from langchain_core.example_selectors.base import BaseExampleSelector
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone


def get_vector_store(
    index_name: str, embedding_name: str, namespace: str
) -> PineconeVectorStore:
    """Get the Pinecone vector store for an existing index.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable. The index is never created here; it must already exist.

    Parameters
    ----------
    index_name : str
        Index name
    embedding_name : str
        Name of the Ollama embedding model used to embed queries.
    namespace : str
        Index namespace

    Returns
    -------
    PineconeVectorStore
        The Pinecone vector store.

    Raises
    ------
    ValueError
        If the index does not exist.
    """
    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
    # Same existence check as the index-creation cell of this notebook.
    if not pc.has_index(index_name):
        raise ValueError(f"Index {index_name} does not exist.")

    # TODO: how to handle different embedding providers?
    embeddings = OllamaEmbeddings(model=embedding_name)
    vector_store = PineconeVectorStore(
        index=pc.Index(index_name), embedding=embeddings, namespace=namespace
    )
    logger.info(
        "Loaded Pinecone vector store", index_name=index_name, namespace=namespace
    )
    return vector_store


EMBEDDING_NAMES = {"llama3": "llama3"}


class StudentIDSemanticExampleSelector(BaseExampleSelector):
    """Filter examples of the same student_id and select based on semantic similarity."""

    def __init__(
        self, k: int, index_name: str, model_name: str, namespace: str
    ) -> None:
        """Initialize the example selector.

        Parameters
        ----------
        k : int
            k-shot prompting
        index_name : str
            The name of the Pinecone index.
        model_name : str
            The name of the LLM; used to look up the embedding model in
            ``EMBEDDING_NAMES``.
        namespace : str
            The namespace of the Pinecone index.

        Raises
        ------
        KeyError
            If ``model_name`` has no entry in ``EMBEDDING_NAMES``.
        ValueError
            If the Pinecone index does not exist.
        """
        self.k = k

        embedding_name = EMBEDDING_NAMES[model_name]
        self.vectorstore = get_vector_store(
            index_name=index_name, embedding_name=embedding_name, namespace=namespace
        )

    def add_example(self, example: dict[str, str]) -> None:
        """Add an example to the selector (not implemented).

        Parameters
        ----------
        example : dict[str, str]
            The example that would be added.

        Raises
        ------
        NotImplementedError
            Always; this selector only reads from the vector store.
        """
        raise NotImplementedError(
            "Adding examples through the selector is not implemented."
        )

    def select_examples(self, input_variables: dict) -> list[dict[str, str]]:
        """Select examples based on semantic similarity.

        Parameters
        ----------
        input_variables : dict
            A dict containing info about a single observation. Must provide
            the keys ``"student_id"`` and ``"input"``.

        Returns
        -------
        list[dict[str, str]]
            The selected examples, each with an ``"input"`` and ``"output"`` key.
        """
        # student_id of target student: restricts the search to that student
        student_id = input_variables["student_id"]
        # text of the new question, used as the similarity query
        query_text = input_variables["input"]

        matches = self.vectorstore.similarity_search(
            query=query_text,
            k=self.k,
            filter={"student_id": student_id},
        )
        return [
            {"input": match.page_content, "output": match.metadata["output"]}
            for match in matches
        ]

In [120]:
# example_selector = SemanticSimilarityExampleSelector(
#     examples=list_train, k=1
# )
# example_selector.select_examples({"input": list_val[0]["input"]})

In [95]:
results = vector_store.similarity_search(
    list_val[0]["input"],
    k=2,
    filter={"student_id": list_val[0]["student_id"]},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Question:
If {a,b} is a superkey in R(a,b,c), then________________.

Options:
1. {a,b,c} must not be a candidate key
2. {a,b,c} must be a primary key
3. {a,b,c} must not be a superkey

Correct answer: 0 [{'output': 'Student answer: 2', 'question_id': 46.0, 'student_id': 86.0}]
* Question:
The set that consists of all odd positive integers less than 10 is represented by _____________.

Options:
1. {1, 3, 5, 7, 9}
2. {1, 5, 7, 9, 11}
3. {1, 2, 5, 9}
4. {1, 2, 3}

Correct answer: 0 [{'output': 'Student answer: 0', 'question_id': 3.0, 'student_id': 86.0}]


In [121]:
example_selector = StudentIDSemanticExampleSelector(
    k=2,
    index_name="llama3",
    model_name=model_cfg.NAME,
    namespace="dbe_kt22",
)
example_selector.select_examples(list_val[0])

[2m2025-03-26 17:05:42[0m [[32m[1minfo     [0m] [1mLoaded Pinecone vector store  [0m [36mindex_name[0m=[35mllama3[0m [36mnamespace[0m=[35mdbe_kt22[0m


[{'input': 'Question:\nIf {a,b} is a superkey in R(a,b,c), then________________.\n\nOptions:\n1. {a,b,c} must not be a candidate key\n2. {a,b,c} must be a primary key\n3. {a,b,c} must not be a superkey\n\nCorrect answer: 0',
  'output': 'Student answer: 2'},
 {'input': 'Question:\nThe set that consists of all odd positive integers less than 10 is represented by _____________.\n\nOptions:\n1. {1, 3, 5, 7, 9}\n2. {1, 5, 7, 9, 11}\n3. {1, 2, 5, 9}\n4. {1, 2, 3}\n\nCorrect answer: 0',
  'output': 'Student answer: 0'}]

___

In [106]:
# examples = list_train[:10]
# to_vectorize = [example["input"] for example in examples]
# # embeddings = OpenAIEmbeddings()
# embeddings = OllamaEmbeddings(model="llama3")
# vectorstore = Chroma.from_texts(
#     texts=to_vectorize,
#     embedding=embeddings,
#     metadatas=examples,
#     persist_directory=os.path.join("output", "vectorstore", "chroma_langchain_db"),
# )

In [107]:
# NOTE: texts depend on the example formatter used

In [108]:
# vectorstore

In [109]:
# example_selector = SemanticSimilarityExampleSelector(
#     vectorstore=vectorstore,
#     k=1
# )
# example_selector.select_examples({"input": list_val[0]["input"]})

In [110]:
# example_selector = SemanticSimilarityExampleSelector(
#     vectorstore=vectorstore,
#     k=2,
# )

# # The prompt template will load examples by passing the input do the `select_examples` method
# example_selector.select_examples({"input": "horse"})

In [111]:
# # Create the selector with k=3 for 3-shot prompting
# example_selector = RandomExampleSelector(examples=list_train, k=3)
# example_selector.select_examples({})

In [112]:
# # Select examples of a specific student
# example_selector = StudentIDExampleSelector(examples=list_train, k=3)
# example_selector.select_examples({"student_id": 395})

## Create prompt template

In [122]:
# Pydantic schema of the structured model output. It is passed to
# `PydanticOutputParser` and to `with_structured_output` further down.
# NOTE(review): the class docstring and the `description` strings are most
# likely emitted into the JSON schema / format instructions sent to the model,
# so editing them changes the prompt — confirm before rewording.
class MCQAnswer(BaseModel):
    """Answer to a multiple-choice question."""

    # Free-text reasoning behind the chosen option.
    explanation: str = Field(
        description="Misconception if incorrectly answered; motivation if correctly answered"
    )
    # Option chosen by the simulated student.
    # NOTE(review): the description says "(1-4)", while the few-shot examples and
    # the predictions recorded in this notebook contain 0 (e.g. "Student answer: 0")
    # — confirm whether answers are meant to be 0-based or 1-based.
    student_answer: int = Field(
        description="The student's answer to the question, as an integer (1-4)"
    )
    # difficulty: str = Field(description="The difficulty level of the question")

In [123]:
# How a single selected example is rendered: one human message holding the
# question and one AI message holding the student answer.
example_prompt = ChatPromptTemplate.from_messages(
    [("human", "{input}"), ("ai", "{output}")]
)

# Few-shot prompt whose examples are chosen at invocation time by the selector.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    # The input variables select the values to pass to the example_selector
    input_variables=["student_id"],  # TODO: do not hardcode
    example_selector=example_selector,
    example_prompt=example_prompt,
)

# Smoke test on the first validation observation.
out = few_shot_prompt.invoke(input=list_val[0]).to_messages()
print(len(out), out, sep="\n")

4
[HumanMessage(content='Question:\nIf {a,b} is a superkey in R(a,b,c), then________________.\n\nOptions:\n1. {a,b,c} must not be a candidate key\n2. {a,b,c} must be a primary key\n3. {a,b,c} must not be a superkey\n\nCorrect answer: 0', additional_kwargs={}, response_metadata={}), AIMessage(content='Student answer: 2', additional_kwargs={}, response_metadata={}), HumanMessage(content='Question:\nThe set that consists of all odd positive integers less than 10 is represented by _____________.\n\nOptions:\n1. {1, 3, 5, 7, 9}\n2. {1, 5, 7, 9, 11}\n3. {1, 2, 5, 9}\n4. {1, 2, 3}\n\nCorrect answer: 0', additional_kwargs={}, response_metadata={}), AIMessage(content='Student answer: 0', additional_kwargs={}, response_metadata={})]


In [124]:
# Sentences of the system prompt. Every fragment ends with a space, so joining
# them without a separator yields the full instruction text.
system_prompt_parts = (
    "You are a student working on {exam_type}, containing multiple choice questions. ",
    "You are shown a set of questions that you answered earlier in the exam, together with the correct answers and your student answers. ",
    "Analyse your responses to the questions and identify the possible misconceptions that led to answering incorrectly. ",
    "Inspect the new question and think how you would answer it as a student. ",
    "If you answer incorrectly, explain which misconception leads to selecting that answer. ",
    "If you answer correctly, explain why you think the answer is correct. ",
    "Provide your answer as an integer in the range 1-4. ",
)
system_prompt_raw = "".join(system_prompt_parts)

# Set up a parser (not used if model supports structured output)
parser = PydanticOutputParser(pydantic_object=MCQAnswer)
if not SUPPORTS_STRUCTURED_OUTPUT:
    # Without native structured output the JSON format is requested in the prompt.
    system_prompt_raw = (
        system_prompt_raw + "Wrap the output in `json` tags\n{format_instructions}"
    )

# Message layout: system instructions, selected few-shot examples, new question.
prompt_messages = [
    ("system", system_prompt_raw),
    few_shot_prompt,
    ("human", "{input}"),
]
final_prompt = ChatPromptTemplate.from_messages(prompt_messages).partial(
    format_instructions=parser.get_format_instructions(),
    exam_type="a database systems exam (Department of Computer Science)",
)

# print(
#     final_prompt.invoke(
#         input=list_val[0],
#     ).to_string()
# )
# Smoke test on the first validation observation.
out = final_prompt.invoke(input=list_val[0]).to_messages()
print(len(out), out, sep="\n")

6
[SystemMessage(content='You are a student working on a database systems exam (Department of Computer Science), containing multiple choice questions. You are shown a set of questions that you answered earlier in the exam, together with the correct answers and your student answers. Analyse your responses to the questions and identify the possible misconceptions that led to answering incorrectly. Inspect the new question and think how you would answer it as a student. If you answer incorrectly, explain which misconception leads to selecting that answer. If you answer correctly, explain why you think the answer is correct. Provide your answer as an integer in the range 1-4. Wrap the output in `json` tags\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo":

# Model

In [125]:
# Build the chat model from the config. When structured output is supported,
# bind the MCQAnswer schema; include_raw=True keeps the raw model message
# available (it is read back under the "raw" key after inference).
base_model = build_model(model_cfg=model_cfg)
model = (
    base_model.with_structured_output(MCQAnswer, include_raw=True)
    if SUPPORTS_STRUCTURED_OUTPUT
    else base_model
)

# Prompt -> model pipeline; output parsing is not part of the chain.
chain = final_prompt | model
# if not SUPPORTS_STRUCTURED_OUTPUT:
#     chain = chain.pipe(parser)

[2m2025-03-26 17:05:53[0m [[32m[1minfo     [0m] [1mBuilding model                [0m [36mname[0m=[35mllama3[0m [36mprovider[0m=[35mollama[0m


In [127]:
from prompt.json_schema import validate_output

# Run the chain on the first 10 validation observations in one batch.
batch_outputs = chain.batch(list_val[:10])

# With structured output each result is indexed by "raw" to recover the raw
# message; otherwise the batch results are used as they are.
preds_raw = (
    [result["raw"] for result in batch_outputs]
    if SUPPORTS_STRUCTURED_OUTPUT
    else batch_outputs
)

# Validate the raw outputs against the MCQAnswer schema.
preds_validated = validate_output(preds_raw, schema=MCQAnswer)

[2m2025-03-26 17:06:05[0m [[32m[1minfo     [0m] [1mValidating outputs            [0m


In [128]:
y_val_pred = np.array([output.student_answer for output in preds_validated])
y_val_pred

array([0, 1, 1, 3, 2, 1, 1, 0, 1, 0])

In [129]:
y_val_student = datasets[VALIDATION]["student_option_id"].to_numpy()[:10]
y_val_student


array([0, 1, 1, 3, 1, 1, 1, 0, 1, 0])

In [130]:
y_val_true = datasets[VALIDATION]["correct_option_id"].to_numpy()[:10]
y_val_true

array([0, 1, 1, 3, 1, 1, 1, 0, 1, 0])

In [131]:
# Pairwise agreement between the three answer vectors:
#   student vs. prediction, correct vs. student, correct vs. prediction.
acc_student_pred = accuracy_score(y_val_student, y_val_pred)
acc_true_student = accuracy_score(y_val_true, y_val_student)
acc_true_pred = accuracy_score(y_val_true, y_val_pred)

# Report every score as "<name> = <repr>".
for metric_name, metric_value in (
    ("acc_student_pred", acc_student_pred),
    ("acc_true_student", acc_true_student),
    ("acc_true_pred", acc_true_pred),
):
    print(f"{metric_name} = {metric_value!r}")

acc_student_pred = 0.9
acc_true_student = 1.0
acc_true_pred = 0.9


In [132]:
# TODO: add func to only print input (also printing output can be confusing)
def print_example(example: dict) -> None:
    """Pretty-print one example as an INPUT section followed by an OUTPUT section.

    Parameters
    ----------
    example : dict
        Example dictionary with 'input' and 'output' keys.
    """
    rule = "#" * 40  # horizontal separator framing each section title
    sections = [
        rule,
        "INPUT",
        rule,
        f"{example['input']}",
        rule,
        "OUTPUT",
        rule,
        f"{example['output']}",
    ]
    # Trailing newline keeps one blank line after the printed example.
    print("\n".join(sections) + "\n")


print_example(list_val[0])

########################################
INPUT
########################################
Question:
What is the Cartesian product of A = {1, 2} and B = {a, b}?

Options:
1. {(1, a), (2, a), (1, b), (2, b)}
2. {(1, a), (1, b), (2, a), (b, b)}
3. {(a, 1), (a, 2), (b, 1), (b, 2)}
4. {(1, 1), (2, 2), (a, a), (b, b)}

Correct answer: 0
########################################
OUTPUT
########################################
Student answer: 0

