# Imports

In [None]:
# standard library imports
import os
import random
from typing import Callable

# related third party imports
import dotenv
import pandas as pd
from langchain_chroma import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.output_parsers import PydanticOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field, ValidationError
from yacs.config import CfgNode

# local application/library specific imports
from example_selector.example_selector import (
    RandomExampleSelector,
    StudentIDExampleSelector,
)
from data_loader.data_loader import DataLoader
from tools.constants import SILVER_DIR, TRAIN, VALIDATION, TEST
from prompt_template.example_prompt import (
    df_to_listdict,
    human_format_input,
    human_format_output,
    apply_prompt_fmt,
)
from model.build import build_model

# Reload the variables in your '.env' file (override the existing variables)
dotenv.load_dotenv("../.env", override=True)

In [None]:
# Lookup table: model name -> whether the notebook should use the model's
# native structured output (True) or fall back to prompt-based format
# instructions plus a Pydantic output parser (False).
# NOTE(review): the flags look hand-maintained -- confirm them against the
# actual capabilities of each model before adding new entries.
MODEL_STRUCTURED_OUTPUT = {
    "llama3": False,
    "llama3.2": True,
    "olmo2": False,
    "gpt-4": True,
    "gpt-4o-mini": True,
}

In [None]:
### INPUTS ###
# Model to run. It must be a key of MODEL_STRUCTURED_OUTPUT, otherwise the
# lookup below raises a KeyError. Alternatives are kept as trailing comments.
MODEL_NAME = "gpt-4o-mini"  # "llama3"  # "llama3.2"
MODEL_PROVIDER = "openai"  # "ollama"
# Decides which output-handling path the rest of the notebook takes.
SUPPORTS_STRUCTURED_OUTPUT = MODEL_STRUCTURED_OUTPUT[MODEL_NAME]

In [None]:
# Model configuration node handed to `build_model` further down.
model_cfg = CfgNode(
    {
        "NAME": MODEL_NAME,
        "PROVIDER": MODEL_PROVIDER,
        "TEMPERATURE": 0.5,
        "MAX_TOKENS": None,
        "TIMEOUT": None,
        "MAX_RETRIES": None,
    }
)

# Data

In [None]:
# Read the dataset and split it with a fixed seed.
data_loader = DataLoader(read_dir=SILVER_DIR, dataset_name="dbe_kt22")
dataset = data_loader.split_data(train_size=0.6, test_size=0.25, seed=42)

# Apply the same prompt formatting to every split (dataframes).
df_train, df_val, df_test = (
    apply_prompt_fmt(
        df=dataset[split],
        input_fmt=human_format_input,
        output_fmt=human_format_output,
    )
    for split in (TRAIN, VALIDATION, TEST)
)

# Convert each formatted dataframe into a list of dicts.
list_train, list_val, list_test = map(
    df_to_listdict, (df_train, df_val, df_test)
)

# Dynamic few-shot prompting

## Create example selector

NOTE: I need OpenAI credits to use the OpenAI embeddings.

In [None]:
# examples = few_shot_list
# to_vectorize = [" ".join(example.values()) for example in examples]
# embeddings = OpenAIEmbeddings()
# vectorstore = Chroma.from_texts(to_vectorize, embeddings, metadatas=examples)

In [None]:
# example_selector = SemanticSimilarityExampleSelector(
#     vectorstore=vectorstore,
#     k=2,
# )

# # The prompt template will load examples by passing the input to the `select_examples` method
# example_selector.select_examples({"input": "horse"})

In [None]:
# # Create the selector with k=3 for 3-shot prompting
# example_selector = RandomExampleSelector(examples=list_train, k=3)
# example_selector.select_examples({})

In [None]:
# Select examples of a specific student
# The training examples are the pool the selector draws from; k is set to 3.
example_selector = StudentIDExampleSelector(examples=list_train, k=3)
# Quick check for a single student id; as the last expression of the cell,
# the selected examples are shown as the cell output.
example_selector.select_examples({"student_id": 395})

## Create prompt template

In [None]:
# Pydantic
class MCQAnswer(BaseModel):
    """Answer to a multiple-choice question."""

    # This model is the target schema for both output paths in this notebook:
    # `with_structured_output(MCQAnswer)` and the `PydanticOutputParser`.
    # NOTE(review): the docstring and the `description` strings presumably end
    # up in the schema shown to the model, so treat them as prompt text and
    # edit them deliberately.

    # Free-text reasoning behind the chosen answer.
    explanation: str = Field(
        description="Misconception if incorrectly answered; motivation if correctly answered"
    )
    # Chosen option. The 1-4 range is only stated in the description; no
    # validation constraint is set on the field itself.
    student_answer: int = Field(
        description="The student's answer to the question, as an integer (1-4)"
    )
    # difficulty: str = Field(description="The difficulty level of the question")

In [None]:
# Base instructions shared by both output-handling paths. Every sentence keeps
# its trailing space so the pieces concatenate into one paragraph.
system_prompt_raw = "".join(
    [
        "You are a student working on {exam_type}, containing multiple choice questions. ",
        "You are shown a set of questions that you answered earlier in the exam, together with the correct answers and your student answers. ",
        "Analyse your responses to the questions and identify the possible misconceptions that led to answering incorrectly. ",
        "Inspect the new question and think how you would answer it as a student. ",
        "If you answer incorrectly, explain which misconception leads to selecting that answer. ",
        "If you answer correctly, explain why you think the answer is correct. ",
        "Provide your answer as an integer in the range 1-4. ",
    ]
)

if not SUPPORTS_STRUCTURED_OUTPUT:
    # Manual path: a parser supplies the format instructions, which are
    # appended to the prompt and pre-filled as a partial variable.
    parser = PydanticOutputParser(pydantic_object=MCQAnswer)
    system_prompt_raw += "Wrap the output in `json` tags\n{format_instructions}"
    format_instructions = parser.get_format_instructions()
    system_prompt_template = PromptTemplate.from_template(system_prompt_raw).partial(
        format_instructions=format_instructions
    )
else:
    # Native structured output: the plain instructions are enough.
    system_prompt_template = PromptTemplate.from_template(system_prompt_raw)

# Fill in the exam description; the resulting text is shown as the cell output.
system_prompt_input = system_prompt_template.format(
    exam_type="a database systems exam (Department of Computer Science)",
)
system_prompt_input

In [None]:
# FIXME: problem if SUPPORTS_STRUCTURED_OUTPUT is False

# Error: Note: if you intended {"description"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"description"}}\'
# -> how do I do this???

In [None]:
# # TODO: check if this works

# system_prompt_raw = (
#     "You are a student working on {exam_type}, containing multiple choice questions. "
#     "You are shown a set of questions that you answered earlier in the exam, together with the correct answers and your student answers. "
#     "Analyse your responses to the questions and identify the possible misconceptions that led to answering incorrectly. "
#     "Inspect the new question and think how you would answer it as a student. "
#     "If you answer incorrectly, explain which misconception leads to selecting that answer. "
#     "If you answer correctly, explain why you think the answer is correct. "
#     "Provide your answer as an integer in the range 1-4. "
#     "Wrap the output in `json` tags\n{format_instructions}"
# )

# prompt = PromptTemplate(
#     template=system_prompt_raw,
#     input_variables=[],
#     partial_variables={"format_instructions": parser.get_format_instructions()}
# )
# system_prompt_template = SystemMessagePromptTemplate(prompt=prompt)
# system_prompt_template.format(
#     exam_type="a database systems exam (Department of Computer Science)",
# )

In [None]:
# Every selected example is rendered as two chat messages: the question as a
# human message and the recorded answer as an AI message.
example_message_prompt = ChatPromptTemplate.from_messages(
    [("human", "{input}"), ("ai", "{output}")]
)

# Few-shot prompt: `student_id` is the input variable forwarded to the
# example selector when examples are chosen.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_selector=example_selector,
    example_prompt=example_message_prompt,
    input_variables=["student_id"],
)

# Render the few-shot messages for the first validation example and inspect them.
out = few_shot_prompt.invoke(input=list_val[0]).to_messages()
print(len(out), out, sep="\n")

In [None]:
# `system_prompt_input` is already fully formatted text, but
# `ChatPromptTemplate.from_messages` parses every ("role", str) tuple as an
# f-string template again. When the prompt contains the JSON-schema format
# instructions (manual parsing path), their literal braces, e.g.
# {"description": ...}, would be mistaken for template variables and raise an
# error. Doubling the braces escapes them; for a prompt without braces this
# is a no-op.
system_prompt_escaped = system_prompt_input.replace("{", "{{").replace("}", "}}")

# Full prompt: system instructions, the few-shot examples selected for the
# student, then the new question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt_escaped),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

# Render the prompt for the first validation example and inspect the messages.
print(list_val[0])
out = final_prompt.invoke(input=list_val[0]).to_messages()
print(len(out))
print(out)

# Model

In [None]:
# model
model_raw = build_model(model_cfg=model_cfg)

# chain
if SUPPORTS_STRUCTURED_OUTPUT:
    # The model returns `MCQAnswer` objects directly.
    model = model_raw.with_structured_output(MCQAnswer)
    chain = final_prompt | model
else:
    # The model returns free text; pipe it into the Pydantic parser that also
    # produced the format instructions in the system prompt. Runnables are
    # composed with `|`, so the parser is appended as the last step.
    model = model_raw
    chain = final_prompt | model | parser

In [None]:
# run model
# Smoke test: invoke the chain on the first validation example.
val_example = list_val[0]
val_output = chain.invoke(val_example)
# Last expression of the cell, so the result is displayed as the cell output.
val_output

In [None]:
len(list_val)

In [None]:
def print_example(example: dict, *, show_output: bool = True) -> None:
    """Print single example.

    Each section is printed as a title framed by two lines of 40 ``#``
    characters, followed by the section text.

    Parameters
    ----------
    example : dict
        Example dictionary with an 'input' key and, when ``show_output`` is
        true, an 'output' key.
    show_output : bool, optional
        If True (default), print the OUTPUT section after the INPUT section.
        If False, print only the INPUT section; the 'output' key is then not
        read, so examples without it can be printed as well.

    Raises
    ------
    KeyError
        If a required key is missing from ``example``.
    """
    banner = "#" * 40
    sections = [("INPUT", example["input"])]
    if show_output:
        sections.append(("OUTPUT", example["output"]))
    text = "".join(
        f"{banner}\n{title}\n{banner}\n{body}\n" for title, body in sections
    )
    print(text)


print_example(list_val[0])

Manually implementing structured output

In [None]:
# # Set up a parser
# parser = PydanticOutputParser(pydantic_object=MCQAnswer)

# # Prompt
# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
#         ),
#         ("human", "{query}"),
#     ]
# ).partial(format_instructions=parser.get_format_instructions())

# query = "Anna is 23 years old and she is 6 feet tall"

# print(prompt.invoke({"query": query}).to_string())

In [None]:
# chain = prompt | model | parser

# chain.invoke({"query": query})

In [None]:
# from typing import List

# from langchain_core.prompts import ChatPromptTemplate
# from langchain.chat_models import init_chat_model
# from pydantic import BaseModel, Field


# class Person(BaseModel):
#     """Information about a person."""

#     name: str = Field(..., description="The name of the person")
#     height_in_meters: float = Field(
#         ..., description="The height of the person expressed in meters."
#     )


# class People(BaseModel):
#     """Identifying information about all people in a text."""

#     people: List[Person]


# # Prompt
# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "Answer the user query.",
#         ),
#         ("human", "{query}"),
#     ]
# )

# query = "Anna is 23 years old and she is 6 feet tall"

# print(prompt.invoke({"query": query}).to_string())

# model_name = "llama3.2"  # "gpt-4o-mini"
# model_provider = "ollama"  # "openai"
# llm = init_chat_model(model_name, model_provider=model_provider)
# structured_llm = llm.with_structured_output(People)
# chain = prompt | structured_llm

# chain.invoke({"query": query})

In [None]:
# from typing import List

# from langchain_core.output_parsers import PydanticOutputParser
# from langchain_core.prompts import ChatPromptTemplate
# from langchain.chat_models import init_chat_model
# from pydantic import BaseModel, Field


# class Person(BaseModel):
#     """Information about a person."""

#     name: str = Field(..., description="The name of the person")
#     height_in_meters: float = Field(
#         ..., description="The height of the person expressed in meters."
#     )


# class People(BaseModel):
#     """Identifying information about all people in a text."""

#     people: List[Person]


# # Set up a parser
# parser = PydanticOutputParser(pydantic_object=People)

# # Prompt
# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
#         ),
#         ("human", "{query}"),
#     ]
# ).partial(format_instructions=parser.get_format_instructions())

# query = "Anna is 23 years old and she is 6 feet tall"

# print(prompt.invoke({"query": query}).to_string())

# model_name = "llama3"  # "llama3.2"  # "gpt-4o-mini"
# model_provider = "ollama"  # "openai"
# llm = init_chat_model(model_name, model_provider=model_provider)
# chain = prompt | llm | parser

# chain.invoke({"query": query})