In [1]:
import os
import openai
import tiktoken  # for counting tokens
import pandas as pd  # for DataFrames to store article sections and embeddings
import ast
from scipy import spatial  # for calculating vector similarities for search strings_ranked_by_relatedness()
from IPython.display import Markdown # for rendering answer directly in jupyter cell

## Env

Environment variables:
1. OpenAI API key
   
   (Before starting Jupyter from the terminal, I run something like the following.)
   
   ```unset HISTFILE```
   
   ```export OPENAI_API_KEY="some-secret-key-that-you-pay-yes-you-pay"```
   


2. Embedding source (currently a CSV file)

In [2]:
# 1. OpenAI client, authenticated with the key exported in the environment
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 2. CSV file the embeddings are loaded from
CSV_SAVE_PATH = "data/everfoam_book-annotations_2024.csv"

##  1.  Load Embeddings

Load from CSV

In [3]:
# Load the stored DataFrame. The 'embedding' column is serialized as text in the
# CSV, so ast.literal_eval parses each cell back into a Python literal.
df = pd.read_csv(CSV_SAVE_PATH, converters={'embedding': ast.literal_eval})

# Fail fast if the columns the search step reads are missing, rather than
# discovering it on the first query after an embedding request has been sent.
assert {"text", "embedding"} <= set(df.columns), (
    "CSV must contain 'text' and 'embedding' columns"
)

In [4]:
# Model used to embed the search query in strings_ranked_by_relatedness().
# NOTE(review): presumably the embeddings stored in the CSV were created with the same model — confirm.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Default chat model, used both for answering (ask) and for token counting (num_tokens).
GPT_MODEL = "gpt-3.5-turbo"

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Rank the texts in ``df`` by how related they are to ``query``.

    Args:
        query: Search text; it is embedded with ``EMBEDDING_MODEL``.
        df: DataFrame with a "text" column and an "embedding" column.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where higher means more related. Defaults to
            cosine similarity (1 - cosine distance).
        top_n: Maximum number of results to return.

    Returns:
        Two parallel lists ``(texts, scores)``, sorted from most to least
        related and truncated to ``top_n`` entries. Both lists are empty when
        ``df`` has no rows or ``top_n`` is not positive.
    """
    # Nothing to rank: return early instead of sending an embedding request
    # and then failing when zip(*[]) is unpacked into two names.
    if len(df) == 0 or top_n <= 0:
        return [], []

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Walk the two columns directly; iterrows() would build a Series per row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    best = scored[:top_n]
    # Return real lists so the result matches the declared return type.
    return [text for text, _ in best], [score for _, score in best]

Quick sanity check of the search function:

In [None]:
# Sanity check: show the five texts most related to a sample query
strings, relatednesses = strings_ranked_by_relatedness("cynefin", df, top_n=5)
for relatedness, string in zip(relatednesses, strings):
    print(f"relatedness={relatedness:.3f}")
    display(string)

## 2. Ask

Retrieve knowledge with prompting to GPT.

To troubleshoot the message that is sent to GPT, call `ask(...)` with `print_message=True`.

In [5]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the prompt for GPT from the texts in `df` most related to `query`.

    The prompt is the fixed instructions, then as many ranked texts as fit
    within `token_budget` tokens (counted for `model`), then the question.
    """
    # Only the ranked texts are needed here; the scores are discarded.
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)
    question = f"\n\nQuestion: {query}"
    message = 'Use the below markdown content to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    for text in ranked_texts:
        candidate = message + f'\n\nMarkdown content:\n"""\n{text}\n"""'
        # Stop at the first text that would push the full prompt over budget.
        if num_tokens(candidate + question, model=model) > token_budget:
            break
        message = candidate
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, using related texts from `df` as context.

    Args:
        query: The question to answer.
        df: DataFrame of texts and embeddings to search. The default is bound
            when this cell runs, so re-run the cell after reloading `df`.
        model: Chat model used for token counting and for the completion.
        token_budget: Maximum number of tokens allowed for the prompt.
        print_message: When True, print the prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions with relevant embedding mentioning book-title"},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )
    return completion.choices[0].message.content

In [11]:
# Ask the question and keep the reply text
answer = ask(
    'What information do you have have on Cynefin from annotation of books that I have created? Explain in pointers'
)

# Last expression in the cell, so Jupyter renders the reply as Markdown
Markdown(answer)

- In "Lean Software Development: An Agile Toolkit" by Mary Poppendieck, the concept of Cynefin is discussed in relation to two schools of thought in software development: one emphasizing perfect design and code from the start, and the other advocating for small, rapid cycles of trying, testing, and fixing, especially for ill-structured problems.
- The book "Essential Scrum: A Practical Guide to the Most Popular Agile Process" by Kenneth S. Rubin mentions that software development does not neatly fit into just one Cynefin domain, as it involves aspects that overlap and activities that can fall into different domains, such as complicated or complex.
- The articles do not provide direct information on Cynefin from the other book annotations.