In [1]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search

import os # for loading environment variables
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# NOTE(review): `OpenAI` is already imported in the imports cell; this repeat is redundant.
from openai import OpenAI

# Shared API client used by the embedding and chat-completion calls in later cells.
# The key is read from the OPENAI_API_KEY environment variable (load_dotenv() ran above).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# models
EMBEDDING_MODEL = "text-embedding-ada-002"  # embeds the search query
GPT_MODEL = "gpt-3.5-turbo"  # default chat model; also the default for token counting

In [6]:
# CSV holding the letter chunks together with their embeddings stored as strings.
embeddings_path = "data/letter_embeddings.csv"

# The first CSV column is used as the row index.
df = pd.read_csv(embeddings_path, index_col=0)
# NOTE(review): the df.head() output below shows the column as "embeddings" (plural) plus an
# "Unnamed: 0" column, and its execution count (In [5]) predates this cell (In [6]) — confirm
# the CSV really has an "embedding" column so this cell survives Restart & Run All.
df['embedding'] = df['embedding'].apply(ast.literal_eval) # convert string to array

In [5]:
df.head()

Unnamed: 0,chunk,Number,Date,Origin,From,To,PDF Link,embeddings
0,"\nLetter 001 The Hague, c. 18 August 1872 Dear...",1,"August, 1872",T-H,VvG,TvG,http://www.vggallery.com/letters/001_V-T_001.pdf,"[-0.005897581111639738, -0.012334061786532402,..."
1,"\nLetter 002 The Hague, 13 Dec 1872 Dear Theo,...",2,13 December 1872,T-H,VvG,TvG,http://www.vggallery.com/letters/002_V-T_002.pdf,"[0.002705251332372427, -0.012125852517783642, ..."
2,"\nLetter 003 The Hague, January 1873 My dear T...",3,"January, 1873",T-H,VvG,TvG,http://www.vggallery.com/letters/003_V-T_003.pdf,"[0.0003611222200561315, -0.0034950226545333862..."
3,"\nLetter 004 The Hague, January 28 1873 Dear T...",4,28 January 1873,T-H,VvG,TvG,http://www.vggallery.com/letters/004_V-T_004.pdf,"[-0.012272024527192116, -0.013098468072712421,..."
4,"\nLetter 005 The Hague, 17 March 1873 Dear The...",5,17 March 1873,T-H,VvG,TvG,http://www.vggallery.com/letters/005_V-T_005.pdf,"[-0.0019975858740508556, -0.023258762434124947..."


In [13]:
# Search function: Given a query string and a dataframe of text and embeddings, returns the top N most related strings and their relatedness scores.
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 5
) -> tuple[list[str], list[float]]:
    """Rank the text chunks in `df` by relatedness to `query`.

    The query is embedded with EMBEDDING_MODEL through the module-level `client`,
    then scored against the embedding stored in every row.

    Args:
        query: Text to search for.
        df: Frame with a "chunk" column (text) and an "embedding" column
            (one vector per row).
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            cosine similarity (1 - cosine distance).
        top_n: Maximum number of results to return.

    Returns:
        Two parallel lists — the chunks and their scores — ordered from most to
        least related and truncated to `top_n`. Both are empty when `df` has no rows.
    """
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    # Walk the two columns directly; iterrows() would build a throwaway Series per row.
    scored = [
        (chunk, relatedness_fn(query_embedding, embedding))
        for chunk, embedding in zip(df["chunk"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:top_n]
    # Build the lists explicitly: unpacking zip(*top) raises ValueError when there are
    # no rows, and it yields tuples rather than the annotated lists.
    return [chunk for chunk, _ in top], [score for _, score in top]

In [14]:
# Try out a simple first prompt
query = """What did Van Gogh like about the process of painting his bedroom in Arles?"""

In [16]:
# Sanity-check retrieval: request only the single best-matching chunk for `query`
# and print its relatedness score (3 decimals) followed by the chunk text.
strings, relatednesses = strings_ranked_by_relatedness(query, df, top_n=1)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    print(string)

relatedness=0.861

Letter 554 Arles, 16 October 1888 My dear Theo,
 At last I can send you a little sketch [JH 1609] to give you at least an idea of the way the work is shaping up. For today I am all right again. My eyes are still tired by then I had a new idea in my head and here is the sketch of it.
 Another size 30 canvas. This time it's just simply my bedroom [F 482, JH 1608], only here colour is to do everything, and giving by its simplification a grander style to things, is to be suggestive here of rest or of sleep in general. In a word, looking at the picture ought to rest the brain, or rather the imagination.
 The walls are pale violet. The floor is of red tiles.
 The wood of the bed and chairs is the yellow of fresh butter, the sheets and pillows very light greenish-citron.
 The coverlet scarlet. The window green.
 The toilet table orange, the basin blue.
 The doors lilac.
And that is all — there is nothing in this room with its closed shutters.
The squareness of the furniture

## First attempt at RAG: a simple prompt

In [17]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokens = tiktoken.encoding_for_model(model).encode(text)
    return len(tokens)

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt for GPT: instructions, the question, then supporting letters.

    Letters are taken from strings_ranked_by_relatedness() in ranked order and
    appended as CONTEXT sections until adding the next one would push the
    prompt past `token_budget`.

    Args:
        query: The user's question.
        df: Frame of letter chunks and embeddings to search.
        model: Model name used to pick the tokenizer for counting tokens.
        token_budget: Maximum number of tokens the prompt may reach while
            letters are being added.

    Returns:
        The assembled prompt text.
    """
    strings, _ = strings_ranked_by_relatedness(query, df)
    introduction = """
    Use the below letters from the archive of Vincent Van Gogh to answer the subsequent question.
    Include quotes from the letters to support your answers.
    If the answer cannot be found in the letters, write "Sorry, I can't find an answer for you."
    """
    question = f"\n\nQUESTION:\n{query}"
    message = introduction + question
    for string in strings:
        next_article = f'\n\nCONTEXT:\n"""\n{string}\n"""'
        # `message` already contains the question, so it is not appended again for the
        # budget check; counting it twice would reject letters that actually fit.
        if num_tokens(message + next_article, model=model) > token_budget:
            break
        message += next_article
    return message

In [21]:
# Preview the prompt that would be sent for `query`, built with a 10000-token budget.
print(query_message(query, df, GPT_MODEL, token_budget=10000))


    Use the below letters from the archive of Vincent Van Gogh to answer the subsequent question.
    Include quotes from the letters to support your answers.
    If the answer cannot be found in the letters, write "Sorry, I can't find an answer for you."
    

QUESTION:
What did Van Gogh like about the process of painting his bedroom in Arles?

CONTEXT:
"""

Letter 554 Arles, 16 October 1888 My dear Theo,
 At last I can send you a little sketch [JH 1609] to give you at least an idea of the way the work is shaping up. For today I am all right again. My eyes are still tired by then I had a new idea in my head and here is the sketch of it.
 Another size 30 canvas. This time it's just simply my bedroom [F 482, JH 1608], only here colour is to do everything, and giving by its simplification a grander style to things, is to be suggestive here of rest or of sleep in general. In a word, looking at the picture ought to rest the brain, or rather the imagination.
 The walls are pale violet. The

In [22]:
# Obtain a response from GPT for our query message
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with a chat model, grounded in letters retrieved from `df`.

    The prompt is assembled by query_message() and sent together with a fixed
    system message at temperature 0. Set `print_message` to echo the prompt
    before the request is made. The default `df` is whatever the module-level
    frame was when this definition was executed.

    Returns the content of the first completion choice.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    system_turn = {"role": "system", "content": "You answer questions about Vincent Van Gogh's life and works."}
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0,
    )
    return completion.choices[0].message.content

In [32]:
# NOTE(review): consider moving this import to the imports cell at the top of the notebook.
import textwrap

print(f"{query}\n")

# Baseline answer using ask()'s defaults; wrapped to 100 columns for readability.
response_v0 = ask(query)
print(textwrap.fill(response_v0,100))

What did Van Gogh like about the process of painting his bedroom in Arles?

Van Gogh liked the process of painting his bedroom in Arles because it allowed him to create a space
that was restful and suggestive of sleep. In a letter to Theo, he describes his vision for the room,
saying, "Only here colour is to do everything, and giving by its simplification a grander style to
things, is to be suggestive here of rest or of sleep in general. In a word, looking at the picture
ought to rest the brain, or rather the imagination." He goes on to describe the colors he used for
each element of the room, such as pale violet walls, red tiles for the floor, and yellow wood for
the bed and chairs. He also mentions that there are no shadows or cast shadows in the painting, and
that it is painted in free flat tints like Japanese prints.


Now, let's see if the quality of the response changes when we use GPT-4:

In [34]:
print(f"{query}\n")

# Same question answered with model="gpt-4"; every other ask() argument keeps its default.
response_gpt4 = ask(query, model="gpt-4")
print(textwrap.fill(response_gpt4,100))

What did Van Gogh like about the process of painting his bedroom in Arles?

Van Gogh enjoyed the simplicity and the use of color in painting his bedroom in Arles. He wrote,
"This time it's just simply my bedroom, only here colour is to do everything, and giving by its
simplification a grander style to things, is to be suggestive here of rest or of sleep in general.
In a word, looking at the picture ought to rest the brain, or rather the imagination." He also
appreciated the process of using color to evoke certain feelings or ideas, such as rest and sleep.
He detailed the colors he used, such as pale violet for the walls, red for the floor tiles, and the
yellow of fresh butter for the bed and chairs. He also enjoyed the process of suppressing shadows
and painting in free flat tints, similar to Japanese prints.


In [33]:
print(f"{query}\n")

# Same question answered with model="gpt-4-turbo-preview"; other ask() arguments keep
# their defaults.
response_gpt4_turbo = ask(query, model="gpt-4-turbo-preview")
print(textwrap.fill(response_gpt4_turbo,100))

What did Van Gogh like about the process of painting his bedroom in Arles?

Vincent Van Gogh appreciated several aspects of painting his bedroom in Arles, as revealed in his
letter to his brother Theo. One of the primary elements he enjoyed was the use of color to evoke a
sense of rest or sleep. He stated, "For today I am all right again... This time it's just simply my
bedroom, only here colour is to do everything, and giving by its simplification a grander style to
things, is to be suggestive here of rest or of sleep in general. In a word, looking at the picture
ought to rest the brain, or rather the imagination." This quote highlights Van Gogh's intention to
use color in a way that would simplify the scene and evoke a calming, restful feeling in the viewer.
Van Gogh also enjoyed the challenge of painting with a limited palette, as he aimed to suppress
shadows and cast shadows, painting in "free flat tints like the Japanese prints." This approach was
a departure from more traditional

In [25]:
# Prompt iterations

In [None]:
"""
Use the below description of a painting, described in {PAINTING}, as well as the letters from the archive of Vincent Van Gogh, described 
in {CONTEXT} to answer the subsequent question. How does Vincent Van Gogh describe this painting in his personal letters? 
How does he feel about it?
"""