In [17]:
import os
import openai
import tiktoken  # for counting tokens
import pandas as pd  # for DataFrames to store article sections and embeddings
import re
import datetime
from scipy import spatial  # for calculating vector similarities for search strings_ranked_by_relatedness()

## Env

Environment variables:
1. OpenAI API key
   
   (Before starting Jupyter from the terminal, I run something like the following.)
   
   ```unset HISTFILE```
   
   ```export OPENAI_API_KEY="some-secret-key-that-you-pay-yes-you-pay"```
   


2. path to markdown files
3. file logging last import date
4. CSV save path for storing embedding result

In [18]:
# 1. OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 2. Folder holding the markdown annotation files
folder_path = "my book annotations from the project https://github.com/bandono/kobo-annot-import"

# 3. File recording the date of the last import
#    (written by save_last_import_info(), read by load_last_import_info())
last_import_info_file = "last_import_info.txt"

# 4. Destination CSV for the embedding results
CSV_SAVE_PATH = "data/everfoam_book-annotations_2024.csv"

Import only the files that are new or updated since the last run, by comparing each file's modification time with the date stored in `last_import_info_file`.

*Note: to start over from scratch, simply delete that file.*

In [19]:
def save_last_import_info(last_import_info):
    """Persist the last-import marker to ``last_import_info_file``.

    The value is converted with ``str()`` and replaces whatever the file
    previously contained.
    """
    with open(last_import_info_file, "w") as marker_file:
        serialized = str(last_import_info)
        marker_file.write(serialized)

def load_last_import_info():
    """Return the datetime of the last import, or None if none is recorded.

    Reads ``last_import_info_file`` and parses its content with
    ``datetime.datetime.fromisoformat``. Surrounding whitespace (for example a
    trailing newline added by a text editor) is ignored, and a missing or
    blank file is treated as "never imported".

    Returns:
        datetime.datetime | None: the stored timestamp, or None when the file
        does not exist or is blank.

    Raises:
        ValueError: if the file holds text that is not an ISO-format date.
    """
    if not os.path.exists(last_import_info_file):
        return None
    with open(last_import_info_file, "r") as file:
        stored = file.read().strip()
    if not stored:
        return None
    return datetime.datetime.fromisoformat(stored)

def remove_fields_from_markdown(markdown_text):
    """Blank out the ``category:`` and ``UUID:`` field lines of a markdown text.

    Only lines that start with the field name are affected. Each matching line
    is replaced by an empty line (the line break itself is kept), and the
    result is stripped of leading and trailing whitespace.

    Args:
        markdown_text: full text of one markdown file.

    Returns:
        The text without the content of the ``category:`` and ``UUID:`` lines.
    """
    # Deliberately no `\s*` in front of `.*`: `\s` also matches line breaks, so
    # a field with an empty value (a bare "category:") would swallow the
    # following line too. `.` never crosses a newline.
    for field in ("category", "UUID"):
        markdown_text = re.sub(rf'^{field}:.*$', '', markdown_text, flags=re.MULTILINE)
    return markdown_text.strip()

def import_markdown_files(folder_path):
    """Read the markdown files in ``folder_path`` that need (re-)importing.

    On the first run (no last-import date recorded) every ``.md`` file is
    read. On later runs only files whose modification time is newer than the
    recorded date are read. Files containing ``status: duplicate`` are
    skipped, and the ``category:`` / ``UUID:`` fields are removed from the
    rest. The last-import date is then updated.

    Args:
        folder_path: directory containing the markdown files.

    Returns:
        list[str]: cleaned content of each imported file, ordered by file name.

    Side effects:
        Sets the module-level ``last_import_info`` to the previously recorded
        date (or None) and rewrites the last-import file.
    """
    # Kept as a module-level name for backward compatibility.
    global last_import_info

    # Take the timestamp before scanning so a file edited while this function
    # runs is picked up by the next import instead of being missed.
    import_started_at = datetime.datetime.now()
    last_import_info = load_last_import_info()

    markdown_contents = []
    # Sorted so the result order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.md'):
            continue
        file_path = os.path.join(folder_path, filename)

        # With a recorded date, skip files not modified after it.
        if last_import_info is not None:
            last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
            if last_modified <= last_import_info:
                continue

        with open(file_path, 'r', encoding='utf-8') as file:
            markdown_content = file.read()
        if 'status: duplicate' in markdown_content:
            continue
        markdown_contents.append(remove_fields_from_markdown(markdown_content))

    # Log the date of this import for the next run.
    save_last_import_info(import_started_at)
    return markdown_contents


## 1. Ingest book-annotations markdown files

Example usage:

```python
markdown_contents = import_markdown_files(folder_path)

# Now you can access each markdown content individually
for i, content in enumerate(markdown_contents):
    print(f"Content of markdown file {i+1}:\n{content}\n")
```

In [20]:
# NOTE: this call also rewrites `last_import_info_file`, so running this cell
# again returns only files modified since the previous run (possibly an empty list).
markdown_contents = import_markdown_files(folder_path)


## 2.  Embedding using GPT to Dataframe

Embedding book annotations using `text-embedding-ada-002`

### Alter 1

Embedding recalculated from the start.

Approach 1 ensures that all embeddings are recalculated and up to date, but it may be computationally expensive for a large number of files.

In [21]:
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# Calculate embeddings for all Markdown files, one API request per batch
embeddings = []
total_items = len(markdown_contents)
for batch_start in range(0, total_items, BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, total_items)
    batch = markdown_contents[batch_start:batch_end]
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
    # Order by the `index` each item reports so every embedding is paired
    # with the input text it was computed from.
    batch_embeddings = [e.embedding for e in sorted(response.data, key=lambda e: e.index)]
    assert len(batch_embeddings) == len(batch), "embedding count does not match batch size"
    embeddings.extend(batch_embeddings)

# Build the DataFrame: one row per Markdown file, with its text and embedding
df = pd.DataFrame({"text": markdown_contents, "embedding": embeddings})

### Alter 2 (to do)

Combine the new embeddings with the previously generated ones, if earlier embeddings already exist.

Approach 2 is more efficient for a large DataFrame and only a few newer or updated files.


In [26]:
# to do

## 3. Save embedding to CSV file

For a small dataset, saving the embeddings to a CSV file should be OK.

In [22]:
# save document embeddings (this overwrites any existing file at CSV_SAVE_PATH)

# Create the target folder first; to_csv() does not create missing directories.
os.makedirs(os.path.dirname(CSV_SAVE_PATH) or ".", exist_ok=True)
df.to_csv(CSV_SAVE_PATH, index=False)

## 4. Test the embedding (optional, no need to run)

A simple function to test the embedding.

In [23]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the texts in ``df`` by relatedness to ``query``.

    The query is embedded with ``EMBEDDING_MODEL`` and compared against every
    row's stored embedding using ``relatedness_fn``.

    Args:
        query: text to search for.
        df: DataFrame with a ``"text"`` column and an ``"embedding"`` column.
        relatedness_fn: callable taking (query_embedding, row_embedding) and
            returning a score; higher means more related. Defaults to
            1 minus the cosine distance.
        top_n: maximum number of results to return.

    Returns:
        Two tuples of equal length, ``(strings, relatednesses)``, sorted from
        most related to least. Both are empty when ``df`` has no rows.
    """
    # Nothing to rank: return early instead of paying for an API call and
    # then failing to unpack an empty zip().
    if len(df) == 0:
        return (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [None]:
# examples
# Print the 5 annotations most related to the query "overwork", best match first.
strings, relatednesses = strings_ranked_by_relatedness("overwork", df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")  # the `=` specifier prints "relatedness=<value>"
    display(string)