# RAG Experiments

## Imports

In [1]:
import dotenv
import json
import os
import random

import datasets
import langchain

import numpy as np
import pandas as pd

import faiss
from openai import OpenAI
import tiktoken

import semantic_kernel as sk

from tqdm import tqdm


## Utilities

In [2]:
def key_width(d):
    return max(len(str(k)) for k in d.keys())

def print_dict(d):
    width = key_width(d) + 1
    for k, v in d.items():
        # It's a quiet English format string, and you are a horrible goose.
        print(f'{{k:{width}}}: {{v}}'.format(k=k, v=v))

## Datasets

### ROCStories

The ROCStories corpus might be a good one to just test our ability to load a dataset and use it with various language chain tools.

In [3]:
# Load the ROCStories dataset
dataset = datasets.load_dataset('Ximing/ROCStories')

# Explore the dataset
element = dataset['train'][0]
print_dict(element)

story_id         : 080198fc-d0e7-42b3-8e63-b2144e59d816
prompt           : On my way to work I stopped to get some coffee.
continuation     : I went through the drive through and placed my order. I paid the cashier and patiently waited for my drink. When she handed me the drink, the lid came off and spilled on me. The coffee hurt and I had to go home and change clothes.
constraint_words : ['drive', 'order', 'drink', 'lid', 'coffee', 'hurt', 'home', 'change', 'clothes']


### LCATS

Define paths to our corpora and environment files.

In [4]:
# If the following code is run from lcats/notebooks in VSCode and the data is in lcats/data ...
CURRENT_PATH = os.path.abspath(os.curdir)  # This is where the notebook is executing.
PROJECT_ROOT = os.path.dirname(CURRENT_PATH)   # This should be the root of the project.
DEV_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, 'data'))  # Local copy of the data.
GIT_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, '../corpora'))  # Data in the git repo.
OPENIA_API_KEYS_ENV = os.path.abspath(os.path.join(PROJECT_ROOT, '../.secrets/openai_api_keys.env'))  # Local OpenAI API key.

DEV_CORPUS, GIT_CORPUS, OPENIA_API_KEYS_ENV

('/Users/centaur/Workspace/LCATS/LCATS/lcats/data',
 '/Users/centaur/Workspace/LCATS/LCATS/corpora',
 '/Users/centaur/Workspace/LCATS/LCATS/.secrets/openai_api_keys.env')

In [5]:
assert os.path.exists(DEV_CORPUS), f"DEV_CORPUS does not exist: {DEV_CORPUS}"
assert os.path.exists(GIT_CORPUS), f"GIT_CORPUS does not exist: {GIT_CORPUS}"
assert os.path.exists(OPENIA_API_KEYS_ENV), f"API_ENV does not exist: {OPENIA_API_KEYS_ENV}"

Create our OpenAI Client

In [6]:
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
print(OPENAI_API_KEY)


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


In [7]:

client = OpenAI(
    api_key=OPENAI_API_KEY
)

Next, get copies of the files.

In [8]:
def load_corpus(data_dir):
    corpus = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.json'):
                with open(os.path.join(root, file)) as f:
                    data = json.load(f)
                    corpus.append({
                        'name': data['name'],
                        'body': data['body'],
                        'metadata': data['metadata'],
                    })
    return corpus


We should have 20-30 files if all goes well.

In [9]:
lcats_corpus = load_corpus(DEV_CORPUS)
len(lcats_corpus)

29

In [10]:
def count_tokens(story_text, model="gpt-3.5-turbo"):
    """
    Counts the number of tokens in the given story text for a specific OpenAI model.

    Parameters:
        story_text (str): The text to tokenize.
        model (str): The OpenAI model to use for tokenization. Default is "gpt-3.5-turbo".

    Returns:
        int: The number of tokens in the story text.
    """
    # Get the tokenizer for the specified model
    encoding = tiktoken.encoding_for_model(model)
    
    # Tokenize the text
    tokens = encoding.encode(story_text)
    
    # Return the number of tokens
    return len(tokens)

In [11]:
lcats_analysis = []
for story in lcats_corpus:
    story_name = story['name']
    story_text = story['body']
    story_len = len(story_text)
    story_tokens = count_tokens(story_text)
    readable_by_gpt_3_5 = story_tokens < 4096
    readable_by_gpt_4o = story_tokens < 32768
    lcats_analysis.append({
        'name': story_name,
        'length': story_len,
        'tokens': story_tokens,
        'readable_by_gpt_3_5': readable_by_gpt_3_5,
        'readable_by_gpt_4o': readable_by_gpt_4o,
    })
lcats_analysis = pd.DataFrame(lcats_analysis)
lcats_analysis

Unnamed: 0,name,length,tokens,readable_by_gpt_3_5,readable_by_gpt_4o
0,Sherlock Holmes - The Adventure of the Enginee...,44730,10908,False,True
1,Sherlock Holmes - The Man with the Twisted Lip,49349,12186,False,True
2,Sherlock Holmes - The Five Orange Pips,39584,9694,False,True
3,Sherlock Holmes - A Case of Identity,38026,9316,False,True
4,Sherlock Holmes - The Red-Headed League,49431,12140,False,True
5,Sherlock Holmes - The Adventure of the Blue Ca...,42260,10639,False,True
6,Sherlock Holmes - The Adventure of the Copper ...,53193,13112,False,True
7,Sherlock Holmes - The Adventure of the Noble B...,44261,10796,False,True
8,Sherlock Holmes - A Scandal in Bohemia,46620,11457,False,True
9,Sherlock Holmes - The Adventure of the Speckle...,53162,12909,False,True


Now, generate embeddings

In [12]:
def get_entry_text(entry):
    return f"{entry['name']}\n{entry['body']}"

def chunk_text_for_embeddings(text):
    chunks = text.split("\n\n")
    return chunks

def get_embeddings_for_text(text):
    try:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"  # Specify the embedding model
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None
    
def get_embeddings_for_corpus(corpus):
    print("-" * 72)
    print(f"Generating embeddings for {len(corpus)} corpus entries.")
    print("-" * 72)
    corpus_with_embeddings = []
    for entry in corpus:
        print(f"Generating embeddings for {entry['name']}:")
        text = get_entry_text(entry)
        chunks = chunk_text_for_embeddings(text)
        print(f"  {len(chunks)} chunks found.")
        for chunk in tqdm(chunks):
            embedding = get_embeddings_for_text(chunk)
            corpus_with_embeddings.append({
                'text': chunk,
                'embedding': embedding,
                'metadata': entry['metadata']
            })
        print()
    return corpus_with_embeddings

Make it possible to save and load the data 

In [13]:
def save_embeddings_json(data, filepath):
    with open(filepath, 'w') as f:
        json.dump(data, f)

def load_embeddings_json(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

In [14]:
# Generate embeddings for each chunk
if False:
    corpus_with_embeddings = get_embeddings_for_corpus(lcats_corpus)
    save_embeddings_json(corpus_with_embeddings, 'output/lcats_corpus_embeddings.json')
else:
    corpus_with_embeddings = load_embeddings_json('output/lcats_corpus_embeddings.json')

Examine the loaded corpus

In [15]:
(len(corpus_with_embeddings), 
 corpus_with_embeddings[0].keys(), 
 len(corpus_with_embeddings[0]['embedding']))

(4863, dict_keys(['text', 'embedding', 'metadata']), 1536)

In [16]:
def summarize_chunk(chunk):
    story_name = chunk['metadata']['name']
    story_author = chunk['metadata']['author']
    embedding = chunk['embedding']
    embedding_len = len(embedding)
    text = chunk['text']
    text_len = len(text)
    text_tokens = count_tokens(text)

    print(f"Chunk from Story: '{story_name}' by {story_author}")
    print(f" - Embedding ({embedding_len} elements): {embedding[:3] + ['...']}")
    print(f" - Snippet ({text_len} characters, {text_tokens} tokens): '{text.strip()}'")


In [17]:
summarize_chunk(random.choice(corpus_with_embeddings))

Chunk from Story: 'The Haunter of the Dark' by H. P. Lovecraft
 - Embedding (1536 elements): [-0.0106479711830616, 0.00693340040743351, 0.002756253583356738, '...']
 - Snippet (539 characters, 130 tokens): 'He had to keep the house dark in order to see out the window, and it
appears that most of his time was spent at his desk, peering anxiously
through the rain across the glistening miles of downtown roofs at the
constellation of distant lights marking Federal Hill. Now and then he
would fumblingly make an entry in his diary, so that detached phrases
such as "The lights must not go"; "It knows where I am"; "I must
destroy it"; and "It is calling to me, but perhaps it means no injury
this time"; are found scattered down two of the pages.'


Create the vector database

In [18]:
# Create index
dimension = len(corpus_with_embeddings[0]['embedding'])
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
embeddings = np.array([item['embedding'] for item in corpus_with_embeddings])
index.add(embeddings)

Retrieve from the vector database

In [19]:
def get_chunks_for_query(query, top_n=5):
    query_embedding = get_embeddings_for_text(query)
    _, indices = index.search(np.array([query_embedding]), top_n)
    return [corpus_with_embeddings[i] for i in indices[0]]

for chunk in get_chunks_for_query("The cat sat on the mat."):
    summarize_chunk(chunk)


Chunk from Story: 'twisted_lip' by Arthur Conan Doyle
 - Embedding (1536 elements): [-0.0003073564439546317, 0.003910928964614868, 0.0015472694067284465, '...']
 - Snippet (88 characters, 22 tokens): 'Sherlock Holmes sat down beside him on the couch and patted him kindly on the
shoulder.'
Chunk from Story: 'speckled_band' by Arthur Conan Doyle
 - Embedding (1536 elements): [0.022861696779727936, 0.008309729397296906, 0.02262313850224018, '...']
 - Snippet (295 characters, 79 tokens): '“Ah, yes, of course! Well, a cheetah is just a big cat, and yet a saucer
of milk does not go very far in satisfying its wants, I daresay. There is one
point which I should wish to determine.” He squatted down in front of the
wooden chair and examined the seat of it with the greatest attention.'
Chunk from Story: 'scandal_in_bohemia' by Arthur Conan Doyle
 - Embedding (1536 elements): [0.019529543817043304, -0.009010582230985165, 0.016645098105072975, '...']
 - Snippet (89 characters, 23 tokens): 'The King

In [20]:
def generate_context_from_chunks(chunks):
    return "\n".join([chunk['text'] for chunk in chunks])

def generate_prompt_from_query_and_context(query, context):
    return f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    
def elaborate_query_with_context(query):
    chunks = get_chunks_for_query(query)
    context = generate_context_from_chunks(chunks)
    return generate_prompt_from_query_and_context(query, context)

In [21]:
print(elaborate_query_with_context("What did the cat do?"))

Context:

“What of the rat, then?”

“What, then, did Peterson do?”

“There isn’t a cat in it, for example?”

I had forgotten the strange pets which the Doctor affected. There was a
cheetah, too; perhaps we might find it upon our shoulders at any moment. I
confess that I felt easier in my mind when, after following Holmes’
example and slipping off my shoes, I found myself inside the bedroom. My
companion noiselessly closed the shutters, moved the lamp onto the table, and
cast his eyes round the room. All was as we had seen it in the daytime. Then
creeping up to me and making a trumpet of his hand, he whispered into my ear
again so gently that it was all that I could do to distinguish the words:

“How did he come?”

Question: What did the cat do?
Answer:


Generate completions

In [22]:
def generate_completions(prompt, max_tokens=100):
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )
    return completion.choices[0].message.content

def retrieve_and_generate(query):
    elaborated_query = elaborate_query_with_context(query)
    return generate_completions(elaborated_query)

In [23]:
retrieve_and_generate("What did the cat do?")

'The text does not provide information on what the cat did.'

In [24]:
retrieve_and_generate("Who is Sherlock's friend?")

'Dr. Watson'

In [25]:
retrieve_and_generate("What year was The Adventure of the Engineer's Thumb written?")

'1892'

In [26]:
generate_completions("Who is Sherlock's friend?")

'Dr. John Watson'

In [27]:
generate_completions("What did the cat do?")

'As an AI, I need more context to answer this question accurately. What cat are you referring to?'

## NEXTUP: STORY ANALYSIS