# RAG Experiments

## Imports

In [132]:
import dotenv
import json
import os
import random

import datasets
import langchain

import numpy as np
import pandas as pd

import faiss
from openai import OpenAI
import tiktoken

import semantic_kernel as sk

from tqdm import tqdm


## Utilities

In [88]:
def key_width(d):
    """
    Returns the length of the longest key in a dict, measured on str(key).

    Parameters:
        d (dict): The mapping whose keys are measured.

    Returns:
        int: The width of the widest key, or 0 when the dict is empty.
    """
    # default=0 keeps max() from raising ValueError on an empty mapping.
    return max((len(str(k)) for k in d), default=0)

def print_dict(d):
    """
    Prints every key/value pair of a dict on its own line as "key: value".

    Keys are formatted to a common field width (widest key plus one) so the
    colons line up.

    Parameters:
        d (dict): The mapping to print.
    """
    pad = 1 + key_width(d)
    for key, value in d.items():
        # The nested replacement field supplies the field width for the key.
        print(f'{key:{pad}}: {value}')

## Datasets

### ROCStories

The ROCStories corpus might be a good one for simply testing our ability to load a dataset and use it with various LangChain tools.

In [None]:
# Load the ROCStories dataset ('Ximing/ROCStories') with datasets.load_dataset.
dataset = datasets.load_dataset('Ximing/ROCStories')

# Explore the dataset: take the first record of the 'train' split and print
# its key/value pairs, one per line.
element = dataset['train'][0]
print_dict(element)

### LCATS

Define paths to our corpora and environment files.

In [None]:
# If the following code is run from lcats/notebooks in VSCode and the data is in lcats/data ...
# Every path below is derived from the kernel's current directory, so the
# values are only right when the notebook is started from that location.
CURRENT_PATH = os.path.abspath(os.curdir)  # This is where the notebook is executing.
PROJECT_ROOT = os.path.dirname(CURRENT_PATH)   # This should be the root of the project.
DEV_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, 'data'))  # Local copy of the data.
GIT_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, '../corpora'))  # Data in the git repo.
# NOTE(review): "OPENIA" looks like a typo for "OPENAI"; the name is left as-is
# because later cells refer to it.
OPENIA_API_KEYS_ENV = os.path.abspath(os.path.join(PROJECT_ROOT, '../.secrets/openai_api_keys.env'))  # Local OpenAI API key.

# Show the resolved paths as the cell's output for a visual check.
DEV_CORPUS, GIT_CORPUS, OPENIA_API_KEYS_ENV

In [52]:
# Fail fast, naming the offending path, if any expected location is missing.
assert os.path.exists(DEV_CORPUS), f"DEV_CORPUS does not exist: {DEV_CORPUS}"
assert os.path.exists(GIT_CORPUS), f"GIT_CORPUS does not exist: {GIT_CORPUS}"
assert os.path.exists(OPENIA_API_KEYS_ENV), f"API_ENV does not exist: {OPENIA_API_KEYS_ENV}"

Create our OpenAI Client

In [None]:
# Load the variables from the local env file into the process environment,
# then read the key back from there.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Never echo the secret itself: cell output is saved with the notebook and
# would leak the key to anyone who can read the file. Report only whether a
# non-empty value was found.
print(f"OPENAI_API_KEY loaded: {bool(OPENAI_API_KEY)}")


In [58]:

# Shared OpenAI client used by the embedding and completion helpers below.
client = OpenAI(api_key=OPENAI_API_KEY)

Next, load the corpus: read every JSON story file under the data directory into memory.

In [42]:
def load_corpus(data_dir):
    """
    Loads every '.json' story file found anywhere under a directory.

    Parameters:
        data_dir (str): Root directory that is searched recursively.

    Returns:
        list: One dict per JSON file with the keys 'name', 'body' and
            'metadata', ordered by directory and then by file name.

    Raises:
        KeyError: If a JSON file lacks one of the three expected keys.
    """
    corpus = []
    for root, dirs, files in os.walk(data_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # directory list in place and the file list makes the traversal, and
        # therefore the corpus order, reproducible across machines.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default text encoding.
            with open(os.path.join(root, file), encoding='utf-8') as f:
                data = json.load(f)
            # Keep only the fields the rest of the notebook uses.
            corpus.append({
                'name': data['name'],
                'body': data['body'],
                'metadata': data['metadata'],
            })
    return corpus


We should have 20-30 files if all goes well.

In [None]:
# Load every story found under DEV_CORPUS; the cell's value is the number of
# stories that were loaded.
lcats_corpus = load_corpus(DEV_CORPUS)
len(lcats_corpus)

In [113]:
def count_tokens(story_text, model="gpt-3.5-turbo"):
    """
    Returns how many tokens a text occupies under the tokenizer of an OpenAI model.

    Parameters:
        story_text (str): The text to measure.
        model (str): Name of the OpenAI model whose tokenizer is used. Default is "gpt-3.5-turbo".

    Returns:
        int: The token count of story_text.
    """
    # Look up the model's tokenizer, encode the text, and count the tokens.
    return len(tiktoken.encoding_for_model(model).encode(story_text))

In [None]:
# Token limits behind the "readable" flags, named so they can be tuned in one place.
GPT_3_5_TOKEN_LIMIT = 4096
# NOTE(review): 32768 is carried over unchanged; confirm it is the intended
# limit for the model the 'readable_by_gpt_4o' column refers to.
GPT_4O_TOKEN_LIMIT = 32768

# Collect one record per story, then build the DataFrame once from the list.
# The list gets its own name so `lcats_analysis` is only ever a DataFrame.
lcats_analysis_records = []
for story in lcats_corpus:
    story_text = story['body']
    # count_tokens is called with its default model for every story.
    story_tokens = count_tokens(story_text)
    lcats_analysis_records.append({
        'name': story['name'],
        'length': len(story_text),  # length in characters
        'tokens': story_tokens,
        'readable_by_gpt_3_5': story_tokens < GPT_3_5_TOKEN_LIMIT,
        'readable_by_gpt_4o': story_tokens < GPT_4O_TOKEN_LIMIT,
    })
lcats_analysis = pd.DataFrame(lcats_analysis_records)
lcats_analysis

Now, generate embeddings

In [93]:
def get_entry_text(entry):
    """
    Builds the text to embed for a corpus entry: its name, a newline, then its body.

    Parameters:
        entry (dict): A corpus entry with the keys 'name' and 'body'.

    Returns:
        str: The name and body joined by a single newline.
    """
    title, body = entry['name'], entry['body']
    return "{}\n{}".format(title, body)

def chunk_text_for_embeddings(text):
    """
    Splits a text into paragraph chunks at blank lines (two consecutive newlines).

    Chunks that are empty or contain only whitespace are dropped, because
    they carry no content and an empty input is rejected by the embeddings
    request. The remaining chunks are returned unmodified (not stripped).

    Parameters:
        text (str): The text to split.

    Returns:
        list: The non-blank chunks, in their original order.
    """
    return [chunk for chunk in text.split("\n\n") if chunk.strip()]

def get_embeddings_for_text(text):
    """
    Requests an embedding for a text through the shared OpenAI client.

    Parameters:
        text: The input passed to the embeddings request.

    Returns:
        The embedding of the first item in the response, or None when the
        request (or reading its result) raised an exception.
    """
    embedding_model = "text-embedding-ada-002"  # Specify the embedding model
    try:
        result = client.embeddings.create(model=embedding_model, input=text)
        return result.data[0].embedding
    except Exception as error:
        # Best effort: report the failure and hand None back to the caller.
        print(f"Error generating embedding: {error}")
        return None
    
def get_embeddings_for_corpus(corpus):
    """
    Generates an embedding for every chunk of every corpus entry.

    Each entry's text is split into chunks and one embedding is requested per
    chunk, with progress printed along the way. A chunk whose embedding could
    not be generated is skipped and counted, so the result never contains a
    None embedding (which the later array / index construction cannot use).

    Parameters:
        corpus (list): Corpus entries with the keys 'name', 'body' and 'metadata'.

    Returns:
        list: One dict per embedded chunk with the keys 'text', 'embedding'
            and 'metadata' (the metadata of the entry the chunk came from).
    """
    print("-" * 72)
    print(f"Generating embeddings for {len(corpus)} corpus entries.")
    print("-" * 72)
    embedded_chunks = []
    for entry in corpus:
        print(f"Generating embeddings for {entry['name']}:")
        chunks = chunk_text_for_embeddings(get_entry_text(entry))
        print(f"  {len(chunks)} chunks found.")
        skipped = 0
        for chunk in tqdm(chunks):
            embedding = get_embeddings_for_text(chunk)
            if embedding is None:
                # get_embeddings_for_text already printed the error; leave
                # the chunk out instead of storing an unusable record.
                skipped += 1
                continue
            embedded_chunks.append({
                'text': chunk,
                'embedding': embedding,
                'metadata': entry['metadata']
            })
        if skipped:
            print(f"  {skipped} chunks skipped (no embedding returned).")
        print()
    return embedded_chunks

Make it possible to save the embeddings to disk and load them back, so they do not have to be regenerated on every run.

In [105]:
def save_embeddings_json(data, filepath):
    """
    Writes data to a file as JSON, creating the parent directory if needed.

    Parameters:
        data: Any JSON-serializable object (here, the list of embedded chunks).
        filepath (str): Destination path of the JSON file.
    """
    # Without this, writing to e.g. 'output/...' fails when the directory does
    # not exist yet. A bare file name has no parent directory to create.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w') as f:
        json.dump(data, f)

def load_embeddings_json(filepath):
    """
    Reads a JSON file and returns the parsed content.

    Parameters:
        filepath (str): Path of the JSON file to read.

    Returns:
        The object decoded from the file's JSON text.
    """
    with open(filepath, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [120]:
# Generate embeddings for each chunk, or reuse the copy cached on disk.
# Generation issues one embeddings request per chunk, so it only runs when the
# cache file is missing or a refresh is explicitly requested.
EMBEDDINGS_PATH = 'output/lcats_corpus_embeddings.json'
REGENERATE_EMBEDDINGS = False  # Set to True to rebuild even when the cache exists.

if REGENERATE_EMBEDDINGS or not os.path.exists(EMBEDDINGS_PATH):
    # Make sure the output directory exists before doing the expensive work,
    # so the result cannot be lost to a failed write.
    os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)
    corpus_with_embeddings = get_embeddings_for_corpus(lcats_corpus)
    save_embeddings_json(corpus_with_embeddings, EMBEDDINGS_PATH)
else:
    corpus_with_embeddings = load_embeddings_json(EMBEDDINGS_PATH)

Examine the loaded corpus

In [None]:
# Sanity check: number of embedded chunks, the keys of the first chunk, and
# the length of its embedding vector.
(len(corpus_with_embeddings), 
 corpus_with_embeddings[0].keys(), 
 len(corpus_with_embeddings[0]['embedding']))

In [None]:
def summarize_chunk(chunk):
    """
    Prints a three-line summary of one embedded chunk.

    The summary shows the story name and author taken from the chunk's
    metadata, the embedding's length with its first three values, and the
    chunk text (stripped) with its character and token counts.

    Parameters:
        chunk (dict): A record with the keys 'metadata' (which must contain
            'name' and 'author'), 'embedding' and 'text'.
    """
    # Gather every value first so a malformed chunk fails before anything is printed.
    meta = chunk['metadata']
    title, author = meta['name'], meta['author']
    vector = chunk['embedding']
    vector_size = len(vector)
    snippet = chunk['text']
    char_count, token_count = len(snippet), count_tokens(snippet)

    print(f"Chunk from Story: '{title}' by {author}")
    print(f" - Embedding ({vector_size} elements): {vector[:3] + ['...']}")
    print(f" - Snippet ({char_count} characters, {token_count} tokens): '{snippet.strip()}'")

# Spot-check one randomly chosen chunk. No seed is set in the visible cells,
# so the pick can differ from run to run.
summarize_chunk(random.choice(corpus_with_embeddings))


Create the vector database

In [99]:
# Create index
# The dimensionality is taken from the first stored embedding.
dimension = len(corpus_with_embeddings[0]['embedding'])
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
# Faiss operates on float32 matrices, while np.array() on Python floats
# defaults to float64, so the dtype is requested explicitly. Row i of the
# matrix is the embedding of corpus_with_embeddings[i]; retrieval relies on
# that alignment to map search results back to chunks.
embeddings = np.array(
    [item['embedding'] for item in corpus_with_embeddings],
    dtype=np.float32,
)
index.add(embeddings)

Retrieve from the vector database

In [None]:
def get_chunks_for_query(query, top_n=5):
    """
    Retrieves the stored chunks whose embeddings are nearest to a query.

    Parameters:
        query (str): The query text to embed and search with.
        top_n (int): Maximum number of chunks to return. Default is 5.

    Returns:
        list: Up to top_n records from corpus_with_embeddings, in the order
            the index returned them.

    Raises:
        RuntimeError: If no embedding could be generated for the query.
    """
    query_embedding = get_embeddings_for_text(query)
    if query_embedding is None:
        # get_embeddings_for_text returns None on failure; stop here with a
        # clear message instead of an obscure error from the array or index.
        raise RuntimeError(f"Could not generate an embedding for query: {query!r}")
    # Faiss expects a 2-D float32 matrix with one row per query.
    query_matrix = np.array([query_embedding], dtype=np.float32)
    _, indices = index.search(query_matrix, top_n)
    # Faiss pads the result with -1 when it finds fewer than top_n neighbours;
    # used as a list index, -1 would silently select the last chunk.
    return [corpus_with_embeddings[i] for i in indices[0] if i >= 0]

# Retrieval smoke test: summarize each chunk returned for a sample query.
for chunk in get_chunks_for_query("The cat sat on the mat."):
    summarize_chunk(chunk)


In [None]:
def generate_context_from_chunks(chunks):
    """
    Joins the text of the given chunks into a single context string.

    Parameters:
        chunks (list): Records that each have a 'text' key.

    Returns:
        str: The chunk texts separated by single newlines, in the given order.
    """
    texts = []
    for record in chunks:
        texts.append(record['text'])
    return "\n".join(texts)

def generate_prompt_from_query_and_context(query, context):
    """
    Builds the prompt sent to the model: the context, then the question, then an answer cue.

    Parameters:
        query (str): The user's question.
        context (str): The retrieved text placed ahead of the question.

    Returns:
        str: The assembled prompt, ending in "Answer:".
    """
    return (
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    
def elaborate_query_with_context(query):
    """
    Turns a plain query into a prompt that carries retrieved context.

    The chunks retrieved for the query are joined into a context string,
    which is combined with the query into the final prompt.

    Parameters:
        query (str): The user's question.

    Returns:
        str: The prompt containing the retrieved context and the question.
    """
    retrieved = get_chunks_for_query(query)
    return generate_prompt_from_query_and_context(
        query,
        generate_context_from_chunks(retrieved),
    )
    
# Show the full prompt (retrieved context plus question) built for a sample query.
elaborate_query_with_context("What did that cat do?")

Generate completions

In [None]:
def generate_completions(prompt, max_tokens=100):
    """
    Sends a prompt to the chat model as a single user message and returns the reply text.

    Parameters:
        prompt (str): The content of the user message.
        max_tokens (int): Upper bound on the number of tokens generated for
            the reply. Default is 100.

    Returns:
        The message content of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        # Previously accepted by this function but never forwarded, so the
        # caller's limit had no effect.
        max_tokens=max_tokens,
    )
    return completion.choices[0].message.content

def retrieve_and_generate(query):
    """
    Answers a query by retrieval-augmented generation.

    The query is first expanded into a prompt that includes retrieved
    context, and that prompt is then sent to the chat model.

    Parameters:
        query (str): The user's question.

    Returns:
        The model's reply to the context-augmented prompt.
    """
    prompt_with_context = elaborate_query_with_context(query)
    answer = generate_completions(prompt_with_context)
    return answer

# RAG answer: the question is sent together with context retrieved from the corpus.
retrieve_and_generate("Who is Sherlock's friend?")

In [None]:
# RAG answer to a question about a specific, named story.
retrieve_and_generate("What year was The Adventure of the Engineer's Thumb written?")

In [None]:
# Baseline for comparison: the question is sent on its own, without any retrieved context.
generate_completions("Who is Sherlock's friend?")

## NEXTUP: STORY ANALYSIS