# RAG Experiments

## Imports

In [87]:
import dotenv
import json
import os

import datasets
import langchain

import numpy as np

from openai import OpenAI

import semantic_kernel as sk

from tqdm import tqdm


## Utilities

In [88]:
def key_width(d):
    """Return the length of the longest key in ``d`` after rendering it with ``str``.

    Args:
        d: A mapping whose keys are measured.

    Returns:
        The maximum of ``len(str(key))`` over all keys, or 0 when ``d`` is
        empty (a bare ``max()`` over no keys would raise ``ValueError``).
    """
    return max((len(str(k)) for k in d), default=0)

def print_dict(d):
    """Print each key/value pair of ``d`` on its own line as ``key: value``.

    Every key is formatted to the same minimum width (the longest key's
    length plus one), so the values start in a common column.
    """
    pad = key_width(d) + 1
    for key, value in d.items():
        # The nested replacement field passes ``pad`` as the key's format spec.
        print(f'{key:{pad}}: {value}')

## Datasets

### ROCStories

The ROCStories corpus is a simple first test of our ability to load a dataset and use it with various LangChain tools.

In [None]:
# Load the ROCStories dataset by its 'Ximing/ROCStories' identifier.
# NOTE(review): presumably this downloads from the Hugging Face Hub on first use — confirm.
dataset = datasets.load_dataset('Ximing/ROCStories')

# Take the first record of the 'train' split and print its fields, one per line.
element = dataset['train'][0]
print_dict(element)

### LCATS

Define paths to our corpora and environment files.

In [None]:
# If the following code is run from lcats/notebooks in VSCode and the data is in lcats/data ...
# Every path below is derived from the kernel's current working directory, so
# starting the notebook from a different directory changes all of them.
CURRENT_PATH = os.path.abspath(os.curdir)  # This is where the notebook is executing.
PROJECT_ROOT = os.path.dirname(CURRENT_PATH)   # Parent of the working directory; this should be the root of the project.
DEV_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, 'data'))  # Local copy of the data.
GIT_CORPUS = os.path.abspath(os.path.join(PROJECT_ROOT, '../corpora'))  # Data in the git repo (one level above PROJECT_ROOT).
OPENIA_API_KEYS_ENV = os.path.abspath(os.path.join(PROJECT_ROOT, '../.secrets/openai_api_keys.env'))  # Local OpenAI API key.

# Display the resolved paths as the cell output so they can be checked by eye.
DEV_CORPUS, GIT_CORPUS, OPENIA_API_KEYS_ENV

In [52]:
# Fail fast, naming the offending path, if any expected directory or the
# API-key env file is missing, before later cells try to read from them.
assert os.path.exists(DEV_CORPUS), f"DEV_CORPUS does not exist: {DEV_CORPUS}"
assert os.path.exists(GIT_CORPUS), f"GIT_CORPUS does not exist: {GIT_CORPUS}"
assert os.path.exists(OPENIA_API_KEYS_ENV), f"API_ENV does not exist: {OPENIA_API_KEYS_ENV}"

Create our OpenAI client.

In [None]:
# Load environment variables from the secrets file, then read the OpenAI key.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Report only whether the key was found: printing the key itself would store
# the secret in the notebook's saved output.
print('OPENAI_API_KEY loaded.' if OPENAI_API_KEY else 'OPENAI_API_KEY is not set.')


In [58]:

# Build the OpenAI API client, authenticating with the key read from the
# environment; the embedding helper further down uses this module-level object.
client = OpenAI(
    api_key=OPENAI_API_KEY
)

Next, define a loader that reads the corpus JSON files into memory.

In [42]:
def load_corpus(data_dir):
    """Load every ``.json`` file under ``data_dir`` into a list of corpus entries.

    The directory tree is walked recursively. Directories and files are
    visited in sorted order so the result is the same on every run and on
    every filesystem, and files are read as UTF-8 rather than with the
    platform's default encoding.

    Args:
        data_dir: Root directory to search for ``.json`` files.

    Returns:
        A list of dicts, one per file, each holding only the ``'name'``,
        ``'body'`` and ``'metadata'`` values taken from that file.

    Raises:
        KeyError: If a file lacks one of the three expected keys.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    corpus = []
    for root, dirs, files in os.walk(data_dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.json'):
                with open(os.path.join(root, file), encoding='utf-8') as f:
                    data = json.load(f)
                corpus.append({
                    'name': data['name'],
                    'body': data['body'],
                    'metadata': data['metadata'],
                })
    return corpus


We should have 20-30 files if all goes well.

In [None]:
# Load every JSON entry found under DEV_CORPUS, then display how many were
# loaded as the cell output.
lcats_corpus = load_corpus(DEV_CORPUS)
len(lcats_corpus)

Now, generate embeddings for each chunk of each corpus entry.

In [84]:
def get_entry_text(entry):
    """Build the text for a corpus entry: its name, a newline, then its body."""
    title, body = entry['name'], entry['body']
    return "{}\n{}".format(title, body)

def chunk_text_for_embeddings(text):
    """Split ``text`` into paragraph chunks on blank lines (``"\\n\\n"``).

    Each chunk is stripped of surrounding whitespace, and chunks that are
    empty after stripping are dropped. A plain ``split`` yields empty strings
    for leading, trailing or repeated blank lines, and those would otherwise
    be passed on for embedding.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, stripped chunks in their original order; an
        empty list when ``text`` contains no non-whitespace content.
    """
    stripped = (chunk.strip() for chunk in text.split("\n\n"))
    return [chunk for chunk in stripped if chunk]

def get_embeddings_for_text(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` through the module-level ``client``.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request. Defaults to the model
            this function previously hard-coded, so existing calls behave
            the same.

    Returns:
        The ``embedding`` of the first item in the response data, or ``None``
        if the request raised an exception.
    """
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Best-effort: report the failure and return None instead of raising,
        # so the caller decides what to do with a missing embedding.
        print(f"Error generating embedding: {e}")
        return None
    
def get_embeddings_for_corpus(corpus):
    """Embed every chunk of every entry in ``corpus``, printing progress.

    For each entry, the entry text is built and chunked, then one embedding
    is requested per chunk with a progress bar over the chunks.

    Args:
        corpus: Iterable of entries with ``'name'``, ``'body'`` and
            ``'metadata'`` keys.

    Returns:
        A flat list with one dict per chunk, holding the chunk ``'text'``,
        its ``'embedding'`` (whatever ``get_embeddings_for_text`` returned,
        including ``None``) and the parent entry's ``'metadata'``.
    """
    rule = "-" * 72
    print(rule)
    print(f"Generating embeddings for {len(corpus)} corpus entries.")
    print(rule)
    records = []
    for entry in corpus:
        print(f"Generating embeddings for {entry['name']}:")
        pieces = chunk_text_for_embeddings(get_entry_text(entry))
        print(f"  {len(pieces)} chunks found.")
        # The generator is consumed lazily by extend, so embeddings are still
        # requested one chunk at a time as the progress bar advances.
        records.extend(
            {
                'text': piece,
                'embedding': get_embeddings_for_text(piece),
                'metadata': entry['metadata'],
            }
            for piece in tqdm(pieces)
        )
        print()
    return records

In [None]:
# Generate one embedding per chunk for every entry in the LCATS corpus.
# Each chunk is sent as its own embeddings request, so this cell's run time
# grows with the total number of chunks.
corpus_with_embeddings = get_embeddings_for_corpus(lcats_corpus)

In [None]:
# Preview the first few records, summarizing each vector by its length, rather
# than dumping every chunk's full embedding into the notebook output.
[
    {
        **record,
        'embedding': (
            None if record['embedding'] is None
            else f"<{len(record['embedding'])} values>"
        ),
    }
    for record in corpus_with_embeddings[:3]
]

In [None]:
# NOTE(review): no visible cell binds `response` at notebook scope (the only
# assignment shown is a local inside get_embeddings_for_text), so this likely
# raises NameError on a fresh kernel — presumably leftover from interactive
# exploration; confirm, then define `response` explicitly or remove this cell.
type(response)

In [None]:
# NOTE(review): relies on a notebook-level `response` that no visible cell
# defines — presumably a leftover embeddings response from interactive use;
# confirm, then define it explicitly or remove this cell.
len(response.data)