In [1]:
import os 
import dspy
from dotenv import load_dotenv

In [3]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings())

In [7]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader("data/")

In [8]:
docs = [doc.page_content for doc in loader.load()]

In [7]:
# docs = text_splitter.create_documents(docs)
# print(docs[0].page_content)

from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

docs = text_splitter.create_documents(docs)
print(docs[0])

page_content='PRU Health Critical Illness  
Extended Care III
Continuous cover for ongoing critical illness –'


In [8]:
from dspy.retrieve.faiss_rm import FaissRM
frm = FaissRM(docs)

: 

In [9]:
colbertv2_wiki17_abstracts = dspy.ColBERTv2(docs)
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

In [3]:
dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts)

In [4]:
class ragSignature(dspy.Signature):
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 200-250 words")

In [5]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(ragSignature)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [10]:
my_question = "what are the differences between Pru health critical illness extended care III and first protector II"
pred = RAG(my_question)

In [11]:
pred

generate_answer = ChainOfThought(ragSignature(context, question -> answer
    instructions='Given the fields `context`, `question`, produce the fields `answer`.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 200-250 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
))