In [1]:
from dotenv import load_dotenv
import os
# Load the variables defined in the project-level .env file into os.environ.
load_dotenv("../.env")
# The hyphenated name is kept first for backward compatibility; fall back to the
# conventional underscore spelling so the key is still found when the .env file
# uses OPENAI_API_KEY. The result is None when neither variable is set.
api_key = os.environ.get('OPENAI-API-KEY') or os.environ.get('OPENAI_API_KEY')

In [61]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyMuPDFLoader
from langchain.llms import OpenAI
from operator import itemgetter
import numpy as np
import tiktoken
from langchain.prompts import BasePromptTemplate
from pydantic import BaseModel, validator

from typing import List, Dict, Tuple

# Functions

Parse and embed

In [6]:
# None metadata values cause errors with the vectorstore, so blank them out first.
def sanitize_metadata(data):
    """Replace every ``None`` metadata value with an empty string.

    Each item in ``data`` must expose a ``metadata`` mapping. The mappings are
    modified in place and the very same ``data`` object is returned.
    """
    for document in data:
        fields = document.metadata
        # Gather the affected keys first, then overwrite them in a second pass.
        empty_keys = [name for name, current in fields.items() if current is None]
        for name in empty_keys:
            fields[name] = ""
    return data

In [13]:
def unpack(data):
    """Convert loaded pages into plain dictionaries.

    Every element of ``data`` must expose ``page_content`` and ``metadata``.
    The result holds one dict per element with a 1-based ``'page'`` number
    (its position in ``data``), the page ``'content'`` and its ``'metadata'``.
    """
    records = []
    for page_number, page in enumerate(data, start=1):
        records.append({'page': page_number, 'content': page.page_content, 'metadata': page.metadata})
    return records

Getting context

In [26]:
def dot_product_similarity(doc_data: List[Dict], query_data: Dict, top_k: int = 5) -> List[Tuple[int, float]]:
    """Rank pages by the dot product of their embedding with the query embedding.

    Args:
        doc_data: Page records; each must provide a ``'page'`` identifier and
            an ``'embedding'`` vector.
        query_data: Query record providing an ``'embedding'`` vector.
        top_k: Maximum number of results to return. Defaults to 5.

    Returns:
        At most ``top_k`` ``(page, score)`` tuples ordered from the highest
        score to the lowest. Records with equal scores keep their input order.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop results from the end instead.
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    query_embedding = query_data['embedding']
    scored_pages = [
        (page['page'], np.dot(query_embedding, page['embedding']))
        for page in doc_data
    ]
    # list.sort is stable, so ties are returned in their original order.
    scored_pages.sort(key=itemgetter(1), reverse=True)
    return scored_pages[:top_k]


In [28]:
def embed_query(query: str):
    """Embed a query string with ``OpenAIEmbeddings``.

    Returns a dict holding the original text under ``"query"`` and the vector
    produced by ``OpenAIEmbeddings.embed_query`` under ``"embedding"``.
    """
    embedder = OpenAIEmbeddings()
    return {"query": query, "embedding": embedder.embed_query(query)}

In [35]:
def embed_doc(text_list : List[str]):
    """Embed a list of texts with ``OpenAIEmbeddings``.

    Returns whatever ``OpenAIEmbeddings.embed_documents`` produces for
    ``text_list``.
    """
    return OpenAIEmbeddings().embed_documents(text_list)

In [27]:
def get_context(query_data, doc_data: List[Dict]) -> List[Dict]:
    """Collect the best-matching pages for a query.

    Args:
        query_data: Query record providing an ``'embedding'`` vector.
        doc_data: Page records; each must provide ``'page'``, ``'embedding'``,
            ``'content'`` and ``'metadata'``.

    Returns:
        One dict per page returned by ``dot_product_similarity``, in the same
        order, with the keys ``'page'``, ``'similarity'``, ``'text'`` and
        ``'metadata'``.
    """
    # Resolve records by their 'page' value instead of by list position, so the
    # lookup stays correct when doc_data is filtered, reordered or not 1-based.
    # If several records share a page number, the last one wins.
    records_by_page = {record['page']: record for record in doc_data}
    context = []
    for page, similarity in dot_product_similarity(doc_data, query_data):
        record = records_by_page[page]
        context.append({
            'page': page,
            'similarity': similarity,
            'text': record['content'],
            'metadata': record['metadata'],
        })
    return context

In [46]:
def get_tokens(string: str, model: str) -> int:
    """Count the tokens in `string` using the tiktoken encoding selected for `model`."""
    return len(tiktoken.encoding_for_model(model).encode(string))

In [80]:
def format_context(context: List[Dict], model: str, token_limit : int) -> Tuple[str, List]:
    """Join context pages into one string capped at ``token_limit`` tokens.

    Pages are appended in order as ``"Page: <n>\\n\\nText: <text>\\n\\n"`` with
    newlines inside the page text replaced by spaces. As soon as the running
    string exceeds ``token_limit`` tokens it is truncated to that many tokens
    and no further pages are added.

    Args:
        context: Page dicts providing ``'page'``, ``'text'`` and ``'metadata'``.
        model: Model name used to select the tiktoken encoding.
        token_limit: Maximum number of tokens allowed in the returned string.

    Returns:
        A ``(context_string, meta_list)`` tuple. ``meta_list`` holds the
        metadata of every page that was appended, including the page that was
        cut short by the truncation.

    Raises:
        AssertionError: If the truncated string still re-encodes to more than
            ``token_limit`` tokens.
    """
    # Resolve the encoding once and reuse it for every count and the truncation.
    encoding = tiktoken.encoding_for_model(model)
    context_string = ""
    meta_list = []
    for item in context:
        sanitized_text = item['text'].replace("\n", " ")
        context_string += f"Page: {item['page']}\n\nText: {sanitized_text}\n\n"
        meta_list.append(item['metadata'])
        encoded_text = encoding.encode(context_string)
        if len(encoded_text) > token_limit:
            # cut it down to the token limit and decode it back to a string
            context_string = encoding.decode(encoded_text[:token_limit])
            # decoding a cut token sequence and re-encoding it is not guaranteed
            # to give the same count, so verify the result really fits
            tokens = len(encoding.encode(context_string))
            assert tokens <= token_limit, f"format context function failed to cut context down far enough. tokens: {tokens}"
            break
    return context_string, meta_list

# Main

In [4]:
# Relative path of the PDF to parse; it is passed to PyMuPDFLoader below.
fpath = "../data/powers2017.pdf"

Parse the PDF

In [7]:
# Build a PyMuPDF-based loader for the PDF at `fpath`.
loader = PyMuPDFLoader(fpath)
# load the data
unsanitized = loader.load()
# make sure the metadata is not None
# (sanitize_metadata edits the items in place and returns its argument, so
# `data` and `unsanitized` refer to the same object afterwards)
data = sanitize_metadata(unsanitized)

In [17]:
# Embed the text of every loaded page; the embeddings are zipped with the pages
# in the next cell.
# NOTE(review): this repeats the body of embed_doc() defined above.
base_embeddings = OpenAIEmbeddings()
doc_embeddings = base_embeddings.embed_documents([page.page_content for page in data])

In [19]:
# Turn the loaded pages into plain dicts ('page', 'content', 'metadata').
mongodoc = unpack(data)
# Attach each page's embedding to its dict under the 'embedding' key.
mongodoc = [{**page, "embedding": embedding} for page, embedding in zip(mongodoc, doc_embeddings)]

Get Query Data

In [98]:
# The question to answer from the document.
query = "What is a good mental health treatment for depression?"
# Embed the question, then collect the pages that score highest against it.
query_data = embed_query(query)
context = get_context(query_data, mongodoc)

Cut the context down to the token limit and format it

In [83]:
# Model name used to pick the tiktoken encoding when counting tokens.
model = "text-davinci-003"
# Maximum number of tokens the formatted context may occupy.
# NOTE(review): presumably chosen to leave room for the instruction, query and
# completion within the model's context window — confirm.
token_limit = 2800
# format_context returns the capped context text plus the metadata of the pages used.
context_string, meta_list = format_context(context, model, token_limit)

In [None]:
# Show the formatted context that will be placed in the prompt.
print(context_string)

Build the prompt

In [88]:
class DocQAPromptTemplate(BasePromptTemplate, BaseModel):
    """Custom prompt template for answering a query from document context.

    ``format`` combines a fixed instruction, the formatted context and the
    query into a single prompt string for the language model.
    """
    @validator("input_variables")
    def validate_input_variables(cls, v):
        """Validate that exactly two input variables were declared.

        Raises ValueError otherwise; returns ``v`` unchanged when valid.
        """
        # Only the number of variables is checked; their names are not inspected.
        # NOTE(review): the message says "context" while the notebook declares
        # the second variable as "formatted_context".
        if len(v) != 2:
            raise ValueError("DocQAPromptTemplate must have two input variables: query and context.")
        return v
    
    def format(self, query, formatted_context) -> str:
        """Build the prompt text from ``query`` and ``formatted_context``.

        The result has four labelled sections in this order: INSTRUCTION,
        CONTEXT, QUERY and a trailing empty OUTPUT section.
        """
        # Fixed instruction placed at the top of every prompt.
        # NOTE(review): the text contains typos ("deatiled", "reponse"); it is
        # left verbatim because it is part of what is sent to the model.
        instruction = "Answer the query with a lengthy, deatiled reponse, to the best of your ability based on the provided context. If the question isn't relevant to the context, tell me that and briefly describe the context."
        # Generate the prompt to be sent to the language model
        prompt = f"INSTRUCTION:\n{instruction}\n\nCONTEXT:\n{formatted_context}\n\nQUERY:\n{query}\n\nOUTPUT:\n"
        return prompt
    
    def _prompt_type(self):
        """Return a short label identifying this kind of prompt."""
        # NOTE(review): defined as a plain method here; check whether the
        # installed LangChain base class expects a property instead.
        return "doc context + query"

In [100]:
# Declare the two input variables the template's validator requires.
prompt_template = DocQAPromptTemplate(input_variables=["query", "formatted_context"])
# Arguments are passed positionally: the query first, then the formatted context.
prompt = prompt_template.format(query, context_string)

In [102]:
# NOTE(review): a temperature of 0.9 presumably makes the answer vary between
# runs — lower it if reproducible output matters.
llm = OpenAI(temperature=0.9)
# Send the prompt to the model and print its answer.
print(llm(prompt))
# Print the metadata of the pages that were used as context.
print(meta_list)

Cognitive-Behavioral Therapy (CBT) is a well-established, highly effective form of treatment for major depression. CBT is an evidence-based treatment in which clients learn to identify, challenge, and replace negative thoughts and behaviors that lead to depressive feelings and symptoms. CBT is typically a short-term treatment (eight to twelve sessions) that focuses on current problems rather than past events. In CBT for depression, the therapist and client work together to identify patterns of thinking and behavior that are unhelpful and to replace them with more healthy ones. CBT helps clients to recognize their negative thoughts, question the validity of those thoughts, and then test their new, healthier thoughts in real-life situations. Clients also learn strategies to build and practice skills to manage stress, regulate emotions, manage difficult relationships, and identify and use important sources of social support. Other therapeutic interventions, such as Interpersonal Therapy (