In [1]:
from dotenv import load_dotenv
import os
# Load environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")
# .get() yields None (rather than raising) when the variable is not set.
# NOTE(review): `api_key` is not referenced again in the visible notebook, and the
# variable name is spelled with hyphens ('OPENAI-API-KEY') — presumably the OpenAI
# clients pick up their key from the environment themselves; confirm and remove
# this line if it is unused.
api_key = os.environ.get('OPENAI-API-KEY')

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyMuPDFLoader
from langchain.llms import OpenAI
from operator import itemgetter
import numpy as np
import tiktoken
from langchain.prompts import BasePromptTemplate
from pydantic import BaseModel, validator

from typing import List, Dict, Tuple

# Functions

In [3]:
# this prevents the metadata from being None which causes errors with the vectorstore
def sanitize_metadata(data):
    """Replace every ``None`` metadata value with an empty string, in place.

    Args:
        data: Iterable of objects that expose a mutable ``metadata`` mapping.

    Returns:
        The very same ``data`` object that was passed in, after mutation.
    """
    for doc in data:
        metadata = doc.metadata
        # Collect the affected keys first, then overwrite them.
        empty_keys = [name for name, val in metadata.items() if val is None]
        for name in empty_keys:
            metadata[name] = ""
    return data

def unpack(data):
    """Convert loaded page objects into plain dictionaries.

    Args:
        data: Iterable of objects with ``page_content`` and ``metadata`` attributes.

    Returns:
        A list with one dict per input item, holding a 1-based 'page' number
        (the item's position in ``data``), its 'content' and its 'metadata'.
    """
    records = []
    for page_number, page in enumerate(data, start=1):
        records.append({
            'page': page_number,
            'content': page.page_content,
            'metadata': page.metadata,
        })
    return records

def dot_product_similarity(doc_data: List[Dict], query_data: Dict, top_k: int = 5) -> List[Tuple[int, float]]:
    """Rank pages by the dot product of their embedding with the query embedding.

    Args:
        doc_data: Page records; each must provide 'page' and 'embedding' keys.
        query_data: Query record; must provide an 'embedding' key.
        top_k: Maximum number of results to return. Defaults to 5, which is the
            cut-off this function always used before the parameter existed.
            Values of zero or below yield an empty list.

    Returns:
        At most ``top_k`` ``(page, score)`` tuples ordered by descending score.
        Pages with equal scores keep their relative order from ``doc_data``.
    """
    query_embedding = query_data['embedding']
    scored = [
        (page['page'], np.dot(query_embedding, page['embedding']))
        for page in doc_data
    ]
    # list.sort is stable, so ties stay in document order even with reverse=True.
    scored.sort(key=itemgetter(1), reverse=True)
    # Clamp so a negative top_k cannot turn into a "drop the last n" slice.
    return scored[:max(top_k, 0)]

def embed_query(query: str):
    """Embed a single query string with ``OpenAIEmbeddings``.

    Args:
        query: The text to embed.

    Returns:
        A dict holding the original text under "query" and the value returned
        by ``OpenAIEmbeddings().embed_query`` under "embedding".
    """
    return {
        "query": query,
        "embedding": OpenAIEmbeddings().embed_query(query),
    }

def embed_doc(text_list : List[str]):
    """Embed a batch of texts with ``OpenAIEmbeddings``.

    Args:
        text_list: The texts to embed.

    Returns:
        Whatever ``OpenAIEmbeddings().embed_documents`` returns for ``text_list``.
    """
    embedder = OpenAIEmbeddings()
    return embedder.embed_documents(text_list)

def get_context(query_data, doc_data: List[Dict]) -> List[Dict]:
    """Build context records for the pages that best match the query.

    Args:
        query_data: Query record passed through to ``dot_product_similarity``.
        doc_data: Page records providing 'content' and 'metadata' keys (plus
            whatever ``dot_product_similarity`` needs).

    Returns:
        One dict per ranked page with 'page', 'similarity', 'text' and
        'metadata' keys, in the order produced by ``dot_product_similarity``.
    """
    ranked = dot_product_similarity(doc_data, query_data)
    return [
        {
            'page': page_number,
            'similarity': score,
            # Pages are looked up by position: page n is expected at index n - 1.
            'text': doc_data[page_number - 1]['content'],
            'metadata': doc_data[page_number - 1]['metadata'],
        }
        for page_number, score in ranked
    ]

def get_tokens(string: str, model: str) -> int:
    """Count how many tokens ``string`` encodes to under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(string))

def format_context(context: List[Dict], model: str, token_limit : int) -> Tuple[str, List[Dict]]:
    """Join context pages into one string capped at ``token_limit`` tokens.

    Pages are appended in the order given, each rendered as a "Page:" line
    followed by a "Text:" line, with newlines inside the page text replaced by
    spaces. As soon as the accumulated string exceeds ``token_limit`` tokens it
    is cut back to its first ``token_limit`` tokens and no further pages are
    added.

    Args:
        context: Records with 'page', 'text' and 'metadata' keys.
        model: Model name used to select the tiktoken encoding.
        token_limit: Maximum number of tokens allowed in the returned string.

    Returns:
        A ``(context_string, meta_list)`` tuple. ``meta_list`` holds the
        'metadata' of every page that was appended, including the page that
        triggered truncation.

    Raises:
        AssertionError: If the truncated string still re-encodes to more than
            ``token_limit`` tokens.
    """
    context_string = ""
    meta_list = []
    # Resolve the encoding once and reuse it for every measurement below.
    encoding = tiktoken.encoding_for_model(model)
    for item in context:
        sanitized_text = item['text'].replace("\n", " ")
        context_string += f"Page: {item['page']}\n\nText: {sanitized_text}\n\n"
        meta_list.append(item['metadata'])
        # Encode a single time; the token list serves both the check and the cut.
        encoded_text = encoding.encode(context_string)
        if len(encoded_text) > token_limit:
            # cut it down to the token limit and decode it back to a string
            context_string = encoding.decode(encoded_text[:token_limit])
            # re-measure the decoded text to make sure the cut really worked
            tokens = len(encoding.encode(context_string))
            assert tokens <= token_limit, f"format context function failed to cut context down far enough. tokens: {tokens}"
            break
    return context_string, meta_list

class DocQAPromptTemplate(BasePromptTemplate, BaseModel):
    """Prompt template that wraps a query and pre-formatted document context in a fixed instruction.

    The instruction text is hard-coded inside ``format``. ``input_variables`` is
    only checked for its length (exactly two entries); the names themselves are
    not inspected.
    """
    @validator("input_variables")
    def validate_input_variables(cls, v):
        """Validate that exactly two input variables were supplied.

        Raises:
            ValueError: If ``v`` does not contain exactly two entries.
        """
        if len(v) != 2:
            raise ValueError("DocQAPromptTemplate must have two input variables: query and context.")
        return v
    
    def format(self, query, formatted_context) -> str:
        """Return the prompt string built from ``query`` and ``formatted_context``.

        The result has four labelled sections in this order: INSTRUCTION,
        CONTEXT, QUERY and a trailing empty OUTPUT section.
        """
        # Fixed instruction that heads every prompt produced by this template
        instruction = "Answer the query with a lengthy, deatiled reponse, to the best of your ability based on the provided context. If the question isn't relevant to the context, tell me that and briefly describe the context."
        # Generate the prompt to be sent to the language model
        prompt = f"INSTRUCTION:\n{instruction}\n\nCONTEXT:\n{formatted_context}\n\nQUERY:\n{query}\n\nOUTPUT:\n"
        return prompt
    
    def _prompt_type(self):
        """Return a short label describing this kind of prompt."""
        return "doc context + query"

In [5]:
def upload_doc(fpath):
    """Load a document with ``PyMuPDFLoader``, embed its pages and return page records.

    Args:
        fpath: Path handed to ``PyMuPDFLoader``.

    Returns:
        A list of dicts, one per loaded item, each combining the fields
        produced by ``unpack`` with an "embedding" entry from ``embed_doc``.
    """
    # Load the pages and replace None metadata values before anything else uses them.
    pages = sanitize_metadata(PyMuPDFLoader(fpath).load())

    # Embed every page's text in one batch, in page order.
    page_texts = [page.page_content for page in pages]
    embeddings = embed_doc(page_texts)

    # Pair each unpacked page record with its embedding.
    return [
        {**record, "embedding": vector}
        for record, vector in zip(unpack(pages), embeddings)
    ]

In [11]:
def query_doc(query: str, mongodoc: List[Dict], model: str, token_limit: int):
    """Answer ``query`` from the embedded document using retrieved page context.

    Args:
        query: The question to ask.
        mongodoc: Embedded page records as returned by ``upload_doc``.
        model: Model name; used both to count context tokens and to select the
            completion model that answers the prompt.
        token_limit: Maximum number of tokens of context to include.

    Returns:
        A ``(response, meta_list, context)`` tuple: the model's answer, the
        metadata of the pages placed in the prompt, and the ranked context
        records.
    """
    # embed the query
    query_data = embed_query(query)
    # get the context
    context = get_context(query_data, mongodoc)
    # format the context
    formatted_context, meta_list = format_context(context, model, token_limit)
    # format the prompt
    prompt_template = DocQAPromptTemplate(input_variables=["query", "formatted_context"])
    prompt = prompt_template.format(query, formatted_context)
    # Query the same model the context was token-counted for. Previously `model`
    # was not forwarded here, so the library's default model was always used.
    llm = OpenAI(model_name=model, temperature=0.7)
    response = llm(prompt)
    # return the response, the metadata, and the context
    return response, meta_list, context

# Main

In [6]:
# Path to the source PDF, relative to the notebook's working directory.
fpath = "../data/powers2017.pdf"
# Load the document and embed it via upload_doc (this calls the embedding service).
mongodoc = upload_doc(fpath)

In [7]:
# Question to ask about the document.
query = "What is the most striking finding in the paper?"
# Model name; format_context uses it to select the tiktoken encoding.
model = "text-davinci-003"
# Upper bound on the number of context tokens format_context may return.
token_limit = 2800

In [12]:
# These are the first three steps of query_doc, run at cell level so the
# intermediate values stay available to the cells below.
# embed the query
query_data = embed_query(query)
# rank the pages against the query and collect the best matches
context = get_context(query_data, mongodoc)
# join the matched pages into one string capped at token_limit tokens
formatted_context, meta_list = format_context(context, model, token_limit)
# the prompts themselves are built from formatted_context in the cells below

In [16]:
class RelevantPassagesPromptTemplate(BasePromptTemplate, BaseModel):
    """Prompt template asking the model to pick the passages most relevant to a query.

    The instruction text is hard-coded inside ``format``. ``input_variables`` is
    only checked for its length (exactly two entries).
    """
    @validator("input_variables")
    def validate_input_variables(cls, v):
        """Validate that exactly two input variables were supplied.

        Raises:
            ValueError: If ``v`` does not contain exactly two entries.
        """
        if len(v) != 2:
            # Name this class in the message; it previously named DocQAPromptTemplate.
            raise ValueError("RelevantPassagesPromptTemplate must have two input variables: query and formatted_context.")
        return v

    def format(self, query, formatted_context) -> str:
        """Return the prompt string built from ``query`` and ``formatted_context``.

        The result has four labelled sections in this order: INSTRUCTION,
        CONTEXT, QUERY and a trailing empty OUTPUT section.
        """
        # Fixed instruction that heads every prompt produced by this template
        instruction = "Identify 3-5 most relevant passages in the context to the query."
        # Generate the prompt to be sent to the language model
        prompt = f"INSTRUCTION:\n{instruction}\n\nCONTEXT:\n{formatted_context}\n\nQUERY:\n{query}\n\nOUTPUT:\n"
        return prompt

    def _prompt_type(self):
        """Return a short label describing this kind of prompt."""
        return "doc context + query"

In [30]:
# Prompt asking the model to turn page metadata into an APA-style citation.
# The template used to contain "\METADATA:", where "\M" is not a valid escape
# sequence: the prompt carried a literal backslash and no line break. It now
# starts the METADATA section on its own line, like the other prompts here.
citation = PromptTemplate(
    input_variables=["metadata"],
    template="Generate a citation in APA format based on this metadata:\n\nMETADATA:\n {metadata}",
)

# Prompt for the final answer, combining the selected passages, the generated
# citation text and the original query.
final = PromptTemplate(
    input_variables=["query", "passages", "citation"],
    template="Answer the query with a lengthy, deatiled reponse, to the best of your ability based on the provided context. Use the relevant passages, including in text citations for direct quotes and paraphrased content. Include a bibliography. Passages:\n\n{passages}\n\nCitations:\n\n{citation}\n\nQUERY:\n{query}\n\nOUTPUT:\n",
)

# Template for the passage-selection step (the name's spelling is kept because
# later cells refer to it).
relavant_passages_template = RelevantPassagesPromptTemplate(input_variables=["query", "formatted_context"])

In [31]:
# Completion client shared by the cells below; only temperature is set explicitly.
llm = OpenAI(temperature=0.7)

In [32]:
# Step 1: ask the model which passages of the context are most relevant.
relevant_passages_prompt = relavant_passages_template.format(query, formatted_context)
relevant_passages = llm(relevant_passages_prompt)

# Step 2: ask the model for a citation built from the page metadata.
citation_prompt = citation.format(metadata=meta_list)
# Store the generated text under its own name. Rebinding `citation` would
# replace the PromptTemplate with a plain string, so running this cell a second
# time would call str.format on the previous answer instead of the template.
citation_text = llm(citation_prompt)

# Step 3: combine the passages and the citation into the final answer prompt.
final_prompt = final.format(query=query, passages=relevant_passages, citation=citation_text)
final_response = llm(final_prompt)

In [33]:
# Print the answer so the line breaks in the model's text are rendered.
print(final_response)

The most striking finding in the paper is that cognitive therapy is as effective as pharmacotherapy in treating depression. This was found in the 1984 study by Simons et al., which showed that cognitive therapy and pharmacotherapy produced similar decreases in depressive symptoms. This finding was supported by DeRubeis et al.’s 1990 study, which showed that cognitive change and symptom change were equally present in cognitive therapy and pharmacotherapy. Strunk et al.’s 2007 study further demonstrated that cognitive therapy skills were related to the reduction of relapse risk following treatment for depression. This suggests that cognitive therapy is an effective method for treating depression. 

Bibliography:
Simons, A. D., Garfield, S. L., & Murphy, G. E. (1984). The process of change in cognitive therapy and pharmacotherapy for depression. Changes in mood and cognition. Archives of General Psychiatry, 41, 45-51.

DeRubeis, R. J., Evans, M. D., Hollon, S. D., et al. (1990). How does 

In [24]:
# NOTE(review): this cell looks like a leftover from an earlier iteration: its
# execution count (24) is lower than that of the cells before it. `citations`
# is not defined anywhere in the visible notebook, and `final` declares the
# input variable "citation", not "citations" — confirm whether this cell should
# be deleted or ported to the templates defined earlier in the notebook.
relavant_passages_template = RelevantPassagesPromptTemplate(input_variables=["query", "formatted_context"])
relevant_passages_prompt = relavant_passages_template.format(query, formatted_context)
relevant_passages = llm(relevant_passages_prompt)
citations_prompt = citations.format(passages=relevant_passages)
citations_response = llm(citations_prompt)
final_prompt = final.format(query=query, passages=relevant_passages, citations=citations_response)
final_response = llm(final_prompt)

In [25]:
# Print the answer produced by the cell directly above.
print(final_response)

The most striking finding in the paper is that changes from pretreatment to mid treatment on cognitive measures significantly predicted change in depression from mid treatment to posttreatment (DeRubeis et al., 2010). This suggests that cognitive change can be a powerful predictor of changes in depression, and that cognitive therapies may be particularly effective in treating depression. Furthermore, cognitive change has been found to predict lower relapse rates (DeRubeis et al., 2010; 111-114), further underscoring the importance of cognitive change in the treatment of depression.


In [27]:
# Inspect the metadata dicts collected by format_context (one per page placed in the context).
print(meta_list)

[{'file_path': '../data/powers2017.pdf', 'page_number': 6, 'total_pages': 13, 'format': 'PDF 1.7', 'title': 'Core Mechanisms of Cognitive Behavioral Therapy for Anxiety and Depression', 'author': 'Mark B. Powers PhD', 'subject': 'Psychiatric Clinics of NA, 40 (2017) 611-623. doi:10.1016/j.psc.2017.08.010', 'keywords': '', 'creator': 'Elsevier', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': '24th October 2017', 'modDate': "D:20171024190454+05'30'", 'trapped': '', 'encryption': ''}, {'file_path': '../data/powers2017.pdf', 'page_number': 12, 'total_pages': 13, 'format': 'PDF 1.7', 'title': 'Core Mechanisms of Cognitive Behavioral Therapy for Anxiety and Depression', 'author': 'Mark B. Powers PhD', 'subject': 'Psychiatric Clinics of NA, 40 (2017) 611-623. doi:10.1016/j.psc.2017.08.010', 'keywords': '', 'creator': 'Elsevier', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': '24th October 2017', 'modDate': "D:20171024190454+05'30'", 'trapped': '', 'encrypti