In [2]:
import os

import openai
from dotenv import load_dotenv

# Load secrets from the project-level .env file into the process environment.
load_dotenv("../.env")

# The key was originally looked up under the hyphenated name only, while the
# rest of this notebook reads ``OPENAI_API_KEY``. Keep the hyphenated name as
# the first choice (unchanged behaviour when it is set) and fall back to the
# underscore spelling so the key is not silently set to None.
OPENAI_API_KEY = os.environ.get('OPENAI-API-KEY') or os.environ.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY

In [3]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from operator import itemgetter
import numpy as np
import tiktoken

from typing import List, Dict, Tuple
import re
import time
import asyncio
from functools import wraps, partial

In [5]:
# this prevents the metadata from being None which causes errors with the vectorstore
def sanitize_metadata(data):
    """Replace every ``None`` metadata value with an empty string, in place.

    Args:
        data: iterable of objects exposing a ``metadata`` mapping.

    Returns:
        The same ``data`` object that was passed in (its items are mutated).
    """
    for document in data:
        fields = document.metadata
        # Collect the offending keys first, then overwrite them.
        missing = [name for name, value in fields.items() if value is None]
        for name in missing:
            fields[name] = ""
    return data

def unpack(data):
    """Convert loaded pages into plain dictionaries.

    Args:
        data: iterable of objects with ``page_content`` and ``metadata``
            attributes.

    Returns:
        A list of dicts with keys ``'page'`` (1-based position in ``data``),
        ``'content'`` and ``'metadata'``.
    """
    records = []
    for page_number, page in enumerate(data, start=1):
        records.append({
            'page': page_number,
            'content': page.page_content,
            'metadata': page.metadata,
        })
    return records

def dot_product_similarity(doc_data: List[Dict], query_data: Dict, top_k: int = 5) -> List[Tuple[int, float]]:
    """Rank pages by the dot product of their embedding with the query embedding.

    Args:
        doc_data: page records, each with a ``'page'`` identifier and an
            ``'embedding'`` vector.
        query_data: dict holding the query vector under ``'embedding'``.
        top_k: maximum number of results to return (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        Up to ``top_k`` ``(page, score)`` tuples, highest score first. Pages
        with equal scores keep their original relative order.
    """
    if top_k <= 0:
        return []
    query_embedding = query_data['embedding']
    scored = [(page['page'], np.dot(query_embedding, page['embedding'])) for page in doc_data]
    # list.sort is stable, also with reverse=True, so ties keep input order.
    scored.sort(key=itemgetter(1), reverse=True)
    return scored[:top_k]

def embed_query(query: str):
    """Embed a single query string with ``OpenAIEmbeddings``.

    Returns:
        A dict with the original text under ``"query"`` and the vector
        returned by ``OpenAIEmbeddings.embed_query`` under ``"embedding"``.
    """
    return {"query" : query, "embedding" : OpenAIEmbeddings().embed_query(query)}

def embed_doc(text_list : List[str]):
    """Embed a list of texts with ``OpenAIEmbeddings``.

    Returns:
        Whatever ``OpenAIEmbeddings.embed_documents`` returns for
        ``text_list`` (presumably one vector per input text — confirm).
    """
    embedder = OpenAIEmbeddings()
    return embedder.embed_documents(text_list)

def get_context(query_data, doc_data: List[Dict]) -> List[Dict]:
    """Build context records for the pages most similar to the query.

    The ranking comes from ``dot_product_similarity``. Each returned page
    number ``p`` is looked up at ``doc_data[p - 1]``, so ``doc_data`` is
    expected to be ordered by 1-based page number.

    Returns:
        One dict per ranked page with keys ``'page'``, ``'similarity'``,
        ``'text'`` and ``'metadata'``, in ranking order.
    """
    ranked = dot_product_similarity(doc_data, query_data)
    return [
        {
            'page': page_number,
            'similarity': score,
            'text': doc_data[page_number - 1]['content'],
            'metadata': doc_data[page_number - 1]['metadata'],
        }
        for page_number, score in ranked
    ]

def get_tokens(string: str, model: str) -> int:
    """Count the tokens ``string`` encodes to under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(string))

def format_context(context: List[Dict], model: str, token_limit : int) -> Tuple[str, list]:
    """Concatenate context pages into one prompt string capped at ``token_limit`` tokens.

    Pages are appended in order as ``"Page: <n>\\n\\nText: <text>\\n\\n"`` with
    newlines inside the page text replaced by spaces. As soon as the running
    string exceeds ``token_limit`` tokens it is truncated to the first
    ``token_limit`` tokens and no further pages are added.

    Args:
        context: records with ``'page'``, ``'text'`` and ``'metadata'`` keys.
        model: model name used to select the tiktoken encoding.
        token_limit: maximum number of tokens in the returned string.

    Returns:
        A ``(context_string, meta_list)`` tuple. ``meta_list`` holds the
        ``'metadata'`` of every page that was appended, including the page
        that triggered truncation.

    Raises:
        AssertionError: if the truncated string still re-encodes to more than
            ``token_limit`` tokens.
    """
    context_string = ""
    meta_list = []
    # Build the encoder once and reuse it for every length check.
    encoding = tiktoken.encoding_for_model(model)
    for item in context:
        sanitized_text = item['text'].replace("\n", " ")
        context_string += f"Page: {item['page']}\n\nText: {sanitized_text}\n\n"
        meta_list.append(item['metadata'])
        encoded_text = encoding.encode(context_string)
        if len(encoded_text) > token_limit:
            # Cut down to the token limit and decode back to a string.
            context_string = encoding.decode(encoded_text[:token_limit])
            # Re-encode as a sanity check that the cut actually worked.
            tokens = len(encoding.encode(context_string))
            assert tokens <= token_limit, f"format context function failed to cut context down far enough. tokens: {tokens}"
            break
    return context_string, meta_list

In [6]:
def sanitize_text(text):
    """Normalise text to word characters separated by single spaces.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace becomes a space; runs of whitespace are
    then collapsed to one space and the result is stripped.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# OpenAI Classes

In [30]:
class Completion:
    """Callable wrapper around ``openai.Completion.create`` with fixed settings.

    Args:
        temperature: sampling temperature forwarded to the API.
        max_tokens: completion length limit forwarded to the API.
        stream: forwarded to the API as ``stream``; when true, ``__call__``
            returns the API result unprocessed.
        model: model name (defaults to ``"text-davinci-003"``).
        **kwargs: extra keyword arguments forwarded to every API call.
    """

    def __init__(self, temperature, max_tokens, stream=False, model="text-davinci-003", **kwargs):
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.stream = stream
        self.model = model
        self.kwargs = kwargs

        # Only override the module-level key when the variable is actually
        # set; otherwise a key configured earlier would be replaced by None.
        api_key = os.getenv("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

    def __call__(self, text):
        """Request a completion for ``text`` (a prompt or a list of prompts).

        Returns:
            The raw API result when ``stream`` is true; otherwise the stripped
            text of the single choice, or a list of stripped texts when the
            response contains more than one choice.
        """
        raw_response = openai.Completion.create(
            model=self.model,
            prompt=text,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            # Previously the flag was stored but never sent to the API.
            stream=self.stream,
            **self.kwargs
        )
        if self.stream:
            return raw_response
        choices = raw_response['choices']
        if len(choices) > 1:
            return [choice['text'].strip() for choice in choices]
        return choices[0]['text'].strip()


class Chat:
    """Stateful wrapper around ``openai.ChatCompletion.create``.

    The conversation is kept in ``self.messages`` and re-sent on every call.

    Args:
        temperature: sampling temperature forwarded to the API.
        system_message: content of the initial system message.
        messages: optional iterable of strings, each appended as a user
            message after the system message.
        model: chat model name (defaults to ``'gpt-3.5-turbo'``).
    """

    def __init__(self, temperature, system_message="You are a helpful assistant.", messages=None, model='gpt-3.5-turbo'):
        self.messages = [{"role": "system", "content": system_message}]
        if messages is not None:
            self.messages += [{"role": "user", "content": message} for message in messages]
        self.model = model
        self.temperature = temperature

        # Only override the module-level key when the variable is actually
        # set; otherwise a key configured earlier would be replaced by None.
        api_key = os.getenv("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

    def __call__(self, user_message: str):
        """Send ``user_message`` with the history and return the stripped reply."""
        self.messages.append({"role": "user", "content": user_message})
        raw_response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.messages,
            temperature=self.temperature,
        )
        response_message = raw_response['choices'][0]['message']['content'].strip()
        # Store the reply as a role/content dict. Appending the bare string
        # (as before) put a non-dict entry in the history that is re-sent on
        # the next call.
        self.messages.append({"role": "assistant", "content": response_message})
        return response_message

In [9]:
def async_wrap(func):
    """Turn a blocking callable into a coroutine function.

    The returned coroutine function accepts the same arguments as ``func``
    plus two keyword-only extras: ``loop`` (defaults to
    ``asyncio.get_event_loop()``) and ``executor`` (passed straight to
    ``loop.run_in_executor``). Awaiting it yields ``func``'s return value.
    """
    @wraps(func)
    async def _in_executor(*args, loop=None, executor=None, **kwargs):
        target_loop = asyncio.get_event_loop() if loop is None else loop
        bound_call = partial(func, *args, **kwargs)
        return await target_loop.run_in_executor(executor, bound_call)
    return _in_executor

In [10]:
def chat_response(temperature, model, message):
    """Send one user message to a chat model and return the stripped reply.

    Every call is a fresh two-message conversation: the fixed system prompt
    ``"You are a helpful assistant."`` followed by ``message``.

    Args:
        temperature: sampling temperature forwarded to the API.
        model: chat model name.
        message: content of the user message.

    Returns:
        The content of the first choice, with surrounding whitespace removed.
    """
    # Only override the module-level key when the variable is actually set;
    # otherwise a key configured earlier would be replaced by None.
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": message}
    ]
    raw_response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return raw_response['choices'][0]['message']['content'].strip()

# Awaitable variant of chat_response: async_wrap runs the blocking call through loop.run_in_executor.
async_chat_response = async_wrap(chat_response)

async def asyncChatResponse(temperature, model, message, response_list, messages_list, position=None):
    """Fetch one chat reply, report progress, and record the reply.

    Args:
        temperature: sampling temperature forwarded to ``async_chat_response``.
        model: chat model name.
        message: the user message to send.
        response_list: list the reply is appended to once it arrives.
        messages_list: the full batch; used for the progress line.
        position: optional 1-based position of ``message`` in the batch. When
            omitted it is looked up with ``messages_list.index``, which
            reports the first occurrence for duplicated messages.

    Returns:
        The reply, so callers can collect results from ``asyncio.gather`` in
        submission order.
    """
    start_time = time.perf_counter()
    response = await async_chat_response(temperature, model, message)
    elapsed = time.perf_counter() - start_time

    index = messages_list.index(message) + 1 if position is None else position
    length = len(messages_list)
    print(f"Response {index} of {length} complete.")
    print(f"Response time: {elapsed:0.2f} seconds.")
    response_list.append(response)
    return response


async def run_chat_async(messages_list, response_list, temperature=0.7, model='gpt-3.5-turbo'):
    """Send every message concurrently and add the replies to ``response_list``.

    Replies are appended to ``response_list`` in the same order as
    ``messages_list``. Previously each task appended its own reply as it
    finished, so the order followed completion time rather than prompt order.
    """
    # Workers append to this scratch list (completion order); the caller's
    # list is filled from gather's result, which preserves submission order.
    completion_order = []
    ordered = await asyncio.gather(*(
        asyncChatResponse(temperature, model, message, completion_order, messages_list, position)
        for position, message in enumerate(messages_list, start=1)
    ))
    response_list.extend(ordered)

In [28]:
def get_response(temperature, model, prompt, max_tokens=None):
    """Request a completion and return the stripped text of every choice.

    Args:
        temperature: sampling temperature forwarded to the API.
        model: completion model name.
        prompt: prompt forwarded to the API.
        max_tokens: optional completion length limit. When omitted nothing is
            sent and the API default applies (documented as 16 tokens for the
            legacy Completions endpoint — confirm for the installed version).

    Returns:
        A list with one stripped string per choice in the response.
    """
    # Only override the module-level key when the variable is actually set;
    # otherwise a key configured earlier would be replaced by None.
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key
    extra = {} if max_tokens is None else {"max_tokens": max_tokens}
    raw_response = openai.Completion.create(
        model=model,
        prompt=prompt,
        temperature=temperature,
        **extra
    )
    return [choice['text'].strip() for choice in raw_response['choices']]

# Awaitable variant of get_response: async_wrap runs the blocking call through loop.run_in_executor.
async_get_response = async_wrap(get_response)

async def asyncGetResponse(temperature, model, prompt, response_list, prompts_list, position=None):
    """Fetch one completion, report progress, and record the result.

    Args:
        temperature: sampling temperature forwarded to ``async_get_response``.
        model: completion model name.
        prompt: the prompt to send.
        response_list: list the result is appended to once it arrives.
        prompts_list: the full batch; used for the progress line.
        position: optional 1-based position of ``prompt`` in the batch. When
            omitted it is looked up with ``prompts_list.index``, which
            reports the first occurrence for duplicated prompts.

    Returns:
        The result of ``async_get_response``, so callers can collect results
        from ``asyncio.gather`` in submission order.
    """
    start_time = time.perf_counter()
    response = await async_get_response(temperature, model, prompt)
    elapsed = time.perf_counter() - start_time

    index = prompts_list.index(prompt) + 1 if position is None else position
    length = len(prompts_list)
    print(f"Response {index} of {length} complete.")
    print(f"Response time: {elapsed:0.2f} seconds.")
    response_list.append(response)
    return response


async def run_completion_async(prompts_list, response_list, temperature=0.7, model='text-davinci-003'):
    """Run every prompt concurrently and add the results to ``response_list``.

    Results are appended to ``response_list`` in the same order as
    ``prompts_list``. Previously each task appended its own result as it
    finished, so the order followed completion time rather than prompt order.
    """
    # Workers append to this scratch list (completion order); the caller's
    # list is filled from gather's result, which preserves submission order.
    completion_order = []
    ordered = await asyncio.gather(*(
        asyncGetResponse(temperature, model, prompt, completion_order, prompts_list, position)
        for position, prompt in enumerate(prompts_list, start=1)
    ))
    response_list.extend(ordered)

In [31]:
# Smoke test: one Completion call with a list of three prompts.
com = Completion(temperature=0.7, max_tokens=1000, stream=False, model="text-davinci-003")

# With more than one choice in the response, Completion.__call__ returns a list of stripped texts.
print(com(['who are you', 'what is your name', 'what is your favorite color?']))

['I am a digital assistant.', 'My name is Chris.', 'My favorite color is blue.']


In [27]:
# NOTE(review): `res` is not defined in any visible cell and the recorded output is `KeyError: 0`;
# this looks like a leftover debugging cell — confirm and remove.
res[0]

KeyError: 0

In [96]:
# Ten independent questions used to exercise the concurrent chat helper.
messages_list = [
    "What is your favorite color?",
    "What is your second favorite color?",
    "What is your third favorite color?", 
    "What is your favorite food?",
    "What is your favorite animal?", 
    "What is your favorite movie?", 
    "What is your favorite book?",
    "What is your favorite song?",
    "What is your favorite band?",
    "What is your favorite sport?",
]

# run_chat_async fills this list with the replies; each request prints its own timing line.
response_list = []
await run_chat_async(messages_list, response_list)

Response 4 of 10 complete.
Response time: 1.27 seconds.
Response 8 of 10 complete.
Response time: 1.34 seconds.
Response 5 of 10 complete.
Response time: 1.41 seconds.
Response 2 of 10 complete.
Response time: 1.46 seconds.
Response 10 of 10 complete.
Response time: 1.68 seconds.
Response 1 of 10 complete.
Response time: 1.77 seconds.
Response 3 of 10 complete.
Response time: 1.78 seconds.
Response 9 of 10 complete.
Response time: 1.99 seconds.
Response 6 of 10 complete.
Response time: 2.29 seconds.
Response 7 of 10 complete.
Response time: 2.47 seconds.


In [103]:
class Mongodoc:
    """A PDF document with per-page embeddings, token chunks and a citation.

    The methods return ``self`` so they can be chained, e.g.
    ``Mongodoc(path).process_doc().get_chunks(2800).get_citation("APA")``.
    ``get_chunks`` and ``get_citation`` read attributes that ``process_doc``
    sets, so ``process_doc`` must run first.
    """

    def __init__(self, fpath : str):
        self.fpath = fpath

    def process_doc(self):
        """Load the PDF, embed every page and cache the derived fields.

        Sets ``self.data`` (per-page dicts with ``page``, ``content``,
        ``metadata`` and ``embedding``), ``self.page_text`` (all pages passed
        through ``sanitize_text`` and joined with spaces) and
        ``self.metadata`` (one metadata entry per page).
        """
        loader = PyMuPDFLoader(self.fpath)
        # load the data
        unsanitized = loader.load()
        # make sure the metadata is not None
        data = sanitize_metadata(unsanitized)
        # get the doc embeddings
        doc_embeddings = embed_doc([page.page_content for page in data])
        # unpack the data and add the embeddings
        mongodoc = unpack(data)
        mongodoc = [{**page, "embedding": embedding} for page, embedding in zip(mongodoc, doc_embeddings)]

        self.data = mongodoc
        self.page_text = ' '.join([sanitize_text(page['content']) for page in mongodoc])
        self.metadata = [page['metadata'] for page in mongodoc]
        return self

    def get_chunks(self, chunk_size : int):
        """Split ``self.page_text`` into pieces of at most ``chunk_size`` tokens.

        Tokens are counted with the ``text-davinci-003`` tiktoken encoding.
        The decoded pieces are stored in ``self.chunks``.

        Raises:
            ValueError: if ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
        enc = tiktoken.encoding_for_model("text-davinci-003")
        tokens = enc.encode(self.page_text)
        # Honour the requested size; it used to be ignored in favour of a
        # hard-coded 2800.
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
        # decode chunks
        self.chunks = [enc.decode(chunk) for chunk in chunks]
        return self

    def get_citation(self, format: str):
        """Ask a chat model for a citation in ``format`` built from the page metadata.

        The reply is stored in ``self.citation``.
        """
        citation_chat = Chat(temperature=0.9)
        get_citation_prompt = f"Use this metadata to generate a ciation in {format} format: \n\n{self.metadata}"
        final_citation = citation_chat(get_citation_prompt)
        self.citation = final_citation
        return self

In [51]:
class Chain:
    """Sequential four-step LLM pipeline that turns a processed document into an essay.

    Each ``link_N`` method stores its result on the instance and returns
    ``self``. Later links read the results of earlier ones, so they must run
    in order (``chain()`` does that).
    """

    def __init__(self, mongodoc: "Mongodoc"):
        self.mongodoc = mongodoc
        # pass all the attributes from the mongodoc to the chain
        for attr in dir(mongodoc):
            value = getattr(mongodoc, attr)
            if not callable(value) and not attr.startswith("__"):
                setattr(self, attr, value)

    def link_1(self):
        """Ask for main ideas, quotes and passages for every chunk.

        Stores ``self.link_1_responses``: one list per chunk, holding three
        dicts (``Chunk``, ``prompt_type``, ``response``) in the order
        main_ideas, quotes, passages.
        """
        print("Starting Link 1:")
        citation_qualifier = f"Use this citation: {self.citation} to cite your work."
        main_ideas_prompt = f"Identify and list 2-3 main ideas from the context. {citation_qualifier}"
        quotes_prompt = f"Identify and list 2-3 relevant quotes from the context. {citation_qualifier}"
        passages_prompt = f"Identify and list 2-3 relevant passages from the context. {citation_qualifier}"

        prompts = [
            {"type": "main_ideas", "prompt": main_ideas_prompt},
            {"type": "quotes", "prompt": quotes_prompt},
            {"type": "passages", "prompt": passages_prompt}
        ]

        system_message = "You are a helpful assistant that is very good at problem solving who thinks step by step. You always cite direct quotes and paraphrases with the appropriate in-text citation."

        responses = []
        for idx, chunk in enumerate(self.chunks):
            print(f"Chunk {idx+1} of {len(self.chunks)}")
            page_responses = []
            for prompt in prompts:
                print(prompt["type"])
                # A fresh Chat per prompt keeps the requests independent.
                chat = Chat(temperature=0.9, system_message=system_message)
                response = chat(f"CONTEXT:{chunk}\n\nQUERY:{prompt['prompt']}")
                page_responses.append(
                    {"Chunk": idx+1, "prompt_type": prompt["type"], "response": response})
            responses.append(page_responses)
        self.link_1_responses = responses

        print("Link 1 Complete")
        return self

    def print_link_1(self):
        """Print every link 1 response with its chunk number and prompt type."""
        for page in self.link_1_responses:
            for response in page:
                print(f'Chunk: {response["Chunk"]}\nType: {response["prompt_type"]}\n\nResponse:\n{response["response"]}')

    def link_2(self):
        """Merge each chunk's three link 1 responses into one summary.

        Stores the summaries, one per chunk, in ``self.link_2_responses``.
        """
        print("Starting Link 2:")
        llm = Completion(temperature=0.9, max_tokens=1000)
        summary_responses = []
        for page in self.link_1_responses:
            combine_prompt = f"Combine the following Main Ideas:\n{page[0]['response']}\n\nQuotes:\n{page[1]['response']}\n\nPassages:\n{page[2]['response']}\n\ninto a coherent writing. Retain any in-text citations, don't add any new citations except for {self.citation}\n\nSUMMARY:"
            summary_responses.append(llm(combine_prompt))
        self.link_2_responses = summary_responses
        print("Link 2 Complete")
        return self

    def print_link_2(self):
        """Print every link 2 summary."""
        for response in self.link_2_responses:
            print(response)

    def link_3(self):
        """Combine all summaries into one essay, stored in ``self.link_3_response``."""
        print("Starting Link 3:")
        llm = Completion(temperature=0.9, max_tokens=1000)
        passages = ' '.join(self.link_2_responses)
        prompt = f"combine the following passages:{passages} into an essay. Retain your in-text citations and make sure to include a reference list at the end of your essay using this citation: {self.citation}. Make sure you dont repeat anything."
        response = llm(prompt)
        self.link_3_response = response
        print("Link 3 Complete")
        return self

    def print_link_3(self):
        """Print the link 3 essay."""
        print(self.link_3_response)

    def link_4(self):
        """Ask a chat model to finalize the essay against the APA guidelines.

        The result is stored in ``self.link_4_response``.
        """
        print("Starting Link 4:")
        llm = Chat(temperature=0.9)
        # The prompt refers to the guidelines by URL. The method used to read
        # ../data/apa_guidelines.txt first and then overwrite the text with
        # this URL; that unused read is gone so a missing file cannot abort
        # the chain. NOTE(review): confirm whether the file's text was meant
        # to be sent instead of the URL.
        guidelines = "https://owl.purdue.edu/owl/research_and_citation/apa_style/apa_formatting_and_style_guide/general_format.html"
        prompt = f"Essay:{self.link_3_response}\n\nFinalize thee essay based on the following APA guidelines: {guidelines}"
        response = llm(prompt)
        self.link_4_response = response
        print("Link 4 Complete")
        return self

    def print_link_4(self):
        """Print the link 4 result."""
        print(self.link_4_response)

    def chain(self):
        """Run links 1 to 4 in order and print the total elapsed time."""
        start = time.perf_counter()
        print("Starting Chain")
        self.link_1().link_2().link_3().link_4()
        elapsed = time.perf_counter() - start
        print(f"Chain Complete in {elapsed:0.2f} seconds.")
        return self

In [141]:
class AsyncChain:
    """Concurrent variant of the chain: links 1 and 2 fan their requests out.

    ``link_1`` and ``link_2`` are coroutines built on ``run_chat_async`` and
    ``run_completion_async``; ``link_3`` is a single blocking chat call. Each
    link stores its result on the instance and returns ``self``.
    """

    def __init__(self, mongodoc: "Mongodoc"):
        self.mongodoc = mongodoc
        # pass all the attributes from the mongodoc to the chain
        for attr in dir(mongodoc):
            value = getattr(mongodoc, attr)
            if not callable(value) and not attr.startswith("__"):
                setattr(self, attr, value)

    async def link_1(self):
        """Ask for main ideas, quotes and passages for every chunk, concurrently.

        Stores the flat reply list in ``self.link_1_responses`` and the
        grouped form in ``self.link_1_response_dicts``: one list per chunk,
        holding three dicts (``Chunk``, ``prompt_type``, ``response``).
        """
        print("Starting Link 1:")
        citation_qualifier = f"Use this citation: {self.citation} to cite your work."
        main_ideas_prompt = f"Identify and list 2-3 main ideas from the context. {citation_qualifier}"
        quotes_prompt = f"Identify and list 2-3 relevant quotes from the context. {citation_qualifier}"
        passages_prompt = f"Identify and list 2-3 relevant passages from the context. {citation_qualifier}"

        prompts = [
            {"type": "main_ideas", "prompt": main_ideas_prompt},
            {"type": "quotes", "prompt": quotes_prompt},
            {"type": "passages", "prompt": passages_prompt}
        ]

        # NOTE(review): run_chat_async is given the prompts only, so the
        # custom system prompt used by the synchronous Chain.link_1 is not
        # applied here.
        flattened_prompts = [
            f"CONTEXT:{chunk}\n\nQUERY:{prompt['prompt']}"
            for chunk in self.chunks
            for prompt in prompts
        ]

        responses = []
        await run_chat_async(flattened_prompts, responses)

        # Regroup the flat reply list into one group per chunk.
        # NOTE: this assumes run_chat_async fills `responses` in the same
        # order as `flattened_prompts`.
        n = len(prompts)
        response_chunks = [responses[i:i+n] for i in range(0, len(responses), n)]

        # format [[{chunk: 1, prompt_type: main_ideas, response: ...}, ...], ...]
        response_dicts = [
            [
                {"Chunk": idx+1, "prompt_type": prompt["type"], "response": response}
                for prompt, response in zip(prompts, chunk)
            ]
            for idx, chunk in enumerate(response_chunks)
        ]
        self.link_1_responses = responses
        self.link_1_response_dicts = response_dicts
        print("Link 1 Complete")
        return self

    def print_link_1(self):
        """Print every link 1 response with its chunk number and prompt type."""
        for page in self.link_1_response_dicts:
            for response in page:
                print(f'Chunk: {response["Chunk"]}\nType: {response["prompt_type"]}\n\nResponse:\n{response["response"]}\n')

    async def link_2(self):
        """Merge each chunk's three link 1 responses into one summary, concurrently.

        Stores one summary string per chunk in ``self.link_2_responses``.
        (An earlier, incomplete definition of this method was shadowed by
        this one and has been removed.)
        """
        print("Starting Link 2:")
        prompts = []
        for page in self.link_1_response_dicts:
            combine_prompt = f"Combine the following Main Ideas:\n{page[0]['response']}\n\nQuotes:\n{page[1]['response']}\n\nPassages:\n{page[2]['response']}\n\ninto a coherent writing. Retain any in-text citations, don't add any new citations except for {self.citation}\n\nSUMMARY:"
            prompts.append(combine_prompt)
        responses = []
        await run_completion_async(prompts, responses)
        # get_response returns a list of choice texts per prompt; collapse
        # each into one string so link_3 can join the summaries.
        self.link_2_responses = [
            " ".join(response) if isinstance(response, (list, tuple)) else response
            for response in responses
        ]
        print("Link 2 Complete")
        return self

    def print_link_2(self):
        """Print every link 2 summary."""
        for response in self.link_2_responses:
            print(response)

    def link_3(self):
        """Combine all summaries into one essay, stored in ``self.link_3_response``."""
        print("Starting Link 3:")
        chat = Chat(temperature=0.9)
        passages = ' '.join(self.link_2_responses)
        prompt = f"Combine the following passages:{passages} into an essay. Retain your in-text citations and make sure to include a reference list at the end of your essay using this citation: {self.citation}."
        response = chat(prompt)
        self.link_3_response = response
        print("Link 3 Complete")
        return self

    def print_link_3(self):
        """Print the link 3 essay."""
        print(self.link_3_response)

    async def chain(self):
        """Run links 1 to 3 in order, printing per-link and total timings."""
        overall_start = time.perf_counter()
        start = time.perf_counter()
        print("Starting Chain")
        await self.link_1()
        elapsed = time.perf_counter() - start
        print(f"Link 1 Complete in {elapsed:0.2f} seconds.")
        start = time.perf_counter()
        await self.link_2()
        elapsed = time.perf_counter() - start
        print(f"Link 2 Complete in {elapsed:0.2f} seconds.")
        start = time.perf_counter()
        self.link_3()
        # Measure before printing; the old order reported link 2's time here.
        elapsed = time.perf_counter() - start
        print(f"Link 3 Complete in {elapsed:0.2f} seconds.")
        overall_elapsed = time.perf_counter() - overall_start
        print(f"Chain Complete in {overall_elapsed:0.2f} seconds.")
        return self

In [104]:
# The second assignment overrides the first, so only moore-et-al-2022.pdf is processed.
fpath = "../data/powers2017.pdf"
fpath = "../data/moore-et-al-2022.pdf"
# Load and embed the PDF, split it into token chunks, then generate an APA citation.
doc = Mongodoc(fpath).process_doc().get_chunks(2800).get_citation("APA")

In [142]:
# Run the three links one at a time (the same steps AsyncChain.chain() performs, without its timing output).
async_chain = AsyncChain(doc)
await async_chain.link_1()
await async_chain.link_2()
# link_3 is a regular (non-async) method, so it is not awaited.
async_chain.link_3()
async_chain.print_link_3()

Starting Link 1:
Response 14 of 15 complete.
Response time: 3.49 seconds.
Response 8 of 15 complete.
Response time: 3.58 seconds.
Response 1 of 15 complete.
Response time: 4.21 seconds.
Response 3 of 15 complete.
Response time: 4.28 seconds.
Response 7 of 15 complete.
Response time: 4.31 seconds.
Response 11 of 15 complete.
Response time: 4.46 seconds.
Response 4 of 15 complete.
Response time: 4.92 seconds.
Response 9 of 15 complete.
Response time: 5.13 seconds.
Response 2 of 15 complete.
Response time: 5.30 seconds.
Response 10 of 15 complete.
Response time: 5.49 seconds.
Response 12 of 15 complete.
Response time: 5.84 seconds.
Response 6 of 15 complete.
Response time: 6.46 seconds.
Response 15 of 15 complete.
Response time: 6.99 seconds.
Response 13 of 15 complete.
Response time: 7.12 seconds.
Response 5 of 15 complete.
Response time: 10.45 seconds.
Link 1 Complete
Starting Link 2:
Response 1 of 5 complete.
Response time: 5.64 seconds.
Response 4 of 5 complete.
Response time: 6.04 se

In [143]:
# Show the essay stored in async_chain.link_3_response by the previous cell.
async_chain.print_link_3()

Moore et al. (2022) conducted a comprehensive review of the safety and effectiveness of N-methyl-D-aspartate (NMDA) receptor antagonists for the treatment of depression in Pharmacotherapy. The article discusses the potential benefits of these drugs in treating depression and other psychiatric conditions while also highlighting the safety concerns associated with their use.

Ketamine and esketamine, two NMDA receptor antagonists, have shown promising results in small trials for the treatment of depression. However, larger trials of esketamine nasal spray failed to replicate the positive outcomes seen in smaller studies. Safety concerns have also arisen due to animal toxicology studies of drug-induced abnormal neuron structures. Moreover, chronic ketamine exposure has been found to induce permanent impairment of brain functions in adolescent cynomolgus monkeys and hyperphosphorylated tau in the brains of mice and monkeys with long-term administration of ketamine.

The altered mental stat