In [18]:
# Load secrets from the project-level .env file before configuring the client.
from dotenv import load_dotenv
import os
load_dotenv("../.env")
# The rest of this notebook (Completion, Chat) reads the key from
# OPENAI_API_KEY, so look that name up first. The hyphenated spelling is kept
# only as a fallback for .env files that still use it.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or os.environ.get('OPENAI-API-KEY')
import openai
openai.api_key = OPENAI_API_KEY

In [52]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from operator import itemgetter
import numpy as np
import tiktoken

from typing import List, Dict, Tuple
import re
import time

import random
import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [7]:
def sanitize_metadata(data):
    """Replace every ``None`` metadata value with an empty string, in place.

    None values in the metadata cause errors with the vectorstore, so they
    are blanked out before the documents are used.

    Args:
        data: Iterable of objects exposing a dict-like ``metadata`` attribute.

    Returns:
        The same ``data`` object that was passed in (its items are mutated).
    """
    for document in data:
        fields = document.metadata
        # Collect the offending keys first, then overwrite them.
        empty_keys = [name for name, stored in fields.items() if stored is None]
        for name in empty_keys:
            fields[name] = ""
    return data

def unpack(data):
    """Convert loaded pages into plain dicts numbered from 1.

    Args:
        data: Iterable of objects with ``page_content`` and ``metadata``
            attributes.

    Returns:
        A list of dicts with keys 'page' (1-based position in ``data``),
        'content' and 'metadata'.
    """
    records = []
    for page_number, page in enumerate(data, start=1):
        records.append({
            'page': page_number,
            'content': page.page_content,
            'metadata': page.metadata,
        })
    return records

def dot_product_similarity(doc_data: List[Dict], query_data: Dict, top_k: int = 5) -> List[Tuple[int, float]]:
    """Rank pages by the dot product of their embedding with the query embedding.

    Args:
        doc_data: Page records; each must provide 'page' and 'embedding' keys.
        query_data: Query record providing an 'embedding' key.
        top_k: Maximum number of results to return. Defaults to 5, which is
            the value that used to be hard-coded.

    Returns:
        Up to ``top_k`` ``(page, score)`` tuples sorted by descending score.
        Pages with equal scores keep their order from ``doc_data``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop results from the tail.
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    query_embedding = query_data['embedding']
    scored_pages = [
        (page['page'], np.dot(query_embedding, page['embedding']))
        for page in doc_data
    ]
    # list.sort is stable, so ties are resolved by original document order.
    scored_pages.sort(key=itemgetter(1), reverse=True)
    return scored_pages[:top_k]

def embed_query(query: str):
    """Embed a single query string with ``OpenAIEmbeddings``.

    Args:
        query: The text to embed.

    Returns:
        A dict with the original text under "query" and the vector returned
        by ``OpenAIEmbeddings().embed_query`` under "embedding".
    """
    embedder = OpenAIEmbeddings()
    query_vector = embedder.embed_query(query)
    return dict(query=query, embedding=query_vector)

def embed_doc(text_list : List[str]):
    """Embed a batch of texts with ``OpenAIEmbeddings``.

    Args:
        text_list: The texts to embed.

    Returns:
        Whatever ``OpenAIEmbeddings().embed_documents`` returns for
        ``text_list``.
    """
    return OpenAIEmbeddings().embed_documents(text_list)

def get_context(query_data, doc_data: List[Dict]) -> List[Dict]:
    """Collect the text and metadata of the pages most similar to the query.

    Args:
        query_data: Query record passed through to ``dot_product_similarity``.
        doc_data: Page records. A page number ``p`` returned by the ranking
            is looked up at index ``p - 1``, so records are expected to be
            stored in page order.

    Returns:
        One dict per ranked page, in ranking order, with keys 'page',
        'similarity', 'text' and 'metadata'.
    """
    return [
        {
            'page': page_number,
            'similarity': score,
            'text': doc_data[page_number - 1]['content'],
            'metadata': doc_data[page_number - 1]['metadata'],
        }
        for page_number, score in dot_product_similarity(doc_data, query_data)
    ]

def get_tokens(string: str, model: str) -> int:
    """Count the tokens ``string`` produces under the tiktoken encoding for ``model``."""
    return len(tiktoken.encoding_for_model(model).encode(string))

def format_context(context: List[Dict], model: str, token_limit : int) -> Tuple[str, list]:
    """Join retrieved pages into one prompt string capped at ``token_limit`` tokens.

    Each page is appended as a "Page: <n>" line and a "Text: <text>" line,
    each followed by a blank line, with newlines inside the page text
    replaced by spaces. As soon as the accumulated string exceeds
    ``token_limit`` tokens it is truncated to exactly the first
    ``token_limit`` tokens and no further pages are added.

    Args:
        context: Page dicts with 'page', 'text' and 'metadata' keys.
        model: Model name used to select the tiktoken encoding.
        token_limit: Maximum number of tokens allowed in the result.

    Returns:
        A ``(context_string, meta_list)`` tuple. ``meta_list`` holds the
        metadata of every page that was appended, including the page that
        triggered the truncation.

    Raises:
        AssertionError: If the truncated string re-encodes to more than
            ``token_limit`` tokens.
    """
    context_string = ""
    meta_list = []
    encoding = tiktoken.encoding_for_model(model)
    for item in context:
        sanitized_text = item['text'].replace("\n", " ")
        context_string += f"Page: {item['page']}\n\nText: {sanitized_text}\n\n"
        meta_list.append(item['metadata'])
        # Encode once per page and reuse the tokens for both the length check
        # and the truncation.
        encoded_text = encoding.encode(context_string)
        if len(encoded_text) > token_limit:
            # cut it down to the token limit and decode it back to a string
            context_string = encoding.decode(encoded_text[:token_limit])
            # sanity check that the round trip stayed within the limit
            tokens = len(encoding.encode(context_string))
            assert tokens <= token_limit, f"format context function failed to cut context down far enough. tokens: {tokens}"
            break
    return context_string, meta_list

In [13]:
def sanitize_text(text):
    """Strip punctuation and normalise whitespace.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace becomes a space; runs of whitespace are then
    collapsed to a single space and the ends are trimmed.
    """
    without_symbols = re.sub(r'[^\w\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

# OpenAI Classes

In [16]:
class Completion:
    """Callable wrapper around ``openai.Completion.create`` with fixed settings.

    Args:
        temperature: Sampling temperature sent with every request.
        max_tokens: Completion length limit sent with every request.
        stream: When True the request is made with ``stream=True`` and the
            raw response is returned to the caller instead of a string.
        model: Completion model name.
        **kwargs: Extra keyword arguments forwarded to
            ``openai.Completion.create`` on every call.
    """

    def __init__(self, temperature, max_tokens, stream=False, model="text-davinci-003", **kwargs):
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.stream = stream
        self.model = model
        self.kwargs = kwargs

        # Only override the client's key when the environment provides one, so
        # a key configured elsewhere is not replaced with None.
        api_key = os.getenv("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

    def __call__(self, text):
        """Send ``text`` as the prompt.

        Returns:
            The raw response when ``stream`` is True, otherwise the stripped
            text of the first choice.
        """
        raw_response = openai.Completion.create(
            model=self.model,
            prompt=text,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            # Forward the flag: previously it was stored but never sent, so
            # stream=True had no effect on the request itself.
            stream=self.stream,
            **self.kwargs
        )
        if self.stream:
            return raw_response
        return raw_response['choices'][0]['text'].strip()


class Chat:
    """Stateful wrapper around ``openai.ChatCompletion.create``.

    The full conversation is kept in ``self.messages`` and re-sent on every
    call, so each instance represents one ongoing conversation.

    Args:
        temperature: Sampling temperature sent with every request.
        system_message: Content of the leading system message.
        messages: Optional strings that are added to the history as user
            messages, in order, after the system message.
        model: Chat model name.
    """

    def __init__(self, temperature, system_message="You are a helpful assistant.", messages=None, model='gpt-3.5-turbo'):
        self.messages = [{"role": "system", "content": system_message}]
        if messages is not None:
            self.messages += [{"role": "user", "content": message} for message in messages]
        self.model = model
        self.temperature = temperature

        # Only override the client's key when the environment provides one, so
        # a key configured elsewhere is not replaced with None.
        api_key = os.getenv("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

    def __call__(self, user_message: str):
        """Append ``user_message`` to the history, request a reply and return it.

        Returns:
            The stripped content of the first choice's message.
        """
        self.messages.append({"role": "user", "content": user_message})
        raw_response = openai.ChatCompletion.create(
            model=self.model,
            messages=self.messages,
            temperature=self.temperature,
        )
        response_message = raw_response['choices'][0]['message']['content'].strip()
        # Store the reply as a role-tagged dict like every other entry. A bare
        # string in the history makes the next request's message list malformed.
        self.messages.append({"role": "assistant", "content": response_message})
        return response_message

In [10]:
class Mongodoc:
    """A PDF loaded page by page, with embeddings, token chunks and a citation.

    The processing methods return ``self`` so they can be chained:
    ``Mongodoc(path).process_doc().get_chunks(2800).get_citation("APA")``.

    Args:
        fpath: Path of the PDF file to load.
    """

    def __init__(self, fpath : str):
        self.fpath = fpath

    def process_doc(self):
        """Load the PDF, embed every page and populate the document attributes.

        Sets:
            data: One dict per page with 'page', 'content', 'metadata' and
                'embedding' keys.
            page_text: The sanitized text of all pages joined by spaces.
            metadata: The per-page metadata, in page order.

        Returns:
            self
        """
        loader = PyMuPDFLoader(self.fpath)
        # load the data
        unsanitized = loader.load()
        # make sure the metadata is not None
        data = sanitize_metadata(unsanitized)
        # get the doc embeddings
        doc_embeddings = embed_doc([page.page_content for page in data])
        # unpack the data and add the embeddings
        mongodoc = unpack(data)
        mongodoc = [{**page, "embedding": embedding} for page, embedding in zip(mongodoc, doc_embeddings)]

        self.data = mongodoc
        self.page_text = ' '.join([sanitize_text(page['content']) for page in mongodoc])
        self.metadata = [page['metadata'] for page in mongodoc]
        return self

    def get_chunks(self, chunk_size : int):
        """Split ``page_text`` into consecutive chunks of ``chunk_size`` tokens.

        Tokenization uses the tiktoken encoding for "text-davinci-003". The
        last chunk may be shorter. Requires ``process_doc`` to have set
        ``page_text``.

        Args:
            chunk_size: Number of tokens per chunk; must be at least 1.

        Sets:
            chunks: The decoded chunk strings, in document order.

        Returns:
            self

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        enc = tiktoken.encoding_for_model("text-davinci-003")
        tokens = enc.encode(self.page_text)
        # Honour the requested size; it used to be ignored in favour of a
        # hard-coded 2800.
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
        # decode chunks
        self.chunks = [enc.decode(chunk) for chunk in chunks]
        return self

    def get_citation(self, format: str):
        """Ask a ``Chat`` model to build a citation from the page metadata.

        Requires ``process_doc`` to have set ``metadata``.

        Args:
            format: Citation style name inserted into the prompt (e.g. "APA").

        Sets:
            citation: The model's reply.

        Returns:
            self
        """
        citation_chat = Chat(temperature=0.9)
        get_citation_prompt = f"Use this metadata to generate a ciation in {format} format: \n\n{self.metadata}"
        final_citation = citation_chat(get_citation_prompt)
        self.citation = final_citation
        return self

In [51]:
class Chain:
    """Four-step LLM pipeline that turns a processed document into an essay.

    The links run in order and each stores its result on the instance:
    ``link_1`` queries every chunk, ``link_2`` merges the answers per chunk,
    ``link_3`` merges those summaries into one essay and ``link_4`` asks for
    a final pass against the APA guidelines. Every link returns ``self`` so
    calls can be chained.

    Args:
        mongodoc: A document whose non-callable attributes (notably
            ``chunks`` and ``citation``) are copied onto the chain.
    """

    def __init__(self, mongodoc: "Mongodoc"):
        self.mongodoc = mongodoc
        # pass all the attributes from the mongodoc to the chain
        for attr in dir(mongodoc):
            if attr.startswith("__"):
                continue
            value = getattr(mongodoc, attr)
            if not callable(value):
                setattr(self, attr, value)

    def link_1(self):
        """Ask for main ideas, quotes and passages for every chunk.

        Sets:
            link_1_responses: One list per chunk holding three dicts
                ('Chunk', 'prompt_type', 'response'), in the order
                main_ideas, quotes, passages.

        Returns:
            self
        """
        print("Starting Link 1:")
        citation_qualifier = f"Use this citation: {self.citation} to cite your work."
        main_ideas_prompt = f"Identify and list 2-3 main ideas from the context. {citation_qualifier}"
        quotes_prompt = f"Identify and list 2-3 relevant quotes from the context. {citation_qualifier}"
        passages_prompt = f"Identify and list 2-3 relevant passages from the context. {citation_qualifier}"

        prompts = [
            {"type": "main_ideas", "prompt": main_ideas_prompt},
            {"type": "quotes", "prompt": quotes_prompt},
            {"type": "passages", "prompt": passages_prompt}
        ]

        system_message = "You are a helpful assistant that is very good at problem solving who thinks step by step. You always cite direct quotes and paraphrases with the appropriate in-text citation."

        responses = []
        for idx, chunk in enumerate(self.chunks):
            print(f"Chunk {idx+1} of {len(self.chunks)}")
            page_responses = []
            for prompt in prompts:
                print(prompt["type"])
                # A fresh Chat per prompt keeps each query independent of the
                # previous conversation history.
                chat = Chat(temperature=0.9, system_message=system_message)
                response = chat(f"CONTEXT:{chunk}\n\nQUERY:{prompt['prompt']}")
                page_responses.append(
                    {"Chunk": idx+1, "prompt_type": prompt["type"], "response": response})
            responses.append(page_responses)
        self.link_1_responses = responses

        print("Link 1 Complete")
        return self

    def print_link_1(self):
        """Print every link 1 response with its chunk number and prompt type."""
        for page in self.link_1_responses:
            for response in page:
                print(f'Chunk: {response["Chunk"]}\nType: {response["prompt_type"]}\n\nResponse:\n{response["response"]}')

    def link_2(self):
        """Combine each chunk's three link 1 answers into one summary.

        Relies on the link 1 ordering: index 0 is main ideas, 1 is quotes and
        2 is passages.

        Sets:
            link_2_responses: One summary string per chunk.

        Returns:
            self
        """
        print("Starting Link 2:")
        llm = Completion(temperature=0.9, max_tokens=1000)
        summary_responses = []
        for page in self.link_1_responses:
            combine_prompt = f"Combine the following Main Ideas:\n{page[0]['response']}\n\nQuotes:\n{page[1]['response']}\n\nPassages:\n{page[2]['response']}\n\ninto a coherent writing. Retain any in-text citations, don't add any new citations except for {self.citation}\n\nSUMMARY:"
            summary_response = llm(combine_prompt)
            summary_responses.append(summary_response)
        self.link_2_responses = summary_responses
        print("Link 2 Complete")
        return self

    def print_link_2(self):
        """Print every link 2 summary."""
        for response in self.link_2_responses:
            print(response)

    def link_3(self):
        """Merge all link 2 summaries into a single essay.

        Sets:
            link_3_response: The essay text.

        Returns:
            self
        """
        print("Starting Link 3:")
        llm = Completion(temperature=0.9, max_tokens=1000)
        prompt = f"combine the following passages:{' '.join(self.link_2_responses)} into an essay. Retain your in-text citations and make sure to include a reference list at the end of your essay using this citation: {self.citation}. Make sure you dont repeat anything."
        response = llm(prompt)
        self.link_3_response = response
        print("Link 3 Complete")
        return self

    def print_link_3(self):
        """Print the link 3 essay."""
        print(self.link_3_response)

    def link_4(self):
        """Ask a chat model to finalize the essay against the APA guidelines.

        Sets:
            link_4_response: The finalized essay text.

        Returns:
            self
        """
        print("Starting Link 4:")
        llm = Chat(temperature=0.9)
        # The prompt references the guidelines by URL. An earlier read of
        # ../data/apa_guidelines.txt was discarded by this assignment, so it
        # has been removed: it only added a way for this step to fail.
        guidelines = "https://owl.purdue.edu/owl/research_and_citation/apa_style/apa_formatting_and_style_guide/general_format.html"
        prompt = f"Essay:{self.link_3_response}\n\nFinalize thee essay based on the following APA guidelines: {guidelines}"
        response = llm(prompt)
        self.link_4_response = response
        print("Link 4 Complete")
        return self

    def print_link_4(self):
        """Print the finalized essay from link 4."""
        print(self.link_4_response)

    def chain(self):
        """Run links 1 to 4 in order and report the elapsed time.

        Returns:
            self
        """
        start = time.perf_counter()
        print("Starting Chain")
        self.link_1().link_2().link_3().link_4()
        elapsed = time.perf_counter() - start
        print(f"Chain Complete in {elapsed:0.2f} seconds.")
        return self

In [46]:
# Candidate source PDFs. Only the last assignment takes effect, so the
# powers2017 path is overridden and moore-et-al-2022 is the file processed.
fpath = "../data/powers2017.pdf"
fpath = "../data/moore-et-al-2022.pdf"
# Load and embed the PDF, split its text into token chunks, then generate an
# APA citation from the metadata.
doc = Mongodoc(fpath).process_doc().get_chunks(2800).get_citation("APA")

In [47]:
# Wrap the processed document in a Chain and run the first link, which sends
# three prompts (main ideas, quotes, passages) for every chunk.
chain = Chain(doc)
chain.link_1()

Starting Link 1:
Chunk 1 of 5
main_ideas
quotes
passages
Chunk 2 of 5
main_ideas
quotes
passages
Chunk 3 of 5
main_ideas
quotes
passages
Chunk 4 of 5
main_ideas
quotes
passages
Chunk 5 of 5
main_ideas
quotes
passages
Link 1 Complete


<__main__.Chain at 0x7fc34af954f0>

In [48]:
# Combine each chunk's link 1 answers into one summary per chunk.
chain.link_2()

Starting Link 2:
Link 2 Complete


<__main__.Chain at 0x7fc34af954f0>

In [49]:
# Merge the per-chunk summaries into an essay, then request the final APA pass.
chain.link_3()
chain.link_4()

Starting Link 3:
Link 3 Complete
Starting Link 4:
Link 4 Complete


<__main__.Chain at 0x7fc34af954f0>

In [50]:
# Show the finalized essay produced by link 4.
chain.print_link_4()

Ketamine and esketamine are non-competitive non-selective antagonists of the N-methyl-D-aspartate (NMDA) receptor for glutamate, which have been repurposed as off-label treatments for depression and other psychiatric disorders. These drugs block the NMDA receptor, leading to varied changes in perception, mood, behavior, and sedation. Moore et al. (2022) provide an extensive review of the efficacy and safety of NMDA receptor antagonists for depression, with a focus on ketamine and esketamine. The authors assess the clinical trials establishing the effectiveness of ketamine for depression, treatment-resistant depression, and bipolar depression. While ketamine and esketamine have been shown to be effective in reducing depression scores, there are still substantial questions about their safety and effectiveness for psychiatric disorders.

Toxicology studies of esketamine were conducted to support its FDA approval in 2019, but these studies were limited to a single dose in adult rats and di