In [89]:
import os

import openai
from dotenv import load_dotenv

# Pull variables from the project-level .env file into the process environment.
load_dotenv("../.env")

# The key is read from the conventional underscore name first (the same name
# Completion.__init__ reads via os.getenv further down in this notebook), and
# falls back to the hyphenated name this cell originally used so an existing
# .env file keeps working.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or os.environ.get('OPENAI-API-KEY')
openai.api_key = OPENAI_API_KEY

In [91]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import TextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyMuPDFLoader
from langchain.llms import OpenAI
from langchain.llms import OpenAIChat
from operator import itemgetter
import numpy as np
import tiktoken

import random
import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from typing import List, Dict, Tuple

# Functions

In [4]:
# this prevents the metadata from being None which causes errors with the vectorstore
def sanitize_metadata(data):
    """Replace every ``None`` metadata value with an empty string, in place.

    Args:
        data: Iterable of objects exposing a dict-like ``metadata`` attribute.

    Returns:
        The very same ``data`` object that was passed in; its items' metadata
        mappings are mutated.
    """
    for doc in data:
        fields = doc.metadata
        # Only values of existing keys are reassigned, so the mapping never
        # changes size while it is being iterated.
        for name in fields:
            if fields[name] is None:
                fields[name] = ""
    return data

def unpack(data):
    """Turn loaded page objects into plain dicts.

    Args:
        data: Iterable of objects with ``page_content`` and ``metadata``
            attributes.

    Returns:
        A list with one dict per item, holding a 1-based ``page`` number
        (position in ``data``), the page ``content`` and its ``metadata``.
    """
    records = []
    for page_number, page in enumerate(data, start=1):
        records.append({'page': page_number, 'content': page.page_content, 'metadata': page.metadata})
    return records

def dot_product_similarity(doc_data: List[Dict], query_data: Dict, top_k: int = 5) -> List[Tuple[int, float]]:
    """Rank pages by the dot product between their embedding and the query's.

    Args:
        doc_data: Page dicts, each with a ``'page'`` identifier and an
            ``'embedding'`` vector.
        query_data: Dict holding the query vector under ``'embedding'``.
        top_k: Maximum number of results to return (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        Up to ``top_k`` ``(page, score)`` tuples, highest score first. Pages
        with equal scores keep their input order because ``sorted`` is stable.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop results from the tail instead).
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    query_embedding = query_data['embedding']
    scored_pages = [(page['page'], np.dot(query_embedding, page['embedding'])) for page in doc_data]
    return sorted(scored_pages, key=itemgetter(1), reverse=True)[:top_k]

def embed_query(query: str):
    """Embed a single query string with ``OpenAIEmbeddings``.

    Returns:
        A dict carrying the original ``"query"`` text and its ``"embedding"``.
    """
    embedder = OpenAIEmbeddings()
    return {"query": query, "embedding": embedder.embed_query(query)}

def embed_doc(text_list : List[str]):
    """Embed a list of texts with ``OpenAIEmbeddings``.

    Returns:
        Whatever ``OpenAIEmbeddings.embed_documents`` returns for ``text_list``
        (presumably one vector per input text — confirm against the library).
    """
    return OpenAIEmbeddings().embed_documents(text_list)

def get_context(query_data, doc_data: List[Dict]) -> List[Dict]:
    """Collect the text and metadata of the pages most similar to the query.

    Args:
        query_data: Dict with the query vector under ``'embedding'``.
        doc_data: Page dicts with ``'page'``, ``'content'``, ``'metadata'`` and
            ``'embedding'`` keys.

    Returns:
        One dict per page returned by ``dot_product_similarity`` (best match
        first) with ``'page'``, ``'similarity'``, ``'text'`` and ``'metadata'``.
    """
    ranked_pages = dot_product_similarity(doc_data, query_data)

    # Resolve pages by their 'page' value instead of assuming that page N sits
    # at list index N-1; that assumption breaks as soon as doc_data is
    # filtered or reordered. The first occurrence of a page number wins.
    pages_by_number = {}
    for page in doc_data:
        pages_by_number.setdefault(page['page'], page)

    context = []
    for page_number, similarity in ranked_pages:
        page = pages_by_number[page_number]
        context.append({
            'page': page_number,
            'similarity': similarity,
            'text': page['content'],
            'metadata': page['metadata'],
        })
    return context

def get_tokens(string: str, model: str) -> int:
    """Count the tokens ``string`` occupies under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(string))

def format_context(context: List[Dict], model: str, token_limit : int) -> Tuple[str, List]:
    """Render context pages into one prompt string capped at ``token_limit`` tokens.

    Pages are appended in order as ``"Page: <n>\\n\\nText: <text>\\n\\n"`` (with
    newlines inside the text flattened to spaces). As soon as the accumulated
    string exceeds ``token_limit`` tokens it is truncated to exactly that many
    tokens and no further pages are added.

    Args:
        context: Dicts with ``'page'``, ``'text'`` and ``'metadata'`` keys.
        model: Model name used to select the tiktoken encoding.
        token_limit: Maximum number of tokens allowed in the returned string.

    Returns:
        A ``(context_string, meta_list)`` tuple; ``meta_list`` holds the
        metadata of every page that was (at least partially) included.

    Raises:
        AssertionError: If the truncated string still re-encodes to more than
            ``token_limit`` tokens.
    """
    # Build the encoding once and reuse it for every measurement.
    encoding = tiktoken.encoding_for_model(model)
    context_string = ""
    meta_list = []
    for item in context:
        sanitized_text = item['text'].replace("\n", " ")
        context_string += f"Page: {item['page']}\n\nText: {sanitized_text}\n\n"
        meta_list.append(item['metadata'])
        # One encode serves both the length check and the truncation.
        encoded_text = encoding.encode(context_string)
        if len(encoded_text) > token_limit:
            context_string = encoding.decode(encoded_text[:token_limit])
            # Decoding and re-encoding is not guaranteed to round-trip to the
            # same token count, so verify the result really fits.
            tokens = len(encoding.encode(context_string))
            assert tokens <= token_limit, f"format context function failed to cut context down far enough. tokens: {tokens}"
            break
    return context_string, meta_list
    
def upload_doc(fpath):
    """Load a PDF, embed every page and return one record per page.

    Args:
        fpath: Path handed to ``PyMuPDFLoader``.

    Returns:
        The dicts produced by ``unpack`` (``page``, ``content``, ``metadata``),
        each extended with an ``"embedding"`` entry from ``embed_doc``.
    """
    # None metadata values are replaced before anything else touches the pages.
    pages = sanitize_metadata(PyMuPDFLoader(fpath).load())

    # Embeddings are requested for all page texts in one call.
    vectors = embed_doc([page.page_content for page in pages])

    records = unpack(pages)
    return [{**record, "embedding": vector} for record, vector in zip(records, vectors)]

## Kmeans

In [None]:
class DFKMeans:
    """K-means clustering of embeddings held in a pandas DataFrame.

    The working frame ``self.df`` has one row per text chunk. It must contain
    an embedding column (``embedding_col``) and, for ``get_random_subsamble``,
    a text column (``text_col``). Clustering adds a ``"cluster"`` column.
    """

    def __init__(self, data: List[Dict], embedding_col: str = "embedding", text_col: str = "text"):
        """Store the input and build the working DataFrame.

        Args:
            data: A list of row dicts, or an existing DataFrame (copied so the
                caller's frame is not modified by clustering).
            embedding_col: Name of the column holding the embedding vectors.
            text_col: Name of the column holding the chunk text.
        """
        self.data = data
        self.df = data.copy() if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
        self.embedding_col = embedding_col
        self.text_col = text_col

    def _require_clusters(self):
        """Fail with a clear message when no clustering has been run yet."""
        if "cluster" not in self.df.columns:
            raise RuntimeError("No 'cluster' column found; call cluster() or cluster_optimized() first.")

    def create_embedding_matrix(self):
        """Stack the embedding column into ``self.matrix`` (one row per sample).

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If the embedding column is missing or has no rows.
        """
        if self.embedding_col not in self.df.columns:
            raise ValueError(f"Column {self.embedding_col!r} not found in data.")
        embeddings = self.df[self.embedding_col].tolist()
        if not embeddings:
            raise ValueError("Cannot build an embedding matrix from empty data.")
        # Embeddings read back from CSV arrive as strings and must be parsed.
        # NOTE(review): eval() executes arbitrary code; only use this on files
        # you produced yourself (consider ast.literal_eval for untrusted input).
        embeddings = [eval(embedding) if isinstance(embedding, str) else embedding for embedding in embeddings]
        self.matrix = np.vstack(embeddings)
        return self

    def cluster(self, n_clusters: int):
        """Run K-means with ``n_clusters`` and store labels in ``df['cluster']``.

        Returns:
            ``self``, to allow chaining.
        """
        if not hasattr(self, "matrix"):
            self.create_embedding_matrix()
        kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42, n_init=10)
        kmeans.fit(self.matrix)
        self.df["cluster"] = kmeans.labels_
        return self

    def cluster_optimized(self, max_clusters: int = 10):
        """Choose the cluster count (2..max_clusters) with the best silhouette score.

        Returns:
            ``self``, with the winning labels stored in ``df['cluster']``.

        Raises:
            ValueError: If there are too few samples (or ``max_clusters`` is
                too small) to try at least two clusters.
        """
        if not hasattr(self, "matrix"):
            self.create_embedding_matrix()
        # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
        upper = min(max_clusters, self.matrix.shape[0] - 1)
        if upper < 2:
            raise ValueError(
                "cluster_optimized needs max_clusters >= 2 and at least 3 samples; "
                f"got max_clusters={max_clusters}, n_samples={self.matrix.shape[0]}."
            )
        best_score = -1
        best_labels = None
        for n_clusters in range(2, upper + 1):
            kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42, n_init=10)
            kmeans.fit(self.matrix)
            labels = kmeans.labels_
            score = silhouette_score(self.matrix, labels)
            if best_labels is None or score > best_score:
                best_score = score
                best_labels = labels
        # The fit is seeded, so reusing the winning labels matches a re-fit.
        self.df["cluster"] = best_labels
        return self

    def get_cluster(self, cluster: int):
        """Return the rows of ``self.df`` whose label equals ``cluster``."""
        self._require_clusters()
        return self.df[self.df["cluster"] == cluster]

    def tnse_plot(self):
        """Scatter-plot a 2-D t-SNE projection of the embeddings, coloured by cluster.

        Each cluster's points are drawn semi-transparent and its mean position
        is marked with an ``x``.
        """
        self._require_clusters()
        if not hasattr(self, "matrix"):
            self.create_embedding_matrix()
        tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
        vis_dims2 = tsne.fit_transform(self.matrix)

        x = vis_dims2[:, 0]
        y = vis_dims2[:, 1]
        labels = self.df["cluster"].to_numpy()
        n_clusters = len(np.unique(labels))

        for category, color in enumerate(random_tableau_colors(n_clusters)):
            mask = labels == category
            xs = x[mask]
            ys = y[mask]
            plt.scatter(xs, ys, color=color, alpha=0.3)
            plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
        plt.title("Clusters identified visualized in language 2d using t-SNE")

    def get_random_subsamble(self, cluster : int, n_samples : int, token_limit : int, model: str = "text-davinci-003"):
        """Draw random texts from a cluster and trim them to a shared token budget.

        Args:
            cluster: Cluster label to sample from.
            n_samples: Number of rows to draw; capped at the cluster size.
            token_limit: Total budget; each sample gets ``token_limit // n_samples``.
            model: Model name passed to ``get_tokens`` for token counting.

        Returns:
            The trimmed samples, each prefixed with ``"Chunk <i>: "`` and joined
            by blank lines; an empty string when nothing can be sampled.

        Raises:
            AssertionError: If the joined result exceeds the budget plus the
                allowance for the separators.
        """
        members = self.get_cluster(cluster)
        # handle edge case where n_samples > number of samples in cluster
        n_samples = min(n_samples, len(members))
        if n_samples <= 0:
            # Nothing to sample; this also avoids dividing the budget by zero.
            return ""

        batch = members.sample(n_samples)[self.text_col].tolist()
        sample_token_limit = token_limit // n_samples
        # Rough pre-cut at ~4 characters per token before exact counting.
        est_char_limit = sample_token_limit * 4

        subsamples = []
        for idx, raw_text in enumerate(batch):
            sample_text = raw_text.replace('\n', ' ').strip()
            subsample = f'Chunk {idx+1}: ' + sample_text[:est_char_limit]
            # Trim one character at a time, and only while over budget.
            while subsample and get_tokens(subsample, model) > sample_token_limit:
                subsample = subsample[:-1]
            subsamples.append(subsample)
        final_subsample = '\n\n'.join(subsamples)
        # 2*n_samples extra tokens are allowed for the blank-line separators.
        assert get_tokens(final_subsample, model) <= token_limit + 2*n_samples, "Subsample is too long."

        return final_subsample

# ! PLOTS ---------------------------------------------------------------------


def random_colors(n):
    """Return ``n`` CSS4 colour names picked in random order.

    The full list of ``mcolors.CSS4_COLORS`` names is shuffled with the global
    ``random`` state and the first ``n`` entries are returned.
    """
    shuffled_names = [name for name in mcolors.CSS4_COLORS.keys()]
    random.shuffle(shuffled_names)
    return shuffled_names[:n]

def random_tableau_colors(n):
    """Return ``n`` Tableau colour names, cycling the shuffled palette as needed.

    The ten-colour palette is shuffled once with the global ``random`` state;
    the result repeats that order every ten entries.
    """
    palette = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan',
    ]
    random.shuffle(palette)
    full_cycles, remainder = divmod(n, 10)
    return palette * full_cycles + palette[:remainder]

In [45]:
def random_colors(n):
    """Return ``n`` CSS4 colour names picked in random order.

    The full list of ``mcolors.CSS4_COLORS`` names is shuffled with the global
    ``random`` state and the first ``n`` entries are returned.

    NOTE(review): a function with this name and behaviour is also defined in
    the "PLOTS" section of an earlier cell; consider keeping only one.
    """
    shuffled_names = [name for name in mcolors.CSS4_COLORS.keys()]
    random.shuffle(shuffled_names)
    return shuffled_names[:n]

def random_tableau_colors(n):
    """Return ``n`` Tableau colour names, cycling the shuffled palette as needed.

    The ten-colour palette is shuffled once with the global ``random`` state;
    the result repeats that order every ten entries.

    NOTE(review): a function with this name and behaviour is also defined in
    the "PLOTS" section of an earlier cell; consider keeping only one.
    """
    palette = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan',
    ]
    random.shuffle(palette)
    full_cycles, remainder = divmod(n, 10)
    return palette * full_cycles + palette[:remainder]

class DocKMeans:
    """K-means clustering over a list of page dicts that carry embeddings.

    Each dict in ``self.data`` must have an ``'embedding'`` entry. Clustering
    replaces ``self.data`` with copies of the dicts extended by a ``'cluster'``
    label.
    """

    def __init__(self, data: List[Dict]):
        """Keep a reference to the page dicts to be clustered."""
        self.data = data

    def create_embedding_matrix(self):
        """Stack all page embeddings into ``self.matrix`` (one row per page).

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If there are no pages to stack.
        """
        if not self.data:
            raise ValueError("Cannot build an embedding matrix from empty data.")
        embeddings = [page['embedding'] for page in self.data]
        # Embeddings read back from CSV arrive as strings and must be parsed.
        # NOTE(review): eval() executes arbitrary code; only use this on files
        # you produced yourself (consider ast.literal_eval for untrusted input).
        embeddings = [eval(embedding) if isinstance(embedding, str) else embedding for embedding in embeddings]
        self.matrix = np.vstack(embeddings)
        return self

    def _assign_labels(self, labels):
        """Rebuild ``self.data`` with a plain-int ``'cluster'`` entry per page."""
        # int() strips the numpy integer type so the dicts hold only built-in
        # values (presumably needed when the records are serialised — confirm).
        self.data = [dict(page, cluster=int(label)) for page, label in zip(self.data, labels)]

    def cluster(self, n_clusters: int):
        """Run K-means with ``n_clusters`` and label every page.

        Returns:
            ``self``, to allow chaining.
        """
        if not hasattr(self, "matrix"):
            self.create_embedding_matrix()
        kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42, n_init=10)
        kmeans.fit(self.matrix)
        self._assign_labels(kmeans.labels_)
        return self

    def cluster_optimized(self, max_clusters: int = 10):
        """Choose the cluster count (2..max_clusters) with the best silhouette score.

        Returns:
            ``self``, with every page labelled using the winning clustering.

        Raises:
            ValueError: If there are too few pages (or ``max_clusters`` is too
                small) to try at least two clusters.
        """
        if not hasattr(self, "matrix"):
            self.create_embedding_matrix()
        # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
        upper = min(max_clusters, self.matrix.shape[0] - 1)
        if upper < 2:
            raise ValueError(
                "cluster_optimized needs max_clusters >= 2 and at least 3 samples; "
                f"got max_clusters={max_clusters}, n_samples={self.matrix.shape[0]}."
            )
        best_score = -1
        best_labels = None
        for n_clusters in range(2, upper + 1):
            kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42, n_init=10)
            kmeans.fit(self.matrix)
            labels = kmeans.labels_
            score = silhouette_score(self.matrix, labels)
            if best_labels is None or score > best_score:
                best_score = score
                best_labels = labels
        # The fit is seeded, so reusing the winning labels matches a re-fit.
        self._assign_labels(best_labels)
        return self


In [64]:
import re

def sanitize_text(text):
    """Strip punctuation and normalise whitespace.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace becomes a space; runs of whitespace are then
    collapsed to a single space and the ends are trimmed.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# MAIN

In [5]:
# Load the source PDF and embed it page by page. upload_doc returns one dict
# per page with 'page', 'content', 'metadata' and 'embedding' keys.
fpath = "../data/powers2017.pdf"
mongodoc = upload_doc(fpath)

In [65]:
# Clean each page's text and join all pages into one space-separated string.
page_text = ' '.join(sanitize_text(page['content']) for page in mongodoc)

In [71]:
# Size of each chunk in tokens, defined once instead of repeating the literal.
# NOTE(review): presumably chosen to leave room for the prompt and the reply
# inside the model's context window — confirm.
CHUNK_TOKENS = 2800

enc = tiktoken.encoding_for_model("text-davinci-003")
tokens = enc.encode(page_text)
# split the token stream into consecutive windows of at most CHUNK_TOKENS tokens
chunks = [tokens[i:i+CHUNK_TOKENS] for i in range(0, len(tokens), CHUNK_TOKENS)]
# decode each window back into text
decoded = [enc.decode(chunk) for chunk in chunks]

In [132]:
class Completion:
    """Callable wrapper around ``openai.Completion.create`` with fixed settings."""

    def __init__(self, temperature, max_tokens, stream=False, model="text-davinci-003", **kwargs):
        """Remember the request settings.

        Args:
            temperature: Sampling temperature sent with every request.
            max_tokens: Completion length limit sent with every request.
            stream: Whether to request a streamed response.
            model: Completion model name.
            **kwargs: Extra keyword arguments forwarded to the API call.
        """
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.stream = stream
        self.model = model
        self.kwargs = kwargs

        # Only override the globally configured key when the variable is set;
        # otherwise a key configured earlier would be replaced with None.
        api_key = os.getenv("OPENAI_API_KEY")
        if api_key:
            openai.api_key = api_key

    def __call__(self, text):
        """Send ``text`` as the prompt.

        Returns:
            The raw API response when ``stream`` is true, otherwise the
            stripped text of the first choice.
        """
        raw_response = openai.Completion.create(
            model=self.model,
            prompt=text,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            # Forward the flag so that stream=True actually requests streaming.
            stream=self.stream,
            **self.kwargs
        )
        if self.stream:
            return raw_response
        return raw_response['choices'][0]['text'].strip()


class Chat:
    """Stateful wrapper around ``openai.ChatCompletion.create``.

    ``self.messages`` holds the running conversation as role/content dicts and
    is sent in full with every call.
    """

    def __init__(self, temperature, system_message="You are a helpful assistant.", messages=None, model='gpt-3.5-turbo'):
        """Start a conversation.

        Args:
            temperature: Sampling temperature sent with every request.
            system_message: Content of the leading system message.
            messages: Optional strings, each added as a prior user message.
            model: Chat model name.
        """
        self.messages = [{"role": "system", "content": system_message}]
        if messages is not None:
            self.messages += [{"role": "user", "content": message} for message in messages]
        self.model = model
        self.temperature = temperature

    def __call__(self, user_message: str):
        """Send ``user_message`` and return the assistant's stripped reply.

        The user turn and the reply are both recorded in ``self.messages``.
        If the request raises, the user turn is removed again so a retry does
        not send it twice.
        """
        self.messages.append({"role": "user", "content": user_message})
        try:
            raw_response = openai.ChatCompletion.create(
                model=self.model,
                messages=self.messages,
                temperature=self.temperature,
            )
        except Exception:
            self.messages.pop()
            raise
        response_message = raw_response['choices'][0]['message']['content'].strip()
        # History entries must be role/content dicts; a bare string here would
        # corrupt the messages list sent on the next call.
        self.messages.append({"role": "assistant", "content": response_message})
        return response_message

In [None]:
# Smoke test: open a chat with the default system message and model, then
# print the reply to a single prompt.
chat = Chat(temperature=0.9)
print(chat("Write a poem about a dog."))

# Get Citation

In [168]:
# Gather the metadata dict of every page; the whole list is interpolated into
# the prompt below.
meta_list = [page['metadata'] for page in mongodoc]

# A fresh Chat keeps the citation request separate from other conversations.
citation_chat = Chat(temperature=0.9)
get_citation_prompt = f"Use this metadata to generate a citation in APA format: \n\n{meta_list}"
final_citation = citation_chat(get_citation_prompt)

In [170]:
# Show the citation string returned by citation_chat in the previous cell.
final_citation

'Powers, M. B. (2017). Core Mechanisms of Cognitive Behavioral Therapy for Anxiety and Depression. Psychiatric Clinics of North America, 40, 611-623. doi:10.1016/j.psc.2017.08.010.'

# Get intermediate data

In [175]:
# Sentence appended to every query so the model cites with the generated citation.
citation_qualifier = f"Use this citation: {final_citation} to cite your work."

# One query per kind of intermediate material to extract from each chunk.
main_ideas_prompt = f"Identify and list 2-3 main ideas from the context. {citation_qualifier}"
quotes_prompt = f"Identify and list 2-3 relevant quotes from the context. {citation_qualifier}"
passages_prompt = f"Identify and list 2-3 relevant passages from the context. {citation_qualifier}"

# The order of this list fixes the order of the responses stored per chunk.
prompts = [
    {"type": "main_ideas", "prompt": main_ideas_prompt},
    {"type": "quotes", "prompt": quotes_prompt},
    {"type": "passages", "prompt": passages_prompt}
]

# Only the first two decoded chunks are processed here.
context = decoded[:2]

system_message = "You are a helpful assistant that is very good at problem solving who thinks step by step. You always cite direct quotes and paraphrases with the appropriate in-text citation."

# responses ends up as a list (one entry per chunk) of lists (one dict per
# prompt, in the order of `prompts`).
responses = []
for idx, chunk in enumerate(context):
    print(f"Chunk {idx+1} of {len(context)}")
    page_responses = []
    for prompt in prompts:
        print(prompt["type"])
        # A new Chat per query, so no history carries over between queries.
        chat = Chat(temperature=0.9, system_message=system_message)
        response = chat(f"CONTEXT:{chunk}\n\nQUERY:{prompt['prompt']}")
        page_responses.append(
            {"Chunk": idx+1, "prompt_type": prompt["type"], "response": response})
    responses.append(page_responses)

Chunk 1 of 2
main_ideas
quotes
passages
Chunk 2 of 2
main_ideas
quotes
passages


In [176]:
# Print every stored response, labelled with its chunk number and prompt type.
for page in responses:
    for response in page:
        print(f'Chunk: {response["Chunk"]}\nType: {response["prompt_type"]}\n\nResponse:\n{response["response"]}')

Chunk: 1
Type: main_ideas

Response:
1. The article discusses the core mechanisms of Cognitive Behavioral Therapy (CBT) for anxiety and depression, including fear extinction and cognitive change through cognitive restructuring and behavioral activation.

2. The authors emphasize the importance of identifying treatment mediators to improve interventions and suggest seven criteria for establishing a causal relationship between the independent variable, the mediator, and the dependent variable.

3. The study also presents research on fear extinction and threat reappraisal as core mechanisms of change in CBT for anxiety disorders, with fear extinction primarily targeting behavior and inhibitory learning, and cognitive change targeting cognitive misappraisal activity.
Chunk: 1
Type: quotes

Response:
1. "Core mechanisms of CBT include the specific psychological factors responsible for symptom improvement with therapy." (Powers, 2017, p. 611)
2. "The authors first briefly define CBT and medi

# Summarize intermediate data

In [179]:
# Completion model used for the per-chunk summaries (and reused by the essay
# cell further down).
llm = Completion(temperature=0.9, max_tokens=1000)
summary_responses = []
for page in responses:
    # page[0], page[1] and page[2] are read as the main_ideas, quotes and
    # passages responses; this relies on the order of `prompts` in the cell
    # that built `responses`.
    combine_prompt = f"Combine the following Main Ideas:\n{page[0]['response']}\n\nQuotes:\n{page[1]['response']}\n\nPassages:\n{page[2]['response']}\n\ninto a short summary. Make sure to include appropriate citations as needed.\n\nSUMMARY:"
    summary_response = llm(combine_prompt)
    summary_responses.append(summary_response)

In [180]:
# Print the summary generated for each chunk, in chunk order.
for response in summary_responses:
    print(response)

Cognitive Behavioral Therapy (CBT) for anxiety and depression is a form of psychotherapy that focuses on changing thought patterns and behaviors that are contributing to mental health issues. The article discusses the core mechanisms of CBT, including fear extinction and cognitive change through cognitive restructuring and behavioral activation, and puts emphasis on the importance of identifying treatment mediators to improve interventions (Powers, 2017, p. 611). Research presented in the study suggests that the core mechanisms of change in CBT for anxiety disorders are fear extinction, which targets behavior and inhibitory learning, and cognitive change, which targets cognitive misappraisals (Powers, 2017, p. 620). Threat reappraisal is proposed as a mediator for the efficacy of CBT in anxiety disorders and is associated with symptom reduction, however, there is less evidence that threat reappraisal causes symptom reduction in cases of OCD (Powers, 2017, p. 623).
According to Powers (

# Combine summaries into essay

In [183]:
# Join the per-chunk summaries with spaces, ask the completion model to merge
# them into one essay with a reference list, and print the result.
print(
    llm(
        "combine the following passages:"
        + ' '.join(summary_responses)
        + " into an essay. Retain your in-text citations and make sure to include a reference list at the end of your essay using this citation: "
        + f"{final_citation}"
        + "."
    )
)

Cognitive Behavioral Therapy (CBT) is a form of psychotherapy that focuses on changing thought patterns and behaviors that are contributing to mental health issues (Powers, 2017, p. 611). The core mechanisms of CBT that are discussed in Powers' study (2017) include fear extinction, cognitive change through cognitive restructuring and behavioral activation, and identifying treatment mediators to improve interventions (p.620-621). In particular, research suggests that threat reappraisal is proposed as a mediator for the efficacy of CBT in anxiety disorders and is associated with symptom reduction (Powers, 2017, p. 623). This process also plays an essential role in achieving treatment goals for depression, with cognitive change being a significant mediator for cognitive restructuring and behavioral activation therapies (Powers, 2017). Furthermore, both the limbic system and prefrontal cortex are intricately involved in the process of change for both anxiety and depression, and fear extinc