<a href="https://colab.research.google.com/github/ahmedHanzala/therapistGPT/blob/main/Therapist_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so this runtime can read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<h2>Upload PDF for book</h2>

In [None]:
from google.colab import files

# Keep the upload result around: later cells take the first uploaded name from it.
uploaded = files.upload()

# Iterating the upload result yields the uploaded file names.
for filename in uploaded:
    print(f'Uploaded file: {filename}')

Saving atomic-habbits-pdf.pdf to atomic-habbits-pdf.pdf
Uploaded file: atomic-habbits-pdf.pdf


<h2>Parsing Data</h2>

1. Converting pdf to csv
2. Removing Special Characters
3. Adding embeddings




In [None]:
!pip install pdfplumber
import csv
import pdfplumber
import re

def remove_special_characters(string):
    """Return ``string`` with every disallowed character removed.

    Only ASCII letters, digits, whitespace and the punctuation marks
    ``. , ! ?`` are kept; everything else is deleted (not replaced).
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s\.,!?\n]')
    return disallowed.sub('', string)

def convert_pdf_to_csv(pdf_path, csv_path):
    """Write one CSV row per PDF page: its 1-based page number and cleaned text.

    Args:
        pdf_path: Path of the PDF to read with pdfplumber.
        csv_path: Path of the CSV to create (overwritten if it exists). The
            file gets a 'Page Number', 'Page Text' header row.
    """
    with pdfplumber.open(pdf_path) as pdf, \
            open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, escapechar="\\")
        writer.writerow(['Page Number', 'Page Text'])

        for page_number, page in enumerate(pdf.pages, start=1):
            # Older pdfplumber releases return None for pages with no
            # extractable text (e.g. scanned images); treat that as empty
            # text instead of crashing inside re.sub.
            raw_text = page.extract_text() or ''
            writer.writerow([page_number, remove_special_characters(raw_text)])

# Fail with a clear message instead of a bare StopIteration when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell and select a PDF first.")

# The first uploaded file is used as the source PDF.
file_name= next(iter(uploaded))

# Provide the paths for the PDF and CSV files
pdf_file_path = file_name
csv_file_path = 'dataset.csv'

# Call the function to convert PDF to CSV
convert_pdf_to_csv(pdf_file_path, csv_file_path)

<h3>Visualize the CSV file</h3>

In [None]:
import pandas as pd
# Load the CSV written by the PDF conversion step into a DataFrame.
dataset = pd.read_csv('dataset.csv')
# Show the first rows as the cell output for a quick visual check.
dataset.head()

<h2>Getting embeddings for all pages</h2>

In [None]:
!pip install openai
import openai
EMBEDDINGS_MODEL = "text-embedding-ada-002"
openai.api_key = "YOUR API KEY"

In [None]:
def get_embeddings(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` and return it.

    Newlines in ``text`` are replaced by single spaces before the request.

    Args:
        text: The string to embed.
        model: Name of the embeddings model passed to the API.

    Returns:
        The 'embedding' entry of the first item in the response's 'data' list.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_result = response['data'][0]
    return first_result['embedding']

In [None]:
import time

# max requests per minute allowed = 60, so after every 50 requests we wait for 60s
REQUESTS_PER_BATCH = 50
PAUSE_SECONDS = 60

embeddings = []
run_count = 0
total = len(dataset)
# A single pass over the pages yields exactly one embedding per row. The former
# outer `while run_count <= total` loop re-entered the iteration after the last
# page (embedding the first page a second time) and never terminated for an
# empty dataset.
for run_count, page_text in enumerate(dataset['Page Text'], start=1):
    embeddings.append(get_embeddings(page_text))
    print(f'Progress : {len(embeddings)}/{total}')
    # No pause is needed once the final page has been embedded.
    if run_count < total and run_count % REQUESTS_PER_BATCH == 0:
        time.sleep(PAUSE_SECONDS)


<h2>Adding the embeddings to the CSV file</h2>

We store the embeddings in the CSV file so that we don't have to request them from the API again each time the notebook is run.

In [None]:
import csv
import os
import tempfile
import shutil

csv_file = 'dataset.csv'

# Write the extended rows to a temporary file first and only replace the
# original CSV once every row has been written. The temporary file is opened
# exactly once and closed by the `with` block (no leaked handle).
with open(csv_file, 'r', newline='') as file_in, \
        tempfile.NamedTemporaryFile(mode='w', newline='', delete=False) as temp_file:
    reader = csv.reader(file_in)
    writer = csv.writer(temp_file)

    header = next(reader)
    writer.writerow(header + ['Embeddings'])  # Write the modified header to the temporary file

    # Pair rows with embeddings without consuming the `embeddings` list, so the
    # list stays intact. `embeddings` goes first in zip() so that the reader is
    # not advanced when the embeddings run out.
    for embedding_value, row in zip(embeddings, reader):
        writer.writerow(row + [embedding_value])

    # Any row still left in the reader has no embedding to go with it.
    rows_without_embedding = next(reader, None) is not None

if rows_without_embedding:
    os.remove(temp_file.name)
    raise ValueError("There are fewer embeddings than rows in dataset.csv; re-run the embeddings cell.")

shutil.move(temp_file.name, csv_file)


'dataset.csv'

<h2>Updated CSV file</h2>

In [None]:
# Re-read the CSV so that `dataset` picks up the 'Embeddings' column that was appended to the file.
dataset = pd.read_csv('dataset.csv')
# Show the first rows as the cell output.
dataset.head()

Unnamed: 0,Page Number,Page Text,Embeddings
0,1,Introduction\nMy Story\nO\nN THE FINAL day of ...,"[-0.020569542422890663, -0.0034793440718203783..."
1,2,"Uh. Um. I stalled. Ten seconds passed.\nPatti,...","[-0.01841747760772705, -0.002075987635180354, ..."
2,3,"hospital happy, healthy, and cancer free. And ...","[0.0067032137885689735, -0.003371838014572859,..."
3,4,"get me down, but there were more than a few mo...","[-0.0019427494844421744, -0.008356345817446709..."
4,5,room neat and tidy. These improvements were mi...,"[0.0025397222489118576, 0.007604275830090046, ..."


<h1>Setting up functions
</h1>

In [None]:
import numpy as np
from collections import namedtuple
import ast

def parse_dataset():
    """Bundle the columns of the global ``dataset`` frame into a namedtuple.

    Each cell of the 'Embeddings' column is parsed with ``ast.literal_eval``
    and every item of the parsed sequence is converted to ``float``.

    Returns:
        A namedtuple (type name 'dataset') with the fields:
            page_text_list: values of the 'Page Text' column as a list.
            page_embeddings_list: numpy array built from the parsed embeddings.
            page_numbers_list: values of the 'Page Number' column as a list.
            top_k: ``min(3, number of pages)``.
    """
    ParsedDataset = namedtuple(
        'dataset',
        ['page_text_list', 'page_embeddings_list', 'page_numbers_list', 'top_k'],
    )

    numbers = dataset['Page Number'].tolist()
    texts = dataset['Page Text'].tolist()
    vectors = np.array([
        [float(component) for component in ast.literal_eval(serialized)]
        for serialized in dataset['Embeddings'].tolist()
    ])

    return ParsedDataset(texts, vectors, numbers, min(3, len(texts)))

def cosine_distance(x, y):
    """Return the cosine similarity of two vectors (higher means more similar).

    Despite the name, the result is a similarity score, not a distance. The
    dot product is divided by the product of the vector norms, so the result
    does not depend on the vectors being unit length. For unit-length vectors
    this equals the plain dot product.

    Args:
        x: First vector (any 1-D sequence of numbers).
        y: Second vector, same length as ``x``.

    Returns:
        A float in [-1, 1]; 0.0 if either vector has zero length.
    """
    x_vec = np.asarray(x, dtype=float)
    y_vec = np.asarray(y, dtype=float)
    norm_product = np.linalg.norm(x_vec) * np.linalg.norm(y_vec)
    if norm_product == 0:
        # The angle to a zero vector is undefined; report "no similarity".
        return 0.0
    return float(np.dot(x_vec, y_vec) / norm_product)

def prepare_contexts(dataset):
    """Map each page to its embedding.

    Args:
        dataset: Object exposing ``page_text_list``, ``page_numbers_list`` and
            ``page_embeddings_list``; the three are iterated in parallel.

    Returns:
        A dict keyed by ``(page_text, page_number)`` whose values are the
        corresponding embeddings.
    """
    page_keys = zip(dataset.page_text_list, dataset.page_numbers_list)
    return {
        page_key: embedding
        for page_key, embedding in zip(page_keys, dataset.page_embeddings_list)
    }

def order_document_sections_by_query_similarity(query_embedding, context):
    """Rank the entries of ``context`` against ``query_embedding``.

    Args:
        query_embedding: Embedding of the query.
        context: Mapping from a document key to that document's embedding.

    Returns:
        A list of ``(score, document_key)`` tuples sorted in descending order,
        where ``score`` is ``cosine_distance(query_embedding, embedding)``.
    """
    scored_sections = []
    for doc_index, doc_embedding in context.items():
        score = cosine_distance(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))
    scored_sections.sort(reverse=True)
    return scored_sections


def get_semantic_suggestions(prompt):
    """Return the pages that rank highest for ``prompt``.

    The prompt is embedded with ``get_embeddings`` and all pages from
    ``parse_dataset()`` are ranked with
    ``order_document_sections_by_query_similarity``; the first ``top_k``
    entries of that ranking are kept.

    Args:
        prompt: The user's question.

    Returns:
        A list of dicts with the keys 'page_text' and 'page_number', in
        ranking order.
    """
    parsed = parse_dataset()
    prompt_vector = np.array(get_embeddings(prompt), dtype=float)
    ranked_sections = order_document_sections_by_query_similarity(
        prompt_vector,
        prepare_contexts(dataset=parsed),
    )
    return [
        {'page_text': page_text, 'page_number': page_number}
        for _, (page_text, page_number) in ranked_sections[:parsed.top_k]
    ]



<h1>Prompt Engineering</h1>

In [None]:
CHAT_COMPLETIONS_MODEL = "gpt-3.5-turbo-0301"

def complete_chat(prompt_obj):
    """Send one system + user exchange to the chat completions API.

    Args:
        prompt_obj: Dict with the keys 'system' (instructions and context for
            the model) and 'user' (the user's question).

    Returns:
        The response object returned by ``openai.ChatCompletion.create``.
    """
    # The system message goes first so the instructions and context are
    # established before the user's question, as the chat format expects.
    messages = [
        {"role": "system", "content": prompt_obj['system']},
        {"role": "user", "content": prompt_obj['user']},
    ]
    return openai.ChatCompletion.create(
        messages=messages,
        model=CHAT_COMPLETIONS_MODEL,
        temperature=0.8,
    )


In [None]:
import sys
# "*insert text*" is a placeholder that is replaced with the retrieved pages below.
SYSTEM_DEFAULT_PROMPT= "You are a person’s personal therapist. The person is going to ask you different questions about life and how to grow and improve. You will answer the question referencing the context provided to you below in the context section. Stick to the context and have a positive attitude. If the user mentions suicide or other types of self-harm reply with: Please contact suicide prevention helpline: 111-222-333. Context: *insert text* "

user_prompt = input("Ask your personal Therapist anything: ")

# Retrieve the most relevant pages and format them as the context section.
relevent_pages = get_semantic_suggestions(user_prompt)
string = "".join(
    f"Page Number: {pages['page_number']}\n Page Info: {pages['page_text'].strip()}\n"
    for pages in relevent_pages
)
updated_system_prompt = SYSTEM_DEFAULT_PROMPT.replace("*insert text*", string)

prompt_obj = {
    'user': user_prompt,
    'system': updated_system_prompt
}
reply = complete_chat(prompt_obj)
print("Therapist: ", reply['choices'][0]['message']['content'])



Ask your personal Therapist anything: How do i achieve my goals?
Therapist:  To achieve your goals, it's important to focus on developing systems that lead to those goals, rather than just setting specific goals. Goals are about the results you want to achieve, while systems are about the processes that lead to those results. By focusing on developing and sticking to effective systems, you can continuously improve and make progress towards your goals. Additionally, a systems-first mentality can help you find satisfaction and happiness in the process, rather than just in achieving the end result. Remember that goals are just a direction, and it's the systems you develop that will help you achieve them in the long-term.


<h2>Save the CSV to your drive</h2>

In [None]:
import os

drive_folder_path = '/content/drive/MyDrive/therapist-bot'

# Remember whether the folder had to be created so it can be reported below.
drive_folder_created = not os.path.exists(drive_folder_path)
if drive_folder_created:
    os.makedirs(drive_folder_path)

# Move (not copy) the CSV into the Drive folder.
shutil.move('dataset.csv', drive_folder_path + '/dataset.csv')

if drive_folder_created:
    print("Folder therapist-bot created in Google Drive.")
print("CSV file saved in Google Drive folder 'therapist-bot'.")

CSV file saved in Google Drive folder 'therapist-bot'.
