In [1]:
!pip install openai==0.28


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import chromadb
from chromadb.config import Settings
from chromadb import Client, Settings
from chromadb.utils import embedding_functions

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.tokenize.treebank import TreebankWordDetokenizer

import string

from datetime import date

from markupsafe import Markup


import ssl
import openai

import logging

# NOTE(review): this swaps the ssl module's default HTTPS context factory for the
# unverified one, so clients that rely on that factory skip certificate
# verification. Presumably a workaround for NLTK data downloads -- confirm it is
# still needed before shipping.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

import os

In [3]:
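# To add a new database:
# 1. Create a new client and collection in the next cell, each under a unique variable name.
# 2. Add a matching branch to getCollection() so requests can select the new collection.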

In [4]:
# Embedding function (OpenAI) passed to every collection opened below.
embedding = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ.get('OPENAI_API_KEY'),
    model_name="text-embedding-ada-002"
)


def _open_collection(db_name):
    """Open the persisted ChromaDB database `db_name` and return (client, collection).

    The database is read from $CHATBOT_PATH/Data/<db_name>, and the collection
    inside it is looked up under the same name.

    Raises:
        RuntimeError: if the CHATBOT_PATH environment variable is not set
            (previously this surfaced as an opaque TypeError on None + str).
    """
    base_path = os.environ.get("CHATBOT_PATH")
    if base_path is None:
        raise RuntimeError(
            "CHATBOT_PATH environment variable is not set; cannot locate the ChromaDB databases."
        )
    client = Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory=base_path + '/Data/' + db_name))
    collection = client.get_collection(db_name, embedding_function=embedding)
    return client, collection


bellevue_client, bellevue_collection = _open_collection('city_bellevue_db')

clyde_hill_client, clyde_hill_collection = _open_collection('clyde_hill_db')

bsd_client, bsd_collection = _open_collection('bsd_db')

# Add more databases here: one _open_collection(...) line per database,
# assigned to uniquely named client/collection variables.

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [5]:
def getCollection(curOrg):
    """Return the ChromaDB collection registered for the organization `curOrg`.

    Matching is by exact, case-sensitive name. An unrecognized name yields
    None, so callers must handle that case themselves.
    """
    selected = None

    if curOrg == "City of Bellevue":
        selected = bellevue_collection
    elif curOrg == "City of Clyde Hill":
        selected = clyde_hill_collection
    elif curOrg == "Bellevue School District":
        selected = bsd_collection
    # Add more databases here (one `elif` per organization).

    return selected

In [6]:
# Initialize logging: records at INFO and above go to chatbot.log,
# each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(
    filename='chatbot.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [7]:
# Punctuation tokens that are never allowed into a generated replacement title.
_TITLE_PUNCTUATION = frozenset([",", ".", "'", "\"", ":", ";", "’"])


def _summarize_title(text, stop_words):
    """Return the three most frequent non-stopword, non-punctuation tokens of `text`, detokenized."""
    clean_text = text.replace("\n", " ").replace("  ", " ")
    filtered_words = [
        word for word in word_tokenize(clean_text)
        if word.lower() not in stop_words and word not in _TITLE_PUNCTUATION
    ]
    summary_words = [word for word, _ in FreqDist(filtered_words).most_common(3)]
    return TreebankWordDetokenizer().detokenize(summary_words)


def _unique_sources(metadatas):
    """Return copies of `metadatas` with exact duplicates dropped, keeping first-seen order."""
    seen = set()
    unique = []
    for metadata in metadatas:
        key = tuple(metadata.items())
        if key not in seen:
            seen.add(key)
            unique.append(dict(metadata))
    return unique


def gptMessage(query, processingHistory, qaHistory, curOrg, websiteTitleSuffix):
    """Answer `query` from documents retrieved out of the collection for `curOrg`.

    Steps:
      1. Append `query` to `processingHistory` and ask gpt-3.5-turbo to
         rewrite/clarify it; the rewritten query is appended as the assistant turn.
      2. Run a ChromaDB query (5 results) with the rewritten query.
      3. Strip `websiteTitleSuffix` from each result title, and replace any
         title that still contains the organization name (a scraping
         placeholder) with "*" + its three most common content words.
      4. Append the rewritten query plus the retrieved text to `qaHistory`,
         ask gpt-3.5-turbo for the answer and append it as the assistant turn.
      5. Log the raw query, the rewritten query and the reply.

    Args:
        query: the user's message.
        processingHistory: chat messages for the query-rewriting conversation;
            mutated in place (two entries appended).
        qaHistory: chat messages for the answering conversation; mutated in
            place (two entries appended).
        curOrg: organization name understood by getCollection().
        websiteTitleSuffix: substring removed from every result title.

    Returns:
        (reply, sources, processingHistory, qaHistory) where `sources` is the
        list of distinct result metadata dicts in the order the query returned
        them (presumably nearest match first -- confirm against ChromaDB).

    Raises:
        ValueError: if `curOrg` has no collection. Raised before any OpenAI
            call is made and before either history is modified.
    """
    collection = getCollection(curOrg)
    if collection is None:
        # Fail early with a clear message instead of an AttributeError on
        # None.query() after an OpenAI request has already been made.
        raise ValueError(f"Unknown organization: {curOrg!r}")

    # Clarify the prompt using a GPT message
    processingHistory.append({"role": "user", "content": query})
    promptProcessing = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=processingHistory)

    preprocessed_query = promptProcessing["choices"][0]["message"]["content"]
    processingHistory.append({"role": "assistant", "content": preprocessed_query})

    # ChromaDB Vector Search
    docs = collection.query(query_texts=[preprocessed_query], n_results=5, include=["documents", "metadatas", "distances"])
    documents = docs['documents'][0]
    metadatas = docs['metadatas'][0]

    # Website titles commonly end with a suffix that can be removed.
    for metadata in metadatas:
        metadata['title'] = metadata['title'].replace(websiteTitleSuffix, "")

    # Some documents, especially PDFs, were given a placeholder title when scraping if a title could not be found.
    # This code attempts to generate a replacement title using the content of the document, denoted with an asterisk (*)
    org_name = curOrg.lower()
    stop_words = None  # built lazily, once, and only if a placeholder title is found
    for document, metadata in zip(documents, metadatas):
        if org_name in metadata['title'].lower():
            if stop_words is None:
                stop_words = set(stopwords.words('english'))
            metadata['title'] = "*" + _summarize_title(document, stop_words)

    # Construct the GPT question from the rewritten query and the retrieved documents
    formatted_docs = "\n\n".join(documents)
    message = preprocessed_query + ". \nText:\n" + formatted_docs

    qaHistory.append({"role": "user", "content": message})

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=qaHistory)
    reply = response["choices"][0]["message"]["content"]
    qaHistory.append({"role": "assistant", "content": reply})

    logging.info(f"User Prompt: {query}")
    logging.info(f"Formatted Prompt: {preprocessed_query}")
    logging.info(f"Response: {reply}")

    # De-duplicate without a set round-trip so the result order is stable.
    sources = _unique_sources(metadatas)
    return reply, sources, processingHistory, qaHistory

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS

# Flask application object that the route below is registered on.
app = Flask(__name__)

# Wrap the app with flask_cors using its default settings.
# NOTE(review): the defaults presumably allow cross-origin requests from any origin on every route -- confirm that is intended.
CORS(app)
def _trim_history(history, max_chars=3500):
    """Drop the oldest exchange (entries 1 and 2) in place until `history` fits in `max_chars`.

    Size is measured as len(str(history)). Entry 0 is always kept (presumably
    the system prompt -- confirm with the client). Trimming stops when fewer
    than three entries remain, so a short but oversized history is returned
    as-is instead of raising IndexError.
    """
    while len(str(history)) > max_chars and len(history) >= 3:
        del history[1:3]
    return history


@app.route('/chroma', methods=['POST'])
def endpoint():
    """Handle POST /chroma: answer a chat message for the selected organization.

    Expected JSON body:
        message (str, required): the user's message.
        messageHistory (list, optional): query-rewriting conversation so far.
        qaHistory (list, optional): answering conversation so far.
        curOrg (str, optional): organization name; defaults to "City of Bellevue".
        websiteTitleSuffix (str, optional): suffix stripped from source titles;
            defaults to "| City Of Bellevue".

    Returns:
        200 with {'response', 'sources', 'messageHistory', 'qaHistory'}, where
        both histories have been trimmed to roughly 3500 characters; or
        400 with {'error': ...} when the body is malformed or the organization
        is unknown.
    """
    # silent=True returns None for a missing or invalid JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    message = data.get('message')
    if not isinstance(message, str) or not message.strip():
        return jsonify({'error': "Field 'message' is required and must be a non-empty string."}), 400

    # A missing key or an explicit JSON null both fall back to the default.
    messageHistory = data.get('messageHistory') or []
    qaHistory = data.get('qaHistory') or []
    if not isinstance(messageHistory, list) or not isinstance(qaHistory, list):
        return jsonify({'error': "'messageHistory' and 'qaHistory' must be lists."}), 400

    curOrg = data.get('curOrg') or "City of Bellevue"
    if getCollection(curOrg) is None:
        return jsonify({'error': f"Unknown organization: {curOrg}"}), 400

    # An empty suffix is a valid choice, so only None triggers the default.
    websiteTitleSuffix = data.get('websiteTitleSuffix')
    if websiteTitleSuffix is None:
        websiteTitleSuffix = "| City Of Bellevue"

    # Process the message and generate a response
    res, sources, messageHistory, qaHistory = gptMessage(message, messageHistory, qaHistory, curOrg, websiteTitleSuffix)

    # Keep the histories returned to the client bounded in size.
    _trim_history(messageHistory)
    _trim_history(qaHistory)

    print(res)

    return jsonify({'response': res, 'sources': sources, 'messageHistory': messageHistory, 'qaHistory': qaHistory})

# Start Flask's built-in server on port 5000 when this code runs as the main module.
# This call blocks until the server is stopped; the server's own startup banner
# recommends a production WSGI server for real deployments.
if __name__ == '__main__':
    app.run(port=5000)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off
Bellevue High School is located at 10416 SE Wolverine Way, Bellevue, WA 98004.
