In [106]:
"""
Retrieval-Augmented Generation (RAG) Model
Model: ChatGPTAPI-3.5T
Project: MenoLearn

Authors:
WI & SP25 Update: Andrew Gibson
"""

'\nRetrieval-Augmented Generation (RAG) Model\nModel: ChatGPTAPI-3.5T\nProject: MenoLearn\n\nAuthors:\nWI & SP25 Update: Andrew Gibson\n'

In [107]:
# Forces correct Python version (problem specific to Andrew G's Jupyter Notebooks)
import sys
!{sys.executable} -m pip install openai

# For most devs the following will be sufficient:
# ! pip install openai



In [110]:
# checkout current conda environment
!echo $CONDA_DEFAULT_ENV

ragenvconda


In [None]:
import openai

# Initialize the client with MenoLearn API Key
client = openai.OpenAI(api_key="")


In [115]:
# Source from multiple files
file_paths = ["Intl J Gynecology Obste 2024 Genazzani Counseling in menopausal women.pdf", 
              "JMM The 2020 Menopausal Hormone Therapy Guidelines.pdf", 
              "BMJ Diagnosis and management of menopause- summary of NICE guidance.pdf", 
             "nams 2022 hormone therapy position statement.pdf",
             "Flores 2021 Hormone Therapy in Menopause- Concepts, Controversies, and Approach to Treatment.pdf", 
             "Hill 2016 Hormone Therapy and Other Treatments for Symptoms of Menopause.pdf"]

In [None]:
# When you ingest each file, attach metadata for name and URL.
for path, url in [
    ("Intl J Gynecology Obste 2024 Genazzani Counseling in menopausal women.pdf", "https://obgyn.onlinelibrary.wiley.com/doi/10.1002/ijgo.15278"),
    ("JMM The 2020 Menopausal Hormone Therapy Guidelines.pdf", "https://pmc.ncbi.nlm.nih.gov/articles/PMC7475284/"),
    ("BMJ Diagnosis and management of menopause- summary of NICE guidance.pdf", "https://www.bmj.com/content/351/bmj.h5746/rapid-responses"),
    ("nams 2022 hormone therapy position statement.pdf", "https://menopause.org/wp-content/uploads/professional/nams-2022-hormone-therapy-position-statement.pdf"),
    ("Flores 2021 Hormone Therapy in Menopause- Concepts, Controversies, and Approach to Treatment.pdf", "https://academic.oup.com/edrv/article/42/6/720/6226912"),
    ("Hill 2016 Hormone Therapy and Other Treatments for Symptoms of Menopause.pdf", "https://www.aafp.org/pubs/afp/issues/2016/1201/p884.pdf")
]:
    with open(path, "rb") as f:
        client.vector_stores.files.upload_and_poll(
            vector_store_id=vector_store.id,
            file=f,
            metadata={"source_name": path, "source_url": url}
        )

In [116]:

vector_store = client.vector_stores.create(        # Create vector store
    name="MenoLearn RAG Vector Store",
)
print(vector_store.id)

"""
client.vector_stores.files.upload_and_poll(        # Upload file
    vector_store_id=vector_store.id,
    file=open("RAG Dataset.txt", "rb")
)
"""
for path in file_paths:
    with open(path, "rb") as file:
        client.vector_stores.files.upload_and_poll(
            vector_store_id=vector_store.id,
            file=file
        )

vs_68191e3cd5548191a1d7816d26e75a8a


In [117]:
client.vector_stores.list()

SyncCursorPage[VectorStore](data=[VectorStore(id='vs_68191e3cd5548191a1d7816d26e75a8a', created_at=1746476604, file_counts=FileCounts(cancelled=0, completed=7, failed=0, in_progress=0, total=7), last_active_at=1746476625, metadata={}, name='MenoLearn RAG Vector Store', object='vector_store', status='completed', usage_bytes=1402206, expires_after=None, expires_at=None), VectorStore(id='vs_681907c458a08191b49f951144f03341', created_at=1746470852, file_counts=FileCounts(cancelled=0, completed=3, failed=0, in_progress=0, total=3), last_active_at=1746470864, metadata={}, name='MenoLearn RAG Vector Store', object='vector_store', status='completed', usage_bytes=676625, expires_after=None, expires_at=None), VectorStore(id='vs_681906cf3a9c81918cf843528d36f211', created_at=1746470607, file_counts=FileCounts(cancelled=0, completed=3, failed=0, in_progress=0, total=3), last_active_at=1746470620, metadata={}, name='MenoLearn RAG Vector Store', object='vector_store', status='completed', usage_bytes=

In [118]:
# Send search query to get relevant results
user_query = "How should I manage cramps during perimenopause?"

# Retrieve relevant documents
results = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query=user_query
)

# Manually select the top 2 results
top_results = results.data[:1]

"""
# Extract the text from the retrieved documents
context = "\n\n".join([doc.document.text for doc in results.data])

# Generate a concise answer using the retrieved context
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": "Answer the question concisely in 1-2 sentences and cite only the most relevant 1-2 documents."
        },
        {
            "role": "user",
            "content": f"{user_query}\n\nContext:\n{context}"
        }
    ],
    max_tokens=200  # Limits the length of the generated answer
)
"""

'\n# Extract the text from the retrieved documents\ncontext = "\n\n".join([doc.document.text for doc in results.data])\n\n# Generate a concise answer using the retrieved context\nresponse = client.chat.completions.create(\n    model="gpt-4",\n    messages=[\n        {\n            "role": "system",\n            "content": "Answer the question concisely in 1-2 sentences and cite only the most relevant 1-2 documents."\n        },\n        {\n            "role": "user",\n            "content": f"{user_query}\n\nContext:\n{context}"\n        }\n    ],\n    max_tokens=200  # Limits the length of the generated answer\n)\n'

In [119]:
import textwrap


# Loop through each search result in the page
for i, result in enumerate(top_results, start=1):
    print(f"\n--- Top Result {i} ---")

    # Extract and clean the text content
    for content_item in result.content:
        if hasattr(content_item, 'text'):
            raw_text = content_item.text
            clean_text = ' '.join(raw_text.split())  # Removes extra spaces and newlines
            wrapped_text = textwrap.fill(clean_text, width=100)
            print(wrapped_text)



--- Top Result 1 ---
When choosing a nonhormonal regimen, consideration should be given to use the lowest effective dose
to avoid the unwanted side effect of de- creased libido, as well as potential nausea, constipation,
and dry mouth (255). The following SSRIs (paroxetine, fluox- etine, sertraline) should specifically
be avoided in women taking tamoxifen as an adjuvant therapy in the manage- ment of breast cancer
because they can inhibit tamoxifen’s active metabolite (256, 257). The antiseizure medications
gabapentin and pregabalin also reduce VMS; however, side effects limit their use at high doses
(258-260). Gabapentin and pregabalin both can cause drowsiness and dizziness; pregabalin can also
decrease libido. In a randomized, placebo-controlled phase 2 trial, oxybutynin was also found to be
more effective than placebo (73% vs 26%) at relieving moderate-to-severe VMS, with dry mouth being
the most common side effect (261). As the neurokinin B/neurokinin 3 receptor (NK3R) signaling


In [33]:
"""
# Semantic search, search query
results = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What hormone replacement options are there?",
)
"""

In [34]:
"""
client.vector_stores.retrieve(
    vector_store_id="vs_680fd331d46c81919bfe3f46bbdbd987"
)
"""

VectorStore(id='vs_680fd331d46c81919bfe3f46bbdbd987', created_at=1745867569, file_counts=FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1), last_active_at=1745868218, metadata={}, name='MenoLearn RAG Vector Store', object='vector_store', status='completed', usage_bytes=195889, expires_after=None, expires_at=None)