In [None]:
!pip install pdfplumber

In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [2]:
# imports for email reading

import imaplib
import email
from email.header import decode_header
import pdfplumber
import io
from openai import OpenAI
import requests

In [3]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [4]:
# Chat model used by the RAG conversation chain
MODEL = "gpt-4o-mini"
# Speech-to-text (transcription) model name
AUDIO_TRANSCRIBE_MODEL = "whisper-1"
# Text-to-speech model passed to the OpenAI speech endpoint
TEXT_TO_AUDIO_MODEL = "gpt-4o-mini-tts"
# Directory in which the Chroma vector store is persisted
db_name = "assistant_vector_db"

In [5]:
# Load environment variables from a file called .env (if one exists)

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Copy the provider-specific YAHOO_* settings into provider-neutral EMAIL_* names.
# Each entry is: target variable -> (source variable, placeholder used when the source is unset)
_EMAIL_ENV_MAPPING = {
    'EMAIL_CONNECT_APP_NAME': ('YAHOO_CONNECT_APP_NAME', 'email-app-name-if-not-using-env'),
    'EMAIL_CONNECT_APP_PASSWORD': ('YAHOO_CONNECT_APP_PASSWORD', 'email-app-password-if-not-using-env'),
    'EMAIL_CONNECT_IMAP_SERVER': ('YAHOO_CONNECT_IMAP_SERVER', 'email-imap-server-if-not-using-env'),
    # The placeholder below used to be a copy of the app-password placeholder
    'EMAIL_CONNECT_ACCOUNT': ('YAHOO_CONNECT_EMAIL_ACCOUNT', 'email-account-if-not-using-env'),
}
for _target_var, (_source_var, _placeholder) in _EMAIL_ENV_MAPPING.items():
    os.environ[_target_var] = os.getenv(_source_var, _placeholder)


In [6]:
# Sanity check: echo the (non-secret) mail settings picked up from the environment
for env_key in ('EMAIL_CONNECT_IMAP_SERVER', 'EMAIL_CONNECT_ACCOUNT'):
    print(os.environ[env_key])

IMAP Mail Server
Your Email Address


In [7]:
# Read in the knowledge-base documents using LangChain's loaders
# Take everything in all the sub-folders of the knowledgebase; the sub-folder name becomes the doc_type

# Keep directories only: a stray file at the top level is not a valid DirectoryLoader root.
# Sorting makes the load order independent of the order in which the OS lists the folders.
folders = sorted(
    path for path in glob.glob("assistant-knowledge-base/*") if os.path.isdir(path)
)

text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        # Tag every document with the folder it came from so it can be filtered/grouped later
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [8]:
# Inspect the first loaded document to confirm its content and metadata
documents[0]

Document(metadata={'source': 'assistant-knowledge-base\\family\\Andy Tapaswi.md', 'doc_type': 'family'}, page_content='# Family Record\n\n# Andy Tapaswi\n\n## Summary\n- **Date of Birth:** December 4, 1981\n- **Relationship:** Father  \n- **Profile:** Software Professional\n- **Company:** MITRE')

In [9]:
def _decode_mime_header(raw_value) -> str:
    """Return a header value with RFC 2047 encoded-words (=?utf-8?b?...?=) decoded to text."""
    from email.header import decode_header, make_header

    if not raw_value:
        return ""
    try:
        return str(make_header(decode_header(str(raw_value))))
    except Exception:
        # Unknown charset or malformed encoding: keep the raw header rather than lose it
        return str(raw_value)


def _quote_imap(value: str) -> str:
    """Escape backslashes and double quotes so a value is safe inside an IMAP quoted string."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


def _extract_pdf_text(pdf_bytes: bytes) -> str:
    """Concatenate the extractable text of every page of a PDF supplied as raw bytes."""
    page_texts = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text + "\n")
    return "".join(page_texts)


def _close_mailbox(mail) -> None:
    """Best-effort CLOSE + LOGOUT so the server-side session and the socket are released."""
    if mail is None:
        return
    for command in ("close", "logout"):
        try:
            getattr(mail, command)()
        except Exception:
            # close() is rejected when no folder is selected, and the connection may
            # already be gone; neither should mask the caller's real result.
            pass


def _email_to_documents(mail, eid, category: str) -> list:
    """Fetch one message and turn its plain-text body and PDF attachments into Documents."""
    status, msg_data = mail.fetch(eid, "(RFC822)")
    if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
        raise ValueError(f"fetch returned status {status} without a message payload")
    msg = email.message_from_bytes(msg_data[0][1])

    email_subject = _decode_mime_header(msg["subject"])
    email_from = _decode_mime_header(msg["from"])
    email_date = msg["date"] or ""

    body_chunks = []
    pdf_chunks = []
    for part in msg.walk():
        if part.is_multipart():
            # Containers carry no payload of their own; their children are visited by walk()
            continue

        content_type = part.get_content_type()
        content_disposition = str(part.get("Content-Disposition") or "").lower()

        # Plain-text body (text attachments are deliberately not treated as body)
        if content_type == "text/plain" and "attachment" not in content_disposition:
            payload = part.get_payload(decode=True)
            if payload:
                charset = part.get_content_charset() or "utf-8"
                try:
                    body_chunks.append(payload.decode(charset, errors="ignore"))
                except LookupError:
                    # The message declares a charset Python does not know
                    body_chunks.append(payload.decode("utf-8", errors="ignore"))
            continue

        # PDF attachment; some mail clients label PDFs as generic binary data
        filename = (part.get_filename() or "").lower()
        is_pdf = "application/pdf" in content_type or (
            content_type == "application/octet-stream" and filename.endswith(".pdf")
        )
        if is_pdf:
            file_data = part.get_payload(decode=True)
            if not file_data:
                continue
            try:
                pdf_chunks.append(_extract_pdf_text(file_data))
            except Exception as e:
                # One unreadable attachment must not discard the body or other attachments
                print(f"⚠️ Could not read PDF attachment in email ID {eid}: {e}")

    results = []
    for part_name, text in (("body", "".join(body_chunks)), ("attachment", "".join(pdf_chunks))):
        if text.strip():
            results.append(
                Document(
                    page_content=text.strip(),
                    metadata={
                        "source": email_from,
                        "subject": email_subject,
                        "date": email_date,
                        "category": category,
                        "part": part_name,
                        "doc_type": "email",
                    },
                )
            )
    return results


def fetch_emails_as_documents(
    imap_server: str,
    email_account: str,
    app_password: str,
    search_from: str = None,
    search_subject: str = None,
    since_date: str = None,       # e.g., "01-Jan-2024"
    before_date: str = None,      # e.g., "31-Dec-2024"
    category: str = "General",
    folder: str = "inbox"
) -> list:
    """
    Connects to an IMAP mailbox, searches emails by optional FROM, SUBJECT,
    and DATE filters, extracts body + PDF attachments, and returns a flat
    list of Document objects (same structure as family records).

    Args:
        imap_server: Host name of the IMAP-over-SSL server.
        email_account: Login name for the mailbox.
        app_password: Password (app password) for the mailbox.
        search_from: Optional FROM filter.
        search_subject: Optional SUBJECT filter.
        since_date: Optional SINCE filter, e.g. "01-Jan-2024".
        before_date: Optional BEFORE filter, e.g. "31-Dec-2024".
        category: Label stored in each Document's metadata["category"].
        folder: Mailbox folder to select.

    Returns:
        A list with up to two Documents per email: one with metadata["part"] ==
        "body" (plain-text parts) and one with "attachment" (text of all PDFs).
        Subject and sender are stored decoded. On connection, login or search
        failure the problem is printed and the documents gathered so far
        (possibly none) are returned; nothing is raised. The connection is
        always closed before returning.
    """

    documents = []

    # --- Connect to IMAP ---
    mail = None
    try:
        mail = imaplib.IMAP4_SSL(imap_server)
        mail.login(email_account, app_password)
        mail.select(folder)
    except Exception as e:
        print(f"❌ IMAP connection/login failed: {e}")
        _close_mailbox(mail)
        return documents

    try:
        # --- Build search query dynamically ---
        query_parts = []
        if search_from:
            query_parts.append(f'(FROM "{_quote_imap(search_from)}")')
        if search_subject:
            query_parts.append(f'(SUBJECT "{_quote_imap(search_subject)}")')
        if since_date:
            query_parts.append(f'(SINCE "{_quote_imap(since_date)}")')
        if before_date:
            query_parts.append(f'(BEFORE "{_quote_imap(before_date)}")')

        query = " ".join(query_parts) if query_parts else "ALL"

        # --- Perform IMAP search ---
        try:
            status, messages = mail.search(None, query)
            if status != "OK":
                raise imaplib.IMAP4.error(f"server replied {status}")
            email_ids = messages[0].split() if messages and messages[0] else []
            print(f"🔍 Found {len(email_ids)} email(s) using query: {query}")
        except Exception as e:
            print(f"❌ IMAP search failed: {e}")
            return documents

        if not email_ids:
            print("No matching emails found.")
            return documents

        # --- Process each email; a failure affects only that one message ---
        for eid in email_ids:
            try:
                documents.extend(_email_to_documents(mail, eid, category))
            except Exception as e:
                print(f"⚠️ Error processing email ID {eid}: {e}")
                continue
    finally:
        _close_mailbox(mail)

    print(f"✅ Added {len(documents)} Document objects (emails + attachments).")
    return documents


In [10]:
# Assuming environment vars already set
IMAP_SERVER = os.environ['EMAIL_CONNECT_IMAP_SERVER']
EMAIL_ACCOUNT = os.environ['EMAIL_CONNECT_ACCOUNT']
APP_PASSWORD = os.environ['EMAIL_CONNECT_APP_PASSWORD']

# One entry per mailbox search — any combination of filters accepted by
# fetch_emails_as_documents. Add or remove entries instead of copy-pasting calls.
EMAIL_SEARCHES = [
    {
        "search_from": "<Enter From Email Address>",
        "search_subject": "<ENter Specific Subject>",
        "since_date": "01-Jan-2024",
        "category": "<Category - for me it was Healthcare>",
    },
    {
        "search_from": "<enter your Email Address>",
        "since_date": "01-Jan-2024",
        "category": "<Category, for me it was School>",
    },
]

# Make the cell safe to re-run: drop email records merged by a previous execution
# (they carry both doc_type == "email" and a "part" key) so they are not duplicated.
documents = [
    doc for doc in documents
    if not (doc.metadata.get("doc_type") == "email" and "part" in doc.metadata)
]

# Existing global list (already contains family data)
print(f"Initial records in documents: {len(documents)}")

for search_filters in EMAIL_SEARCHES:
    email_docs = fetch_emails_as_documents(
        imap_server=IMAP_SERVER,
        email_account=EMAIL_ACCOUNT,
        app_password=APP_PASSWORD,
        **search_filters,
    )

    # Merge into main list
    documents.extend(email_docs)
    print(f"✅ Total combined documents: {len(documents)}")


Initial records in documents: 2
🔍 Found 1 email(s) using query: (FROM "info@metrowesteye.com") (SUBJECT "EYEGLASS RX") (SINCE "01-Jan-2024")
✅ Added 2 Document objects (emails + attachments).
✅ Total combined documents: 4
🔍 Found 4 email(s) using query: (FROM "KLovely@natickps.org") (SINCE "01-Jan-2024")
✅ Added 4 Document objects (emails + attachments).


In [11]:
# Show one merged record; index 2 is presumably the first email document when two
# knowledge-base files were loaded — TODO confirm for your own data
print(documents[2])

page_content='Hello,

Please see attached for your daughter's eyeglass prescription!

Liz

*Dr. Yen Tran-**Pan**tano, O.D.*
metrowesteye.com
*508-655-1400*
*81 Speen Street*

*Natick, MA 01760*' metadata={'source': 'Metrowest Family Eye Care <info@metrowesteye.com>', 'subject': 'EYEGLASS RX', 'date': 'Mon, 9 Jun 2025 13:58:58 -0400', 'category': 'Vision Care', 'part': 'body', 'doc_type': 'email'}


In [12]:
# Split the documents into overlapping chunks (chunk_size=1000, chunk_overlap=200) before embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [13]:
# Number of chunks produced by the splitter
len(chunks)

8

In [14]:
# Collect the distinct doc_type labels carried by the chunks and report them
doc_types = {piece.metadata['doc_type'] for piece in chunks}
print("Document types found: " + ", ".join(doc_types))

Document types found: family, email


In [15]:
# Embedding model used to associate a vector embedding with each chunk in the vector store

embeddings = OpenAIEmbeddings()

# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers,
# replace embeddings = OpenAIEmbeddings()
# with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
# Start from scratch: when a persisted Chroma datastore is already on disk, drop its collection

if os.path.exists(db_name):
    existing_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    existing_store.delete_collection()

In [17]:
# Create our Chroma vectorstore from the chunks, persisted in the db_name directory

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# _collection is a private attribute of the vector store, read here only to report the entry count
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 8 documents


In [18]:
# Pull a single stored vector and report how many dimensions it has

collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [19]:
# Display the sampled embedding vector itself
sample_embedding

array([-0.0066553 , -0.00715228, -0.02628758, ..., -0.02094943,
       -0.01834987, -0.00941821])

In [20]:
# create a new Chat with OpenAI using the model named in MODEL
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat; it is stored under the 'chat_history' key
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the LLM, the vector store retriever and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


In [21]:
# Smoke-test the chain with a sample question; the reply text is under the "answer" key of the result
query = "Who is my daughter's eye Doctor and tell me the left eye measurements for ordering glasses?"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

Apurba's Vision Care Doctor is Yen Tran-Pantano, OD. The left eye measurements for ordering glasses are as follows:

- Sphere: -4.00
- Cyl: -1.00
- Axis: 170


In [22]:
# set up a new conversation memory for the chat, so the chain built below starts with an empty history
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: rebuild the conversation chain with the same LLM and retriever and the fresh memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [23]:
# OpenAI client used directly for the speech-to-text and text-to-speech calls
openai_client = OpenAI()

In [24]:
# Audio to Text
def audio_to_text(audio_path: str) -> str:
    """
    Transcribes an audio file into text using the model named in AUDIO_TRANSCRIBE_MODEL.

    Args:
        audio_path: Path of the recorded audio file; may be None/empty when
            nothing was recorded.

    Returns:
        The transcription with surrounding whitespace removed. Failures are
        not raised: they are returned as a message starting with "⚠️", which
        callers use to detect the error case.
    """
    if not audio_path:
        # No recording was submitted; report it clearly instead of failing inside open()
        return "⚠️ Error during transcription: no audio was provided."

    try:
        with open(audio_path, "rb") as f:
            transcript = openai_client.audio.transcriptions.create(
                model=AUDIO_TRANSCRIBE_MODEL,  # single source of truth, set with the other model names
                file=f,
                response_format="text"
            )
        text = transcript.strip()
        print(f"🎧 Transcribed: {text}")
        return text
    except Exception as e:
        return f"⚠️ Error during transcription: {str(e)}"


In [25]:
def text_to_speech(answer_text: str, output_file="response.mp3", voice: str = "alloy") -> "str | None":
    """
    Converts text to speech using OpenAI TTS (model named in TEXT_TO_AUDIO_MODEL).

    Args:
        answer_text: Text to synthesize.
        output_file: Path the audio is written to; an existing file is overwritten.
        voice: Voice name passed to the speech endpoint (e.g. "alloy", "verse", "coral").

    Returns:
        Path to the generated audio file, or None when there is nothing to
        synthesize or the request/write fails (the error is printed).
    """
    if not answer_text or not answer_text.strip():
        # Don't spend an API call on empty input
        print("TTS Error: no text to synthesize")
        return None

    try:
        speech = openai_client.audio.speech.create(
            model=TEXT_TO_AUDIO_MODEL,
            voice=voice,
            input=answer_text
        )
        with open(output_file, "wb") as f:
            f.write(speech.read())
        return output_file
    except Exception as e:
        print("TTS Error:", e)
        return None


In [26]:
def ask_rag(question: str) -> str:
    """
    Forward a text question to the ConversationalRetrievalChain and hand back its reply.

    The reply is also printed. Any exception is converted into a message
    starting with "⚠️" instead of being raised.
    """
    try:
        chain_output = conversation_chain.invoke({"question": question})
        reply = chain_output.get("answer", "⚠️ No response generated.")
        print(f"💬 Answer: {reply}")
    except Exception as exc:
        return f"⚠️ Error during RAG query: {exc!s}"
    return reply


In [27]:
def voice_to_rag(audio_path: str):
    """
    Full pipeline: audio → text → RAG → speech.

    Args:
        audio_path: Path of the recorded question.

    Returns:
        A (text, audio_file) tuple. text is either an error message starting
        with "⚠️" or the transcript followed by the answer; audio_file is the
        path of the spoken answer, or None when no speech was produced.
    """
    question = audio_to_text(audio_path)
    if question.startswith("⚠️"):
        # audio_to_text reports failures as "⚠️ ..." strings
        return question, None
    if not question.strip():
        # Silence or noise transcribes to nothing; don't query the chain with an empty question
        return "⚠️ No speech was detected in the recording. Please try again.", None

    answer = ask_rag(question)
    response_text = f"🗣️ You asked: {question}\n\n💬 Answer: {answer}"

    # ask_rag also reports failures as "⚠️ ..." strings: show them, but don't
    # make a text-to-speech call just to read an error message aloud
    if answer.startswith("⚠️"):
        return response_text, None

    # Convert answer to speech
    tts_file = text_to_speech(answer)
    return response_text, tts_file


In [28]:
def launch_gradio():
    """Build the voice question-answering UI around voice_to_rag and open it in the browser."""
    question_input = gr.Audio(type="filepath", label="🎙️ Ask your question")
    text_output = gr.Textbox(label="Transcription and Answer")
    speech_output = gr.Audio(label="🔊 Hear the Answer")

    demo = gr.Interface(
        fn=voice_to_rag,
        inputs=question_input,
        outputs=[text_output, speech_output],
        title="Andy's RAG Assistant",
        description="Andy's Personal RAG answers it.",
    )
    demo.launch(inbrowser=True)


In [29]:
# Start the voice assistant UI
launch_gradio()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
