In [None]:
from __future__ import print_function
import os, base64
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

import faiss
import numpy as np
import tiktoken
from openai import OpenAI
from dotenv import load_dotenv

# ---------------------------
# Setup
# ---------------------------

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ---------------------------
# Gmail fetcher
# ---------------------------
def get_gmail_service():
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('creds.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return build('gmail', 'v1', credentials=creds)

def fetch_emails_from_senders(senders, max_per_sender=10):
    service = get_gmail_service()
    all_emails = []

    for sender in senders:
        query = f'from:{sender}'
        results = service.users().messages().list(
            userId='me', q=query, maxResults=max_per_sender
        ).execute()
        messages = results.get('messages', [])
        print(f"Fetched {len(messages)} messages from {sender}")

        for msg in messages:
            txt = service.users().messages().get(userId='me', id=msg['id']).execute()
            payload = txt['payload']
            headers = payload.get("headers")

            subject, from_email, date = "", "", ""
            for d in headers:
                if d['name'] == 'Subject': subject = d['value']
                if d['name'] == 'From': from_email = d['value']
                if d['name'] == 'Date': date = d['value']

            body = ""
            if 'parts' in payload:
                for part in payload['parts']:
                    if part['mimeType'] == 'text/plain':
                        data = part['body'].get('data')
                        if data:
                            body = base64.urlsafe_b64decode(data).decode('utf-8')
                    elif part['mimeType'] == 'text/html' and not body:
                        data = part['body'].get('data')
                        if data:
                            html = base64.urlsafe_b64decode(data).decode('utf-8')
                            body = BeautifulSoup(html, "html.parser").get_text()
            else:
                data = payload['body'].get('data')
                if data:
                    body = base64.urlsafe_b64decode(data).decode('utf-8')

            all_emails.append({
                "id": msg['id'],
                "from": from_email,
                "subject": subject,
                "date": date,
                "body": body.strip()
            })
    return all_emails

# ---------------------------
# Chunking + Embeddings + FAISS
# ---------------------------
def chunk_text(text, max_tokens=300, overlap=50):
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = enc.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap
    return chunks

def get_embedding(text, model="text-embedding-3-small"):
    resp = client.embeddings.create(model=model, input=text)
    return np.array(resp.data[0].embedding, dtype=np.float32)

def build_faiss_index(emails):
    dim = 1536
    index = faiss.IndexFlatL2(dim)
    metadata = []
    vectors = []

    for e in emails:
        chunks = chunk_text(e["body"])
        for i, ch in enumerate(chunks):
            emb = get_embedding(ch)
            vectors.append(emb)
            metadata.append({
                "id": f'{e["id"]}_chunk{i}',
                "from": e["from"],
                "subject": e["subject"],
                "date": e["date"],
                "text": ch
            })

    vectors = np.vstack(vectors)
    index.add(vectors)
    return index, metadata

def search_emails(query, index, metadata, k=3):
    q_emb = get_embedding(query)
    D, I = index.search(np.array([q_emb]), k)
    results = [metadata[i] for i in I[0]]
    return results

# ---------------------------
# LLM Answering
# ---------------------------
def answer_query(query, index, metadata, k=3, model="gpt-4o-mini"):
    results = search_emails(query, index, metadata, k=k)

    context = ""
    for r in results:
        context += f"From: {r['from']}\nSubject: {r['subject']}\nDate: {r['date']}\nText: {r['text']}\n\n"

    prompt = f"""
    Use the following email excerpts to answer the question.
    If the answer isn't found, say "Not found in emails."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": "You are an assistant that answers based only on emails."},
                  {"role": "user", "content": prompt}]
    )
    return resp.choices[0].message.content


Fetched 5 messages from nyu536@nyu.edu

‚ùì Query: Research events around LLM this week
üí° Answer: Not found in emails.


In [10]:
def answer_query(query, index, metadata, k=3, model="gpt-4o-mini"):
    results = search_emails(query, index, metadata, k=k)

    # Build context from retrieved chunks
    context = ""
    for r in results:
        context += f"From: {r['from']}\nSubject: {r['subject']}\nDate: {r['date']}\nText: {r['text']}\n\n"

    prompt = f"""
    Use the following email excerpts to answer the question.
    If the answer isn't found, say "Not found in emails."

    Context:
    {context}

    Question: {query}
    Answer:
    """

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an assistant that answers based only on emails."},
            {"role": "user", "content": prompt}
        ]
    )
    return resp.choices[0].message.content


In [11]:
if __name__ == '__main__':
    senders = ["nyu536@nyu.edu"]
    emails = fetch_emails_from_senders(senders, max_per_sender=5)

    index, metadata = build_faiss_index(emails)

    while True:
        query = input("\n‚ùì Ask a question about your emails (or type 'exit'): ")
        if query.lower() in ["exit", "quit"]:
            break
        answer = answer_query(query, index, metadata, k=3)
        print("\nüí° Answer:", answer)

Fetched 5 messages from nyu536@nyu.edu


KeyboardInterrupt: Interrupted by user