In [1]:
import faiss
import openai
import numpy as np
import tiktoken


In [None]:
from __future__ import print_function
import os.path
import base64
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def get_gmail_service():
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('creds.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return build('gmail', 'v1', credentials=creds)

def fetch_emails_from_senders(senders, max_per_sender=20):
    """Fetch emails from a list of senders (max n per sender)."""
    service = get_gmail_service()
    all_emails = []

    for sender in senders:
        query = f'from:{sender}'
        results = service.users().messages().list(userId='me', q=query, maxResults=max_per_sender).execute()
        messages = results.get('messages', [])
        print(f"Fetched {len(messages)} messages from {sender}")

        for msg in messages:
            txt = service.users().messages().get(userId='me', id=msg['id']).execute()
            payload = txt['payload']
            headers = payload.get("headers")

            subject, from_email, date = "", "", ""
            for d in headers:
                if d['name'] == 'Subject': subject = d['value']
                if d['name'] == 'From': from_email = d['value']
                if d['name'] == 'Date': date = d['value']

            body = ""
            if 'parts' in payload:
                for part in payload['parts']:
                    if part['mimeType'] == 'text/plain':
                        data = part['body'].get('data')
                        if data:
                            body = base64.urlsafe_b64decode(data).decode('utf-8')
                    elif part['mimeType'] == 'text/html' and not body:
                        data = part['body'].get('data')
                        if data:
                            html = base64.urlsafe_b64decode(data).decode('utf-8')
                            body = BeautifulSoup(html, "html.parser").get_text()
            else:
                data = payload['body'].get('data')
                if data:
                    body = base64.urlsafe_b64decode(data).decode('utf-8')

            all_emails.append({
                "id": msg['id'],
                "from": from_email,
                "subject": subject,
                "date": date,
                "body": body.strip()
            })

    return all_emails




Fetched 3 messages from nyu536@nyu.edu
From: NYU Center for Data Science <nyu536@nyu.edu>
Date: Tue, 30 Sep 2025 14:59:17 -0000
Subject: CDS Weekly | Volume 6, Issue 5
Body:
NYU



### CDS WEEKLY | VOLUME 6, ISSUE 5 | SEPTEMBER 30, 2025



Important Dates & Reminders:

  * Post-Completion OPT Guidelines for International Students:

    *
Information and instructions on theÂ OPTÂ process (https://t.e2ma.net/click/5estrh/dt03qtsf/9t26dz). Please note that for t...

From: NYU Center for Data Science <nyu536@nyu.edu>
Date: Tue, 23 Sep 2025 14:30:49 -0000
Subject: CDS Weekly | Volume 6, Issue 4
Body:
NYU



### CDS WEEKLY | VOLUME 6, ISSUE 4 | SEPTEMBER 23, 2025



Important Dates & Reminders:

  * Post-Completion OPT Guidelines for International Students:

    *
Information and instructions on theÂ OPTÂ process (https://t.e2ma.net/click/1n8crh/dt03qtsf/9l0jcz). Please note that for t...

From: NYU Center for Data Science <nyu536@nyu.edu>
Date: Tue, 16 Sep 2025 14:25:51 -0000
Subject: CDS W

In [13]:
import faiss
import openai
import numpy as np
import tiktoken
import os

# Load API key from env
openai.api_key = os.getenv("api_key")

# ---- 1. Chunking ----
def chunk_text(text, max_tokens=300, overlap=50):
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = enc.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap
    return chunks

# ---- 2. Embedding ----
def get_embedding(text, model="text-embedding-3-small"):
    resp = openai.Embedding.create(model=model, input=text)
    return np.array(resp['data'][0]['embedding'], dtype=np.float32)

# ---- 3. Build Index ----
def build_faiss_index(emails):
    dim = 1536  # embedding size for text-embedding-3-small
    index = faiss.IndexFlatL2(dim)
    metadata = []
    vectors = []

    for e in emails:
        chunks = chunk_text(e["body"])
        for i, ch in enumerate(chunks):
            emb = get_embedding(ch)
            vectors.append(emb)
            metadata.append({
                "id": f'{e["id"]}_chunk{i}',
                "from": e["from"],
                "subject": e["subject"],
                "date": e["date"],
                "text": ch
            })

    vectors = np.vstack(vectors)
    index.add(vectors)
    return index, metadata

# ---- 4. Search ----
def search_emails(query, index, metadata, k=3):
    q_emb = get_embedding(query)
    D, I = index.search(np.array([q_emb]), k)
    results = [metadata[i] for i in I[0]]
    return results


In [14]:
if __name__ == '__main__':
    senders = ["nyu536@nyu.edu"]
    emails = fetch_emails_from_senders(senders, max_per_sender=10)

    # Build index
    index, metadata = build_faiss_index(emails)

    # Ask a query
    query = "Did John confirm the meeting time?"
    results = search_emails(query, index, metadata, k=3)

    print("\nðŸ”Ž Retrieved Relevant Chunks:\n")
    for r in results:
        print("="*60)
        print(f"From: {r['from']}")
        print(f"Subject: {r['subject']}")
        print(f"Date: {r['date']}")
        print(f"Text: {r['text'][:300]}...\n")


Fetched 5 messages from nyu536@nyu.edu


APIRemovedInV1: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
