In [1]:
import openai
import email
from email import policy
from email.parser import BytesParser
import re


In [2]:

import os
from dotenv import load_dotenv

In [None]:
# Load variables from a .env file (if one is present) into the environment,
# then give the OpenAI client whatever OPENAI_API_KEY holds (None when unset).
load_dotenv()
Open_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = Open_API_KEY

In [9]:
def get_embedding(text):
    """Return the embedding vector for ``text`` from the OpenAI Embedding endpoint.

    The text is sent as a single-item batch to the ``text-embedding-ada-002``
    engine; the ``embedding`` field of the first entry in the response's
    ``data`` list is returned.
    """
    api_result = openai.Embedding.create(
        engine="text-embedding-ada-002",
        input=[text],
    )
    first_item = api_result['data'][0]
    return first_item['embedding']


In [6]:
def split_email_chain(body):
    """Split an email-chain body into the text segments between ``From:`` lines.

    This is a heuristic and will not work perfectly for all email chains:
    every line that starts with ``From:`` (either bare, or followed by
    whitespace and arbitrary text) is treated as a separator between
    messages. The separator line itself is not included in the output.

    Args:
        body: Full text of the email chain.

    Returns:
        The non-empty segments, each stripped of surrounding whitespace, in
        their original order. An empty or whitespace-only body gives ``[]``.
    """
    # [^\S\n] means "whitespace other than newline". With a plain \s, a bare
    # "From:" line let the newline itself satisfy the whitespace, after which
    # `.*` consumed the entire following line of real content.
    segments = re.split(r'(?m)^From:(?:[^\S\n].*)?$', body)
    # Drop pieces that are empty once surrounding whitespace is removed.
    return [text for text in (segment.strip() for segment in segments) if text]

In [None]:
%pip install spicy

In [13]:
from scipy.spatial.distance import cosine

def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors.

    scipy's ``cosine`` yields the cosine *distance* (one minus the
    similarity), so the similarity is recovered by subtracting that distance
    from 1.
    """
    distance = cosine(embedding1, embedding2)
    return 1 - distance

In [16]:
def semantic_email_search(eml_path, query, similarity_threshold=0.5, preview_chars=200):
    """Print the messages of an .eml email chain that are semantically close to a query.

    The file is parsed, its plain-text (or, failing that, HTML) body is split
    into individual messages with ``split_email_chain``, and each message's
    embedding is compared with the query's embedding.

    Args:
        eml_path: Path of the .eml file to read.
        query: Text to compare each message against.
        similarity_threshold: A message is reported only when its cosine
            similarity to the query is strictly greater than this value.
        preview_chars: Maximum number of characters of a matching message to
            print. ``None`` prints the whole message.

    Returns:
        None. Matches are written to standard output.
    """
    # Read and parse the .eml file
    with open(eml_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # get_body() returns None when the message has neither a plain-text nor an
    # HTML part (for example, an attachment-only email).
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part is None:
        print(f"No plain-text or HTML body found in {eml_path}")
        return
    body = body_part.get_content()

    # Split the email chain into individual messages
    messages = split_email_chain(body)

    # Requested only once the email is known to be readable, so no API call is
    # spent on a file that cannot be searched.
    query_embedding = get_embedding(query)

    for i, message in enumerate(messages):
        # Compute the semantic embedding of the email message
        email_embedding = get_embedding(message)

        # Calculate cosine similarity between query and email content embeddings
        similarity_score = calculate_similarity(query_embedding, email_embedding)
        if similarity_score > similarity_threshold:
            print(f"Message {i+1} in the chain is semantically similar to the query with a similarity score of {similarity_score}")
            # Show only a preview of the content rather than the full message
            print(f"Content: {message[:preview_chars]}...\n")


In [None]:
# Demo run: search a sample email chain for messages related to the query
eml_file_path = "email-data.eml"
search_query = " WBS code"

semantic_email_search(eml_path=eml_file_path, query=search_query)