In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import spacy
import re
from email.parser import BytesParser

In [None]:
!python -m spacy download en_core_web_md

In [None]:
# Load spaCy model
nlp = spacy.load('en_core_web_md')

# Hugging Face checkpoint used for sentence embeddings. Kept in one place so
# the tokenizer and the model can never be loaded from different checkpoints.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Load pre-trained model and tokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [7]:
def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Token vectors from ``last_hidden_state`` are averaged over the token
    dimension, weighted by the tokenizer's attention mask so that padding
    tokens (present when ``text`` is a batch of unequal lengths) do not
    dilute the mean. For a single string the mask is all ones, so the result
    matches a plain ``mean(dim=1)``.

    Args:
        text: A string or a list of strings; forwarded unchanged to ``tokenizer``.

    Returns:
        A tensor with one pooled row per input text. It is computed under
        ``torch.no_grad()`` and therefore carries no autograd graph.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a (degenerate) all-masked row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

In [14]:
from email import policy

In [15]:
# Function to split an email chain into individual messages
def split_email_chain(body):
    """Break an email-chain body into its individual messages.

    Heuristic: every line that begins with "From:" followed by whitespace is
    treated as a separator (the separator line itself is discarded). This
    will not handle every real-world chain format.

    Args:
        body: The full text of the email chain.

    Returns:
        A list of the non-empty, whitespace-stripped text chunks found
        between separator lines, in their original order.
    """
    separator = re.compile(r'(?m)^From:\s.*$')
    messages = []
    for chunk in separator.split(body):
        cleaned = chunk.strip()
        # Skip chunks that are empty or whitespace-only after stripping.
        if cleaned:
            messages.append(cleaned)
    return messages

In [28]:
# Function to parse emails and find semantically similar content
def semantic_email_search(eml_file_path, query, similarity_threshold=0.7):
    """Print the messages of an .eml email chain that are similar to ``query``.

    The file is parsed, its body (plain text preferred, HTML as fallback) is
    split into individual messages with ``split_email_chain``, and each
    message is embedded with ``get_embedding`` and compared to the query
    embedding by cosine similarity.

    Args:
        eml_file_path: Path to the .eml file to search.
        query: Text to compare each message against.
        similarity_threshold: A message is reported as a match when its
            cosine similarity to the query is strictly greater than this.

    Returns:
        None. The score of every message is printed; for matches, a summary
        line and the first 200 characters of the message are printed as well.
        If the email has no plain-text or HTML body, a notice is printed and
        nothing is searched.
    """
    # Inference only: no autograd graph is needed for any embedding here.
    with torch.no_grad():
        query_embedding = get_embedding(query).detach()

    # Read and parse the .eml file
    with open(eml_file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Extract message body. get_body() returns None when no part matches the
    # preference list (e.g. an attachment-only message), so guard before
    # calling get_content().
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part is None:
        print(f"No plain-text or HTML body found in {eml_file_path}; nothing to search.")
        return
    body = body_part.get_content()

    # Split the email chain into individual messages
    messages = split_email_chain(body)

    for i, message in enumerate(messages):
        with torch.no_grad():
            # Compute the semantic embedding of the email message
            email_embedding = get_embedding(message)
            # Compute cosine similarity between query and email content
            cos_sim = torch.nn.functional.cosine_similarity(query_embedding, email_embedding)

        # Convert to a Python float once and reuse it below.
        score = cos_sim.item()
        print(score)
        if score > similarity_threshold:
            print(f"Message {i+1} in the chain is semantically similar to the query with a similarity score of {score}")
            print(f"Content: {message[:200]}...")  # Print first 200 chars of the content


In [None]:
# Example usage: search one .eml file for messages related to the query.
eml_file_path = "email-data.eml"
search_query = " WBS code"

# A threshold below the function's 0.7 default is passed here, so more
# messages are reported as matches.
semantic_email_search(eml_file_path, search_query, similarity_threshold=0.3)