In [45]:
# imports

import os
import glob
import re
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import time
import random

In [46]:
# Column names for the mailbox CSV export. The four address groups
# (From/To/CC/BCC) each contribute a Name, Address and Type column.
headers = [
    "Subject",
    "Body",
    *(
        f"{party}: ({part})"
        for party in ("From", "To", "CC", "BCC")
        for part in ("Name", "Address", "Type")
    ),
    "Billing Information",
    "Categories",
    "Importance",
    "Mileage",
    "Sensitivity",
]

In [47]:
# Relative path of the CSV file that the loader reads.
file_path = "combined_emails.csv"

In [48]:
# Read the CSV as UTF-8 with explicit column names and load every row as a document.
# NOTE(review): assuming csv_args is forwarded to csv.DictReader, supplying
# `fieldnames` makes the reader treat the first row as data — if the file has
# its own header row, that row will be loaded as a document. Confirm.
loader = CSVLoader(
    file_path=file_path,
    encoding="utf-8",
    csv_args=dict(delimiter=",", quotechar='"', fieldnames=headers),
)
docs = loader.load()

In [49]:

def _first_field_value(label, text):
    """Return the stripped value of the first line starting with ``<label>:``.

    Returns None when no such line exists or when its value is blank.
    """
    # Anchoring at the start of a line keeps "CC: (Name)" from matching inside
    # "BCC: (Name)". Using `.*` (not `.+`) lets an empty value match in place
    # instead of letting the search slide on to a later, unrelated line.
    match = re.search(
        rf'^{re.escape(label)}:[^\S\n]*(.*)$', text, re.MULTILINE
    )
    if match is None:
        return None
    return match.group(1).strip() or None


def extract_desired_attributes(doc):
    """
    Extract only the desired attributes: Subject, Body, From (Name), To (Name), CC (Name).

    Args:
        doc: Object exposing ``page_content`` (text made of ``Field: value``
            lines) and ``metadata``.

    Returns:
        dict with keys ``subject``, ``body``, ``from_name``, ``to_name``,
        ``cc_name`` and ``metadata``. A field that is missing or blank is
        reported as None. ``metadata`` is the same object as ``doc.metadata``
        (not a copy).
    """
    text = doc.page_content
    email = {}

    # Subject: first line that starts with "Subject:". A blank subject yields
    # None rather than picking up a quoted "Subject:" line further down.
    email['subject'] = _first_field_value('Subject', text)

    # Body: everything after "Body:" up to the "From: (Name):" line (or the end
    # of the text). Only horizontal whitespace is skipped after the colon, so an
    # empty body does not swallow the newline and capture the following fields.
    body_match = re.search(
        r'^Body:[^\S\n]*(.*?)(?=\nFrom: \(Name\):|\Z)',
        text,
        re.DOTALL | re.MULTILINE,
    )
    email['body'] = (body_match.group(1).strip() or None) if body_match else None

    # Sender, recipient and CC display names.
    email['from_name'] = _first_field_value('From: (Name)', text)
    email['to_name'] = _first_field_value('To: (Name)', text)
    email['cc_name'] = _first_field_value('CC: (Name)', text)

    # Add metadata
    email['metadata'] = doc.metadata

    return email


def process_all_docs(docs):
    """
    Apply ``extract_desired_attributes`` to each document, preserving order.

    Returns a list with one extracted-attributes dict per input document.
    """
    return list(map(extract_desired_attributes, docs))


# Apply the function to all documents: one dict of extracted fields per loaded document.
processed_emails = process_all_docs(docs)




In [50]:

# Convert parsed emails to LangChain Document objects
def convert_to_documents(processed_emails):
    """Build one ``Document`` per parsed email.

    The page content lists Subject, Body, From, To and CC on labelled lines
    (each terminated by a newline); the email's ``metadata`` entry is passed
    through unchanged. Values that are None are rendered as the text "None".
    """

    def render(email):
        # One labelled line per field, each ending with a newline.
        lines = (
            f"Subject: {email['subject']}",
            f"Body: {email['body']}",
            f"From: {email['from_name']}",
            f"To: {email['to_name']}",
            f"CC: {email['cc_name']}",
        )
        return "\n".join(lines) + "\n"

    return [
        Document(page_content=render(email), metadata=email['metadata'])
        for email in processed_emails
    ]

# Convert processed emails to Document objects (one Document per parsed email).
documents = convert_to_documents(processed_emails)


In [51]:
# Configuration
# NOTE(review): db_name is not referenced in the visible cells — confirm it is still needed.
db_name = "email_db"
# Same value as the earlier file_path assignment; re-declared here.
file_path = "combined_emails.csv"
# Read the API key from the OPENAI_KEY environment variable (None when unset).
# NOTE(review): load_dotenv is imported but never called in the visible cells,
# so a .env file would not be loaded before this lookup — confirm.
openai_key = os.getenv("OPENAI_KEY")


# Rate limiting configuration
# Tokens-per-minute budget; used as the default limit when batching chunks.
MAX_TOKENS_PER_MIN = 1_000_000
# NOTE(review): not referenced in the visible cells.
EMBEDDING_MODEL_TOKENS = 8191  # Max tokens per request for text-embedding-ada-002

In [52]:
# Split the email documents with a target chunk size of 1000 and an overlap of 200
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Keep only the pieces that still contain non-whitespace text
chunks = [
    piece
    for piece in text_splitter.split_documents(documents)
    if piece.page_content.strip()
]


In [53]:
# Spot-check a single chunk; index 10 requires at least 11 chunks to exist.
print(chunks[10])

page_content='Subject: RE: south_scorecards_042024.xlsx
Body: Hi Kevin,

 

I have records and confirmation that I sent the scorecard to Taylor, it just apparently didn’t get moved into your scorecard book. Apologies for that. I added Taylor to the end of this scorecard workbook.

 



 

From: Kevin Christian <adris3216@yahoo.com> 
Sent: Friday, May 31, 2024 6:36 AM
To: Analytics <analytics@pedigo-usa.com>
Subject: Fwd: south_scorecards_042024.xlsx

 

Conner, I did not have a tab for Taylor Dailing in this report.  Can you please send that to me and to Taylor in case she also did not received?  Thank you.  Kevin C.





	Begin forwarded message:

	 

	From: Conner Boudreaux <conner.boudreaux@pedigo-usa.com <mailto:conner.boudreaux@pedigo-usa.com> >

	Subject: south_scorecards_042024.xlsx

	Date: May 15, 2024 at 10:27:21 AM CDT

	To: Kevin Christian <kevin@pedigo-usa.com <mailto:kevin@pedigo-usa.com> >

	 

	Hi Kevin,

	 

	Here is the updated and hopefully final version of scorecards

In [54]:

# Initialize OpenAI embeddings
# (openai_key comes from os.getenv("OPENAI_KEY") and is None when that variable is unset)
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

# Create a vector database (e.g., FAISS) with the document chunks
# NOTE(review): all chunks are submitted in this single call. The recorded
# output of this cell shows it failing with a 429 "Request too large" error
# (1,398,321 tokens requested against a 1,000,000 tokens-per-minute limit),
# in which case `vectorstore` is not assigned and nothing is saved.
vectorstore = FAISS.from_documents(chunks, embeddings)

# Save the vector database for future use
vectorstore.save_local("email_vectorstore")


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-ada-002 in organization org-mksYIsrXkSRYrNxIFytHgf2Z on tokens per min (TPM): Limit 1000000, Requested 1398321. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
# Helper function for batching
def calculate_batch_size(chunks, max_tokens_per_min=MAX_TOKENS_PER_MIN):
    """Yield batches of chunks that stay within the token-per-minute limit.

    Despite the name, this is a generator of batches (lists of chunks), not a
    size calculation. The cost of a chunk is approximated by the character
    length of its ``page_content``.

    Args:
        chunks: Iterable of objects exposing ``page_content``.
        max_tokens_per_min: Maximum summed cost allowed in one batch.

    Yields:
        Non-empty lists of consecutive chunks, in input order. A chunk whose
        own cost exceeds the limit is emitted in a batch by itself, since it
        cannot be split here.
    """
    batch = []
    token_count = 0
    for chunk in chunks:
        chunk_cost = len(chunk.page_content)  # Approximate token count
        # Flush only when there is something to flush: an oversized chunk
        # arriving on an empty batch must not produce an empty batch.
        if batch and token_count + chunk_cost > max_tokens_per_min:
            yield batch
            batch = []
            token_count = 0
        batch.append(chunk)
        token_count += chunk_cost
    if batch:
        yield batch

# Retry mechanism with exponential backoff
def retry_with_backoff(func, *args, max_retries=5, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying on any ``Exception``.

    Between attempts the function sleeps for ``2 ** attempt`` seconds plus up
    to one second of random jitter (attempt counted from 0). No sleep happens
    after the final failed attempt, because nothing follows it.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Total number of attempts before giving up.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        Exception: "Max retries reached." once every attempt has failed; the
            last underlying error is attached as ``__cause__``.
    """
    last_error = None
    retries = 0
    while retries < max_retries:
        try:
            return func(*args, **kwargs)
        except Exception as e:
            last_error = e
            print(f"Retry {retries + 1}/{max_retries} failed: {e}")
            retries += 1
            # Back off only if another attempt is coming.
            if retries < max_retries:
                delay = (2 ** (retries - 1)) + random.uniform(0, 1)
                time.sleep(delay)
    raise Exception("Max retries reached.") from last_error

# Process chunks in batches and apply rate limiting
def process_batches(chunks, vectorstore, pause_seconds=60):
    """Process chunks in batches to add them to the vector store.

    Batches come from ``calculate_batch_size``; each one is sent through
    ``retry_with_backoff(vectorstore.add_texts, ...)``. A batch that still
    fails after its retries is logged and skipped (best effort), and the loop
    moves on to the next batch.

    Args:
        chunks: Iterable of objects exposing ``page_content`` and ``metadata``.
        vectorstore: Object exposing ``add_texts(texts, metadatas=...)``.
        pause_seconds: Seconds to wait between consecutive batches so the
            per-minute token budget can reset. Defaults to 60.
    """
    # Materialise the batches so the last one is known; skip empty batches so
    # they cost neither a request nor a pause.
    batches = [batch for batch in calculate_batch_size(chunks) if batch]
    for position, batch in enumerate(batches, start=1):
        try:
            # Generate embeddings for the batch
            texts = [chunk.page_content for chunk in batch]
            metadatas = [chunk.metadata for chunk in batch]  # Ensure metadata exists
            retry_with_backoff(vectorstore.add_texts, texts, metadatas=metadatas)
            print(f"[INFO] Processed batch of size {len(batch)}.")
        except Exception as e:
            print(f"[ERROR] Error processing batch: {e}")
        # Respect the token-per-minute rate, but only when another batch follows.
        if position < len(batches):
            time.sleep(pause_seconds)

# Initialize vectorstore and process chunks
#
# Load the saved FAISS index when one exists; otherwise build it incrementally:
# seed the index with a single chunk (a small request) and add the remaining
# chunks through the rate-limited batch helper. Embedding every chunk in one
# call is what exceeds the tokens-per-minute limit, and adding the full chunk
# list on top of an already populated index would store every chunk twice.
try:
    # Check if the vector store already exists
    vectorstore_path = "email_vectorstore"
    if os.path.exists(vectorstore_path):
        print("[INFO] Loading existing vectorstore...")
        # NOTE(review): newer LangChain releases are reported to require
        # allow_dangerous_deserialization=True here — confirm against the
        # installed version. The saved index is assumed to be complete.
        vectorstore = FAISS.load_local(vectorstore_path, embeddings)
    else:
        if not chunks:
            raise ValueError("No chunks available to build the vector store.")
        print("[INFO] Creating a new vectorstore...")
        vectorstore = FAISS.from_documents(chunks[:1], embeddings)

        print("[INFO] Adding document chunks to the vectorstore...")
        process_batches(chunks[1:], vectorstore)
        # Persist only after the batches were added so the saved index is populated.
        vectorstore.save_local(vectorstore_path)

    # Verify vectorstore content
    print("[INFO] Fetching vector store collection...")
    total_vectors = vectorstore.index.ntotal
    if total_vectors == 0:
        print("[ERROR] Vector store is empty. Exiting...")
        # Raise instead of exit(): exit() would stop the notebook kernel.
        raise RuntimeError("Vector store is empty.")

    # Pull the stored vectors back out of the index, together with the text
    # and metadata kept in the docstore, in index order.
    vectors = np.asarray(vectorstore.index.reconstruct_n(0, total_vectors))
    stored_docs = [
        vectorstore.docstore.search(vectorstore.index_to_docstore_id[position])
        for position in range(total_vectors)
    ]
    # NOTE: from here on `documents` holds plain strings, replacing the list of
    # Document objects bound to this name by an earlier cell.
    documents = [stored.page_content for stored in stored_docs]
    metadatas = [stored.metadata for stored in stored_docs]

    print("[INFO] Vector store processing complete.")
    print(f"[INFO] Total documents in vectorstore: {len(documents)}")

except Exception as e:
    print(f"[ERROR] Failed to process vectorstore: {e}")
    # Re-raise so later cells do not run against missing or stale variables.
    raise


In [None]:
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go

# Step 4: Assign colors based on categories
print("[INFO] Assigning colors to categories...")
# NOTE(review): the metadata built earlier in this notebook is not shown to
# carry a "category" key — presumably every point falls back to "unknown" and
# gets the same color. Verify.
# Sorting the categories and drawing from a dedicated, seeded generator gives
# each category the same color on every run.
categories = sorted(
    {metadata.get("category", "unknown") for metadata in metadatas}, key=str
)
color_rng = random.Random(42)
color_map = {
    category: f"rgb({color_rng.randint(0, 255)}, {color_rng.randint(0, 255)}, {color_rng.randint(0, 255)})"
    for category in categories
}

# One color per stored vector, in the same order as `metadatas`.
colors = [
    color_map[metadata.get("category", "unknown")]
    for metadata in metadatas
]

# Step 5: 2D Visualization with t-SNE
print("[INFO] Reducing dimensionality to 2D for visualization...")
n_samples = len(vectors)
# Perplexity must be < number of samples, so cap it at 30 or n_samples - 1.
perplexity = 30 if n_samples > 30 else n_samples - 1
tsne_2d = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced_vectors_2d = tsne_2d.fit_transform(vectors)

print("[INFO] Creating 2D scatter plot...")
# Hover label: category plus the first 100 characters of the stored text.
hover_text_2d = [
    f"Category: {metadata.get('category', 'unknown')}<br>Text: {d[:100]}..."
    for metadata, d in zip(metadatas, documents)
]
scatter_2d = go.Scatter(
    x=reduced_vectors_2d[:, 0],
    y=reduced_vectors_2d[:, 1],
    mode="markers",
    marker=dict(size=5, color=colors, opacity=0.8),
    text=hover_text_2d,
    hoverinfo="text",
)
fig_2d = go.Figure(data=[scatter_2d])

layout_2d = dict(
    title="2D Vector Store Visualization (t-SNE)",
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40),
)
fig_2d.update_layout(**layout_2d)
fig_2d.show()

# Step 6: 3D Visualization with t-SNE
# Reuses the `perplexity` value computed for the 2D projection.
print("[INFO] Reducing dimensionality to 3D for visualization...")
tsne_3d = TSNE(n_components=3, random_state=42, perplexity=perplexity)
reduced_vectors_3d = tsne_3d.fit_transform(vectors)

print("[INFO] Creating 3D scatter plot...")
# Hover label: category plus the first 100 characters of the stored text.
hover_text_3d = [
    f"Category: {metadata.get('category', 'unknown')}<br>Text: {d[:100]}..."
    for metadata, d in zip(metadatas, documents)
]
scatter_3d = go.Scatter3d(
    x=reduced_vectors_3d[:, 0],
    y=reduced_vectors_3d[:, 1],
    z=reduced_vectors_3d[:, 2],
    mode="markers",
    marker=dict(size=5, color=colors, opacity=0.8),
    text=hover_text_3d,
    hoverinfo="text",
)
fig_3d = go.Figure(data=[scatter_3d])

axis_titles_3d = dict(
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    zaxis_title="t-SNE Dimension 3",
)
fig_3d.update_layout(
    title="3D Vector Store Visualization (t-SNE)",
    scene=axis_titles_3d,
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40),
)
fig_3d.show()
