In [85]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from pydantic import BaseModel
import pprint as pp
from src.utils import clean_file_id, clean_message


# load_dotenv() runs before the key lookup so that any value it places in the
# process environment is visible when the client is constructed.
load_dotenv()

# Single OpenAI client reused by the cells below; the key comes from the
# OPENAI_API_KEY environment variable (None is passed if it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def make_prompt(text):
    """Build the user prompt that asks the model to extract PII from `text`.

    Args:
        text: Email message content; it is interpolated verbatim between the
            triple-backtick fences at the end of the prompt.

    Returns:
        The prompt string: the extraction instructions, the list of entity
        types of interest, and the fenced message text.
    """
    # The template is returned exactly as written, including its leading
    # newline and the four-space indent carried by each non-empty line.
    return f"""
    perform PII entity extraction from the below email message(s) using the provided `extract_pii_entities` function.
    
    we are interested in extracting the following PII entities:
    - names: These can come from email heading or body text.
    - phone numbers: Typically phone numbers may naturally appear somewhere in the message text.
    - email addresses: These can come from email to/from fields or body text of the message(s) themselves.
    - physical addresses: Typically physical addresses may naturally appear sometimes somewhere in the message text.

    message(s):
    ```
    {text}
    ```
    """


def get_tools():
    """Return the tool definitions offered to the OpenAI chat completion call.

    Returns:
        A one-element list holding the `extract_pii_entities` function tool;
        its `parameters` entry is the JSON schema generated from the local
        `MessageEntities` model.
    """

    # Class name and docstring are kept verbatim: pydantic is expected to
    # surface them as the schema's title/description -- TODO confirm.
    class MessageEntities(BaseModel):
        """Message entities."""

        names: list[str] = []
        phone_numbers: list[str] = []
        email_addresses: list[str] = []
        physical_addresses: list[str] = []

    # Describe the single callable function, then wrap it in the tool envelope.
    extract_fn = {
        "name": "extract_pii_entities",
        "description": "Extract any PII related entities found in the message(s). If there are none found, return an empty list for each type.",
        "parameters": MessageEntities.model_json_schema(),
    }
    return [{"type": "function", "function": extract_fn}]

In [86]:
# Run parameters -- edit these, then re-run the cells that follow.
data_path = "./data/emails_train_small.csv"  # CSV handed to pd.read_csv
nrows = 1000  # row limit passed to pd.read_csv; use None instead to read every row
openai_model = "gpt-3.5-turbo"  # model name sent with the chat completion request
tag = "dev"  # appears in the output directory and file name of the saved labels

In [87]:
# Read the emails CSV (limited to `nrows` rows) and report the frame's shape.
df = pd.read_csv(filepath_or_buffer=data_path, nrows=nrows)
print(df.shape)

(1000, 2)


In [88]:
# Preview the first five rows of the loaded frame.
display(df.head(n=5))

Unnamed: 0,file,message
0,germany-c/calp_hopewell/4.,Message-ID: <17014999.1075853725448.JavaMail.e...
1,campbell-l/all_documents/247.,Message-ID: <23887281.1075851883486.JavaMail.e...
2,kitchen-l/_americas/mrha/ooc/270.,Message-ID: <3290028.1075840876828.JavaMail.ev...
3,zufferli-j/sent_items/124.,Message-ID: <7771939.1075842030615.JavaMail.ev...
4,lokay-m/all_documents/906.,Message-ID: <19991611.1075844044421.JavaMail.e...


In [89]:
# Pick one random email from the loaded frame and clean its id and body.
df_sample = df.sample(n=1)
file_id = df_sample["file"].iloc[0]
file_id_clean = clean_file_id(file_id)
text = df_sample["message"].iloc[0]
text_clean = clean_message(text)

# Show what is about to be sent, framed by separator rules.
rule = "=" * 100
print(rule)
print(file_id, file_id_clean, text_clean, sep="\n")
print(rule)

# Build the request pieces; tool_choice names extract_pii_entities explicitly
# so the reply is requested as a call to that function.
prompt = make_prompt(text_clean)
tools = get_tools()
forced_tool = {"type": "function", "function": {"name": "extract_pii_entities"}}
chat_completion = client.chat.completions.create(
    model=openai_model,
    messages=[{"role": "user", "content": prompt}],
    tools=tools,
    tool_choice=forced_tool,
)

# The first tool call of the first choice carries the arguments as a JSON
# string; decode it into a dict and pretty-print the result.
chat_completion_message = chat_completion.choices[0].message
tool_call = chat_completion_message.tool_calls[0]
extracted_data = json.loads(tool_call.function.arguments)
pp.pprint(extracted_data)

kean-s/archiving/untitled/962.
kean_s_archiving_untitled_962_
FYI.  I am sending Vince some materials we have used
----- Forwarded by Steven J Kean/NA/Enron on 09/20/2000 09:21 AM -----

	Vince J Kaminski@ECT
	09/18/2000 01:26 PM
		 
		 To: Steven J Kean/NA/Enron@Enron
		 cc: Charlene Jackson/Corp/Enron@ENRON, Celeste Roberts/HOU/ECT@ECT, Vince J 
Kaminski/HOU/ECT@ECT, Ashley Baxter/Corp/Enron@Enron
		 Subject: Presentation to faculty and students at Berkeley

Steve,

I am a lead recruiter at the University of California at Berkeley for
Enron Analyst/Associate program.

I contacted several friends who work at Berkeley and received an invitation
from one of them to make a presentation at the weekly Faculty Seminar
of the Dept. of Industrial Engineering and Operations Research.

The students and faculty members from the business school will be also 
invited.

Berkeley in general, and Department of Industrial Engineering and Operations 
Research in
particular, are important centers of aca

In [None]:
# Write the extracted entities for the current sample to a per-file JSON under
# ./data/labels_llm/<tag>/, creating that directory on first use.
output_path = f"./data/labels_llm/{tag}/{file_id_clean}__{tag}.json"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
print(f"Saving to {output_path}")

# Record the original (uncleaned) file id alongside the entities before dumping.
extracted_data["file_id"] = file_id
with open(output_path, mode="w") as out_file:
    json.dump(extracted_data, fp=out_file)
print("Done.")

Saving to ./data/labels_llm/dev/salisbury_h_inbox_299___dev.json
Done.
