In [148]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from pydantic import BaseModel
import pprint as pp
from src.utils import clean_file_id, clean_message


# Pull variables from a local .env file (if present) into the environment so
# the API key does not need to be hard-coded in the notebook.
load_dotenv()

# Shared OpenAI client; the key is looked up in the OPENAI_API_KEY variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def make_prompt(text):
    """Build the user prompt that asks the model to extract PII from `text`.

    The message text is placed between triple-backtick fences. The template's
    leading newline, four-space indentation and trailing indent are reproduced
    verbatim, so the returned string is identical to the triple-quoted form.
    """
    indent = "    "
    fence = indent + "```"
    pieces = [
        "",  # the template opens with a newline
        indent
        + "perform PII entity extraction from the below email message(s) using the provided `extract_pii_entities` function.",
        indent,  # whitespace-only separator line
        indent
        + "do not make up any entities or parts of entities that are not present in the message(s).",
        "",
        indent + "message(s):",
        fence,
        f"{indent}{text}",
        fence,
        indent,  # indentation that precedes the closing quotes
    ]
    return "\n".join(pieces)


def get_tools():
    """Return the tool (function-calling) definitions handed to OpenAI.

    Exactly one function tool, `extract_pii_entities`, is exposed. Its
    parameter schema is generated from a local pydantic model that holds one
    list of strings per PII category, each defaulting to an empty list.
    """

    # NOTE: the class name, docstring and field order are left untouched on
    # purpose; pydantic is expected to carry them into the generated JSON
    # schema (title / description / property order).
    class MessageEntities(BaseModel):
        """Message entities."""

        names: list[str] = []
        phone_numbers: list[str] = []
        email_addresses: list[str] = []
        physical_addresses: list[str] = []

    extract_fn = dict(
        name="extract_pii_entities",
        description="Extract any PII related entities found in the message(s). If there are none found, return an empty list for each type.",
        parameters=MessageEntities.model_json_schema(),
    )
    return [dict(type="function", function=extract_fn)]

In [149]:
# params
# Label used in the output directory and file name when results are saved.
tag = "dev"
# Chat model name passed to the OpenAI completion request.
openai_model = "gpt-3.5-turbo"
# CSV file read with pd.read_csv in the loading cell.
data_path = "./data/emails_train_small.csv"
# Maximum number of rows to read from the CSV; keeps dev runs small.
nrows = 1000
# Alternative setting: no row cap (read the whole file).
# nrows = None

In [150]:
# Load at most `nrows` rows of the email dataset and report (rows, columns).
df = pd.read_csv(data_path, nrows=nrows)
print(df.shape)

(1000, 2)


In [151]:
# Preview the first rows: a `file` id column and the raw `message` text.
display(df.head())

Unnamed: 0,file,message
0,germany-c/calp_hopewell/4.,Message-ID: <17014999.1075853725448.JavaMail.e...
1,campbell-l/all_documents/247.,Message-ID: <23887281.1075851883486.JavaMail.e...
2,kitchen-l/_americas/mrha/ooc/270.,Message-ID: <3290028.1075840876828.JavaMail.ev...
3,zufferli-j/sent_items/124.,Message-ID: <7771939.1075842030615.JavaMail.ev...
4,lokay-m/all_documents/906.,Message-ID: <19991611.1075844044421.JavaMail.e...


In [152]:
# Draw one random email and derive the cleaned id / body used downstream.
# No random_state is passed, so a different row is drawn on every run.
df_sample = df.sample(1)
file_id = df_sample.file.values[0]
file_id_clean = clean_file_id(file_id)
text = df_sample.message.values[0]
text_clean = clean_message(text)

# Show what is about to be sent to the model.
print("=" * 100)
print(file_id)
print(file_id_clean)
print("." * 100)
print(text_clean)
print("=" * 100)

# Force the model to answer through the `extract_pii_entities` tool so the
# reply arrives as structured JSON arguments rather than free text.
prompt = make_prompt(text_clean)
tools = get_tools()
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=openai_model,
    tools=tools,
    tool_choice={
        "type": "function",
        "function": {"name": "extract_pii_entities"},
    },
)

# extract response
chat_completion_choice = chat_completion.choices[0]
chat_completion_message = chat_completion_choice.message

# `tool_calls` can be None/empty; fail with a readable message instead of an
# opaque "'NoneType' object is not subscriptable".
if not chat_completion_message.tool_calls:
    raise RuntimeError(
        f"No tool call returned for {file_id!r} "
        f"(finish_reason={chat_completion_choice.finish_reason!r})."
    )
tool_call = chat_completion_message.tool_calls[0]

# The arguments are a JSON string produced by the model and may be truncated
# or malformed; surface which email failed and why. ValueError is raised so
# anything that handled json.JSONDecodeError's base class still works.
try:
    extracted_data = json.loads(tool_call.function.arguments)
except json.JSONDecodeError as exc:
    raise ValueError(
        f"Tool call arguments for {file_id!r} are not valid JSON "
        f"(finish_reason={chat_completion_choice.finish_reason!r}): {exc}"
    ) from exc
pp.pprint(extracted_data)

kean-s/calendar/untitled/7403.
kean_s_calendar_untitled_7403_
....................................................................................................
I'll get started on Phil's pc and ship Kathy's next week.  You owe $50 or
whatever you want to contribute.

Thanks Rob

-----Original Message-----
From: Steven.J.Kean@enron.com [mailto:Steven.J.Kean@enron.com]
Sent: Wednesday, February 07, 2001 6:46 AM
To: rob.kean@worldnet.att.net
Subject: Re: Phil's Birthday



I'm in.  A;lso, I don't think we have settled up on Mom and Dad's xmas
present yet.  What do I owe?



"Rob Kean"
<rob.kean@worldne        To:     "Doug & Karen Reiman
\(E-mail\)"
t.att.net>               <dkreiman@mcleodusa.net>,
"Steve & Melissa Kean
\(E-mail\)" <skean@enron.com>,
"Melissa Kean
02/05/2001 09:05         \(E-mail\)" <kean@rice.edu>
PM                       cc:
Please respond to        Subject:     Phil's Birthday
rob.kean





It's alittle early to be talking about Phil's birhtday but I have had some

In [153]:
# Persist the extraction for the sampled email under ./data/labels_llm/<tag>/.
output_path = f"./data/labels_llm/{tag}/{file_id_clean}__{tag}.json"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
print(f"Saving to {output_path}")

# Record which source file the labels belong to (updates the dict in place).
extracted_data.update(file_id=file_id)
with open(output_path, "w") as out_file:
    json.dump(extracted_data, out_file)
print("Done.")

Saving to ./data/labels_llm/dev/kean_s_calendar_untitled_7403___dev.json
Done.
