In [143]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from pydantic import BaseModel
import pprint as pp
from src.utils import clean_file_id, clean_message
from src.openai import get_tools


# Load variables from a .env file (if one is found) into the process
# environment before the API key is looked up.
load_dotenv()

# Shared OpenAI client used by the cells below. The key is read from the
# OPENAI_API_KEY environment variable; the lookup yields None when it is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def make_prompt(text):
    """Build the user message that asks the model to extract PII entities.

    The email text is embedded between triple-backtick fences, below an
    instruction to call the `extract_pii_entities` function and not to invent
    entities.

    Args:
        text: Email message(s) to embed. It is formatted into the prompt as-is;
            for a multi-line value only its first line receives the four-space
            indent that every other prompt line carries.

    Returns:
        The prompt string. It starts with a newline and ends with a
        four-space, whitespace-only line.
    """
    pad = " " * 4
    pieces = (
        "",
        pad + "perform PII entity extraction from the below email message(s) using the provided `extract_pii_entities` function.",
        pad,  # whitespace-only separator line
        pad + "do not make up any entities or parts of entities that are not present in the message(s).",
        "",
        pad + "message(s):",
        pad + "```",
        pad + f"{text}",
        pad + "```",
        pad,  # trailing indent that preceded the closing quotes
    )
    return "\n".join(pieces)

In [144]:
# Run configuration read by the cells below.
data_path = "./data/emails_train_small.csv"  # CSV of emails to load
nrows = None  # forwarded to pd.read_csv; None loads every row (e.g. 1000 for a quick run)
openai_model = "gpt-3.5-turbo"  # model name sent with the chat completion request
tag = "dev"  # appears in the output directory and file name of the saved labels

In [145]:
# Load the emails; `nrows` limits how many rows are read (None reads all).
df = pd.read_csv(data_path, nrows=nrows)
frame_shape = df.shape  # (row count, column count)
print(frame_shape)

(10000, 2)


In [146]:
# Preview the first five rows of the loaded frame.
display(df.head(n=5))

Unnamed: 0,file,message
0,germany-c/calp_hopewell/4.,Message-ID: <17014999.1075853725448.JavaMail.e...
1,campbell-l/all_documents/247.,Message-ID: <23887281.1075851883486.JavaMail.e...
2,kitchen-l/_americas/mrha/ooc/270.,Message-ID: <3290028.1075840876828.JavaMail.ev...
3,zufferli-j/sent_items/124.,Message-ID: <7771939.1075842030615.JavaMail.ev...
4,lokay-m/all_documents/906.,Message-ID: <19991611.1075844044421.JavaMail.e...


In [147]:
# Draw one random email to label. No random_state is passed on purpose, so
# each run of this cell may pick a different message.
df_sample = df.sample(1)
file_id = df_sample["file"].iloc[0]
file_id_clean = clean_file_id(file_id)
text = df_sample["message"].iloc[0]
text_clean = clean_message(text)

# Show the raw and cleaned ids plus the cleaned text that will be sent.
print("=" * 100)
print(file_id)
print(file_id_clean)
print("." * 100)
print(text_clean)
print("=" * 100)

# Force the model to answer through the `extract_pii_entities` tool so the
# reply arrives as function-call arguments rather than free text.
prompt = make_prompt(text_clean)
tools = get_tools()
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=openai_model,
    tools=tools,
    tool_choice={
        "type": "function",
        "function": {"name": "extract_pii_entities"},
    },
)

# extract response
choice = chat_completion.choices[0]
chat_completion_message = choice.message
if not chat_completion_message.tool_calls:
    # Fail with a clear message instead of a TypeError from indexing None.
    raise RuntimeError(
        f"No tool call returned for {file_id!r} "
        f"(finish_reason={choice.finish_reason!r})."
    )
tool_call = chat_completion_message.tool_calls[0]
try:
    # The arguments are model-generated text and may be truncated or
    # otherwise invalid JSON.
    extracted_data = json.loads(tool_call.function.arguments)
except json.JSONDecodeError as exc:
    raise ValueError(
        f"Tool call arguments for {file_id!r} are not valid JSON "
        f"(finish_reason={choice.finish_reason!r})."
    ) from exc
pp.pprint(extracted_data)

dasovich-j/notes_inbox/146.
dasovich_j_notes_inbox_146_
....................................................................................................
California's Price Caps Raising Average Cost of Power=20
The retail price caps imposed in California are leading to higher average=
=20
prices for longer periods of time as demand responsiveness is dulled and=20
supply is retarded, according to a report released by Morgan Stanley Dean=
=20
Witter. Traders interviewed by the firm said that calendar strips for 2001=
=20
through 2003 have traded in excess of $100/MWh, or around $30/MWh higher th=
an=20
a month ago. What's more, the firm said, the expected trough in the forward=
=20
curve =01* projected in the 2002-2003 time frame =01* "continues to move ou=
t in=20
time."=20

The Dean Witter study reported that the change in futures pricing for the=
=20
California Oregon Border (a proxy for Northern California pricing) and at=
=20
Palo Verde (a proxy for Southern California pricing) t

In [None]:
# Persist the labels for the most recently sampled email. `file_id`,
# `file_id_clean` and `extracted_data` come from the sampling/extraction
# cell, so re-run that cell first to avoid saving stale state.
output_path = f"./data/labels_llm/{tag}/{file_id_clean}__{tag}.json"
output_dir, _ = os.path.split(output_path)
os.makedirs(output_dir, exist_ok=True)
print(f"Saving to {output_path}")

# Record which source email the labels belong to (modifies extracted_data).
extracted_data["file_id"] = file_id
with open(output_path, "w") as out_file:
    json.dump(extracted_data, fp=out_file)
print("Done.")

Saving to ./data/labels_llm/dev/germany_c_all_documents_685___dev.json
Done.
