# Prompt Engineering

This notebook loads the training data and randomly samples messages to send to the OpenAI API.

The API then returns a response, which is saved to `./data/labels_llm/{tag}/` as LLM-generated labels for later evaluation by comparison to the ground-truth human labels that live in `./data/labels/`.

The approach here is to use minimal prompt engineering and make use of OpenAI function calling to get back structured data similar to what is generated by the labeling app.

In [393]:
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import pprint as pp
from src.utils import clean_file_id, clean_message
from src.openai import get_tools
from src.utils import get_files


# load environment variables via python-dotenv (e.g. from a local .env file)
load_dotenv()

# the API key is read from the OPENAI_API_KEY environment variable, never hardcoded
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def make_prompt(text):
    """Build the zero-shot PII extraction prompt for the given message text.

    The prompt consists of fixed instructions (call `extract_pii_entities`,
    do not invent entities) followed by `text` inside a fenced block. The
    template lines carry a four-space indent, which is part of the returned
    string; only the first line of a multi-line `text` receives that indent.
    """
    # Each entry is one line of the prompt; whitespace-only entries are intentional.
    lines = [
        "",
        "    perform PII entity extraction from the below email message(s) using the provided `extract_pii_entities` function.",
        "    ",
        "    do not make up any entities or parts of entities that are not present in the message(s).",
        "",
        "    message(s):",
        "    ```",
        f"    {text}",
        "    ```",
        "    ",
    ]
    return "\n".join(lines)


def make_prompt_few_shot(text, examples):
    """Build the few-shot PII extraction prompt.

    The prompt contains the fixed instructions, then the `examples` string
    (example messages with their expected entities), then `text` in a fenced
    block under a `## MESSAGE:` header, and finally an open `## ENTITIES:`
    header for the model to complete.
    """
    # Paragraphs are separated by a single blank line; the leading "\n" and the
    # trailing empty paragraph reproduce the template's first and last newlines.
    sections = (
        "\nperform PII entity extraction from the below email message(s) using the provided `extract_pii_entities` function.",
        "do not make up any entities or parts of entities that are not present in the message(s).",
        "here are some example messages followed by the entities that should be extracted from them.",
        "lastly there is a message that you should extract entities from.",
        f"{examples}",
        f"## MESSAGE:\n```\n{text}\n```",
        "## ENTITIES:",
        "",
    )
    return "\n\n".join(sections)

In [394]:
# params

# "tag" is like an experiment id - it's used to keep track of different experiments/models/approaches etc
params = {
    "tag": "dev_gpt4_1106_preview_few_shot_1",
    "openai_model": "gpt-4-1106-preview",
    "data_path": "./data/emails_train_small.csv",
    "few_shot": True,
}
params["output_dir"] = f"./data/labels_llm/{params['tag']}"
# forwarded to pd.read_csv(nrows=...) when the data is loaded
params["nrows"] = None

# make output dir if it doesn't exist (exist_ok avoids a separate check-then-create step)
os.makedirs(params["output_dir"], exist_ok=True)

# few-shot runs load the worked examples from ./data/few_shot_examples.txt;
# zero-shot runs have no examples
if params["few_shot"]:
    with open("./data/few_shot_examples.txt", "r") as f:
        examples = f.read()
    prompt_template = make_prompt_few_shot("<text>", examples=examples)
else:
    examples = None
    prompt_template = make_prompt("<text>")

# save prompt (rendered with a "<text>" placeholder) next to the outputs of this tag
with open(f"{params['output_dir']}/prompt.txt", "w") as f:
    f.write(prompt_template)

# save params
with open(f"{params['output_dir']}/params.json", "w") as f:
    json.dump(params, f)


In [395]:
# read data and add a cleaned file id column derived from `file` via clean_file_id
df = pd.read_csv(params['data_path'], nrows=params['nrows']).assign(
    file_id_clean=lambda frame: frame['file'].apply(clean_file_id)
)
print(df.shape)

(10000, 3)


In [396]:
# preview the first rows to sanity-check the loaded columns
display(df.head())

Unnamed: 0,file,message,file_id_clean
0,germany-c/calp_hopewell/4.,Message-ID: <17014999.1075853725448.JavaMail.e...,germany_c_calp_hopewell_4_
1,campbell-l/all_documents/247.,Message-ID: <23887281.1075851883486.JavaMail.e...,campbell_l_all_documents_247_
2,kitchen-l/_americas/mrha/ooc/270.,Message-ID: <3290028.1075840876828.JavaMail.ev...,kitchen_l__americas_mrha_ooc_270_
3,zufferli-j/sent_items/124.,Message-ID: <7771939.1075842030615.JavaMail.ev...,zufferli_j_sent_items_124_
4,lokay-m/all_documents/906.,Message-ID: <19991611.1075844044421.JavaMail.e...,lokay_m_all_documents_906_


In [397]:
# find labeled data that hasn't been processed yet

# human labels: strip directory and ".json" to get the id
# (presumably the cleaned file id, since it is matched against df['file_id_clean'] below)
files_labels = [
    os.path.basename(path).replace('.json', '')
    for path in get_files("./data/labels/")
]
print(f"files labeled: {len(files_labels)}")

# llm labels for this tag are saved as {file_id_clean}__{tag}.json
# NOTE(review): the output dir also holds prompt.txt and params.json; if get_files
# returns them they are included in the count printed below - confirm
llm_suffix = f"__{params['tag']}.json"
files_labels_llm = [
    os.path.basename(path).replace(llm_suffix, '')
    for path in get_files(f"{params['output_dir']}/")
]
print(f"files labeled llm: {len(files_labels_llm)}")

# keep ids that are human-labeled, not yet llm-labeled, and present in the df;
# sorted so the candidate order does not depend on set iteration order
# (which varies between kernels because string hashing is randomized)
files_to_process = sorted(
    (set(files_labels) - set(files_labels_llm)) & set(df['file_id_clean'].unique())
)
print(f"files to process: {len(files_to_process)}")

files labeled: 81
files labeled llm: 55
files to process: 1


In [398]:
# random sample from files to process
# fail early with a clear message instead of numpy's generic empty-input error
if len(files_to_process) == 0:
    raise ValueError("no files left to process: every labeled file already has an llm label for this tag")
file_to_process = np.random.choice(files_to_process, size=1, replace=False)[0]
print(file_to_process)
df_sample = df[df['file_id_clean'] == file_to_process]

# some data wrangling
file_id = df_sample.file.values[0]
file_id_clean = clean_file_id(file_id)
text = df_sample.message.values[0]
text_clean = clean_message(text)

# print what we have
print("=" * 100)
print(file_id)
print(file_id_clean)
print("." * 100)
print(text_clean)
print("=" * 100)

# call openai
if params['few_shot']:
    prompt = make_prompt_few_shot(text_clean, examples=examples)
else:
    prompt = make_prompt(text_clean)
tools = get_tools()
# tool_choice pins the model to the extract_pii_entities function so the
# reply comes back as structured function arguments rather than free text
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=params['openai_model'],
    tools=tools,
    tool_choice={
        "type": "function",
        "function": {"name": "extract_pii_entities"},
    },
)

# extract response
chat_completion_message = chat_completion.choices[0].message
tool_calls = chat_completion_message.tool_calls
if not tool_calls:
    # surface a readable error rather than failing on None[0]
    raise RuntimeError(
        f"model returned no tool call (finish_reason={chat_completion.choices[0].finish_reason!r})"
    )
tool_call = tool_calls[0]
# the function arguments arrive as a JSON string
extracted_data = json.loads(tool_call.function.arguments)

# print response
pp.pprint(extracted_data)

nemec_g_notes_inbox_1396_
nemec-g/notes_inbox/1396.
nemec_g_notes_inbox_1396_
....................................................................................................
Please join Lexis-Nexis trainings specific tailored for legal.  Learn to=20
conduct Lexis-Nexis search from Enron Lexis-Nexis web page on our portal. T=
he=20
training schedule now is posted on our portal Enron Legal Edge=20
http://legaledge.corp.enron.com/  under Lexis Day Announcement banner and i=
n=20
the Legal Calendar.  Lexis-Nexis link is in Research category.

Join Lexis-Nexis Day at Enron Legal Houston to
 discover LEXIS-NEXIS and it=01,s vast
 collection of information on=20
All areas of Law and Business
Watch as LEXIS-NEXIS SEARCH ADVISOR practically does all of your research f=
or=20
you.

Tuesday, May 15, 2001
Conference Room EB48C2

9:00 am - 10:00 am- Lexis.com Basic
10:30 am-11:30 am- e-Commerce & Cyberlaw
Noon - 1:00 pm  - Area of Laws/Practice Pages=20
1:30 pm- 2:30 pm- Public Records
3:00 pm

In [399]:
# save llm extracted data in ./data/labels_llm/{tag}/{file_id_clean}__{tag}.json

# record the original (uncleaned) file id alongside the extracted entities
extracted_data["file_id"] = file_id

output_path = f"{params['output_dir']}/{file_id_clean}__{params['tag']}.json"
# make sure the target directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
print(f"Saving to {output_path}")
with open(output_path, "w") as f:
    f.write(json.dumps(extracted_data))
print("Done.")

Saving to ./data/labels_llm/dev_gpt4_1106_preview_few_shot_1/nemec_g_notes_inbox_1396___dev_gpt4_1106_preview_few_shot_1.json
Done.
