# Text Classification with GPT-4o
## ABB #6 - Session 2

Code authored by: Shaw Talebi

### Imports

In [1]:
import os
import re
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# pull the variables defined in the local .env file into the environment
load_dotenv()

# build the OpenAI API client from the key stored under OPENAI_API_KEY
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

### Load Data

In [3]:
# read the labelled emails, then cast every column to str (later cells call str methods on them)
df = pd.read_csv("data/emails.csv")
df = df.astype(str)
df.head()

Unnamed: 0,subject,from,body,label
0,Thank you Shawhin for your RSVP,Evite <info@mailva.evite.com>,Thank you for your RSVP\n\n\nYou replied Yes f...,personal
1,Thank you Shawhin for your RSVP,Evite <info@mailva.evite.com>,Thank you for your RSVP\n\n\nYou replied Yes f...,personal
2,"Folder shared with you: ""Knocking at The Door""","""Ifeoma Ahuna (via Google Drive)"" <drive-share...",I've shared an item with you:\r\n\r\nKnocking ...,personal
3,The Colony Shoreline Trail 5K and 15K Registra...,RunSignup <info+auto@runsignup.com>,[1]The Colony Shoreline Trail 5K and 15K\r\n\r...,personal
4,Please join us for a special event from Apple ...,Apple <News@InsideApple.Apple.com>,"Wonderlust.\r\n\r\nSeptember 12, 2023 10:00 a....",personal


### Feature Engineering (Software 2.0 Way)

In [4]:
def manual_feature_engineering(df):
    """
        Generate a suite of manually defined boolean features.

        The feature columns are written onto `df` in place (later cells read
        `df["is_personal"]`) and a frame holding only those columns is returned.
        Columns are selected by name, so calling the function again on an
        already-processed `df` returns the same result instead of an empty frame.

        Args:
            df: DataFrame with string columns "body", "from" and "label".

        Returns:
            DataFrame with the seven predictor columns followed by the
            "is_personal" target as the last column.
    """

    # lower-case once so every check below is case-insensitive
    body = df["body"].str.lower()
    sender = df["from"].str.lower()

    # phrase searched for in the body -> name of the resulting feature column
    body_phrases = {
        "lol": "contains_lol",
        "omg": "contains_omg",
        "attached": "contains_attached",
        "attachment": "contains_attachment",
        "order confirmation": "contains_order_confirmation",
        "payment summary": "contains_payment_summary",
    }
    for phrase, column in body_phrases.items():
        # regex=False: plain substring match, same as the `in` operator
        df[column] = body.str.contains(phrase, regex=False)

    # sender is common person domain (gmail, yahoo, hotmail)
    common_domains = ["gmail", "yahoo", "hotmail"]
    df["sender_has_common_domain"] = sender.apply(
        lambda address: any(domain in address for domain in common_domains)
    )

    # target: is personal email
    df["is_personal"] = (df["label"] == "personal")

    # predictors first, target last (callers split on the last column)
    feature_columns = list(body_phrases.values()) + ["sender_has_common_domain", "is_personal"]
    return df[feature_columns]

In [5]:
# build the feature table from df (this call also adds the feature columns to df itself),
# write it to disk, and preview the first rows
df_transformed = manual_feature_engineering(df)
df_transformed.to_csv("data/emails_transformed.csv")
df_transformed.head()

Unnamed: 0,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal
0,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,True
2,False,False,False,True,False,False,False,True
3,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,True


In [6]:
# predictors are every column except the last; the last column is the target
X, y = df_transformed.iloc[:, :-1], df_transformed.iloc[:, -1]

# hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# fit a logistic regression classifier on the training split
clf = LogisticRegression(random_state=0).fit(
    X_train,
    y_train,
)

In [7]:
# accuracy on the training split and on the held-out test split
train_acc, test_acc = clf.score(X_train, y_train), clf.score(X_test, y_test)

# one value per line: train first, then test
print(train_acc, test_acc, sep="\n")

0.8535564853556485
0.8


### Text Classification with GPT-4o (Software 3.0 Way)

In [8]:
def generate_label(system_prompt, subject, sender, body, model="gpt-4o-mini", temperature=0.25):
    """
        Generate a label for one email based on its subject, sender, and body.

        Any examples live in `system_prompt`, so the same function serves both
        the 0-shot and the few-shot runs.

        Args:
            system_prompt: instructions sent as the "developer" message.
            subject, sender, body: email fields inserted into the user prompt
                via `prompt_template`.
            model: chat model name (default keeps the original "gpt-4o-mini").
            temperature: sampling temperature (default keeps the original 0.25).

        Returns:
            The text of the first completion choice, or "" when the API returns
            no text content.
    """
    user_prompt = prompt_template(subject, sender, body)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature,
    )

    # message.content may be None; return "" so callers' substring checks don't raise
    content = response.choices[0].message.content
    return content if content is not None else ""

In [9]:
def prompt_template(subject, sender, body):
    """
        Format one email as the user prompt sent to the model.

        Args:
            subject: email subject line.
            sender: contents of the email's "from" field.
            body: email body text.

        Returns:
            The prompt string: the three fields followed by the output cue.
    """
    # the string is flush-left so no indentation leaks into the prompt
    return f"""
Input Email:
Subject: {subject}
Sender: {sender}
Body: {body}

Output: [Your classification: 1 or 0]
"""

#### 0-shot

In [10]:
# 0-shot system prompt: task description followed by the output rules
system_prompt = (
    "You are an intelligent assistant that classifies emails based on whether they are personal or not. "
    "Given an email's subject, sender, and body from the user, determine if the email is personal (indicated by 1) or not personal "
    "(indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. "
    "Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.\n"
    "\n"
    "Instructions:\n"
    "- Carefully analyze the subject, sender, and body to understand the context and tone of the email.\n"
    "- Return: 1 if the email is personal or 0 if the email is not personal.\n"
    "- ONLY return 1 or 0\n"
)

In [11]:
%%time
# collect one raw model response per email, in dataframe order
label_0shot_list = []

for index, row in df.iterrows():
    # subject, sender, and body are passed positionally after the system prompt
    label_0shot_list.append(generate_label(system_prompt, *row[['subject', 'from', 'body']]))

    # brief pause whenever the row index is a multiple of 15 to avoid throttles
    if index % 15 == 0:
        print(index)
        time.sleep(1)

0
15
30
45
60
75
90
105
120
135
150
165
180
195
210
225
240
255
270
285
CPU times: user 1.38 s, sys: 224 ms, total: 1.6 s
Wall time: 5min 6s


In [12]:
# inspect the raw 0-shot responses collected by the loop above
print(label_0shot_list)

['1', '1', '0', '0', '0', '0', '0', '1', '1', '1', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '1', '1', '1', '0', '1', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',

In [13]:
# a response counts as "personal" when it contains the character "1";
# correct_0shot marks rows where that prediction matches the ground truth
df['label_0shot'] = list(map(lambda label: "1" in label, label_0shot_list))
df['correct_0shot'] = df['is_personal'].eq(df['label_0shot'])

In [14]:
# 0-shot accuracy: fraction of rows where the prediction matches the ground truth
print(df['correct_0shot'].mean())

0.7190635451505016


#### few-shot

In [15]:
# few-shot system prompt: the same instructions as the 0-shot prompt, followed by two labelled
# examples interpolated from rows 0 and 150 of df (label rendered as 1/0 via int(is_personal))
# NOTE(review): the few-shot labelling loop iterates over all of df, so these two example rows
# are also scored — consider excluding them when computing few-shot accuracy
system_prompt_fewshot = f"""You are an intelligent assistant that classifies emails based on whether they are personal or not. \
Given an email's subject, sender, and body from the user, determine if the email is personal (indicated by 1) or not personal \
(indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. \
Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.

Instructions:
- Carefully analyze the subject, sender, and body to understand the context and tone of the email.
- Return: 1 if the email is personal or 0 if the email is not personal.
- ONLY return 1 or 0

## Examples:

<user_input id="example-1">
Subject: {df['subject'][0]}
Sender: {df['from'][0]}
Body: {df['body'][0]}
</user_input id="example-1">

<assistant_response id="example-1">
{int(df['is_personal'][0])}
</assistant_response id="example-1">

<user_input id="example-2">
Subject: {df['subject'][150]}
Sender: {df['from'][150]}
Body: {df['body'][150]}
</user_input id="example-2">

<assistant_response id="example-2">
{int(df['is_personal'][150])}
</assistant_response id="example-2">
"""

In [16]:
%%time
# collect one raw few-shot response per email, in dataframe order
label_fewshot_list = []

for index, row in df.iterrows():
    # subject, sender, and body are passed positionally after the system prompt
    label_fewshot_list.append(generate_label(system_prompt_fewshot, *row[['subject', 'from', 'body']]))

    # brief pause whenever the row index is a multiple of 10 to avoid throttles
    if index % 10 == 0:
        print(index)
        time.sleep(1)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
CPU times: user 1.39 s, sys: 234 ms, total: 1.62 s
Wall time: 4min 56s


In [17]:
# inspect the raw few-shot responses collected by the loop above
print(label_fewshot_list)

['1', '1', '0', '0', '0', '0', '0', '1', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '1', '0', '1', '1', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',

In [18]:
# a response counts as "personal" when it contains the character "1";
# correct_fewshot marks rows where that prediction matches the ground truth
df['label_fewshot'] = list(map(lambda label: "1" in label, label_fewshot_list))
df['correct_fewshot'] = df['is_personal'].eq(df['label_fewshot'])

In [19]:
# few-shot accuracy: fraction of rows where the prediction matches the ground truth
print(df['correct_fewshot'].mean())

0.7993311036789298


---

### 0-shot with more tokens (describe the email first, then classify)

In [20]:
# 0-shot system prompt that asks for a short description before the fenced classification
system_prompt_moretokens = (
    "You are an intelligent assistant that classifies emails based on whether they are personal or not. "
    "Given an email's subject, sender, and body from the user, determine if the email is personal (indicated by 1) or not personal "
    "(indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. "
    "Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.\n"
    "\n"
    "Instructions:\n"
    "1. Briefly describe the context and tone of the email using the subject, sender, and body.\n"
    "2. Output your classification clearly in triple backticks:\n"
    "   ```1``` for personal\n"
    "   ```0``` for not personal\n"
)

In [21]:
%%time
# collect one raw "describe, then classify" response per email, in dataframe order
label_0shot_moretokens_list = []

for index, row in df.iterrows():
    # subject, sender, and body are passed positionally after the system prompt
    label_0shot_moretokens_list.append(generate_label(system_prompt_moretokens, *row[['subject', 'from', 'body']]))

    # brief pause whenever the row index is a multiple of 15 to avoid throttles
    if index % 15 == 0:
        print(index)
        time.sleep(1)

0
15
30
45
60
75
90
105
120
135
150
165
180
195
210
225
240
255
270
285
CPU times: user 1.77 s, sys: 245 ms, total: 2.02 s
Wall time: 11min 58s


In [27]:
def _parse_fenced_label(response_text):
    """Return True when the model's final fenced answer is 1 (personal)."""
    # the prompt asks for the answer inside triple backticks; use the last fenced 0/1
    # so trailing whitespace or text after the fence cannot hide the answer
    fenced = re.findall(r"```\s*([01])\s*```", response_text)
    if fenced:
        return fenced[-1] == "1"
    # no well-formed fence: fall back to the original check on the tail of the response
    return "1" in response_text[-9:-1]

# add label to df
df['label_0shot_moretokens'] = [_parse_fenced_label(label) for label in label_0shot_moretokens_list]
df['correct_0shot_moretokens'] = df['is_personal']==df['label_0shot_moretokens']

In [28]:
# accuracy of the 0-shot "more tokens" run: fraction of rows where the prediction matches the ground truth
print(df['correct_0shot_moretokens'].mean())

0.6956521739130435


In [29]:
# preview a few raw responses instead of dumping every long response into the cell output
label_0shot_moretokens_list[:5]

["The email is a confirmation from Evite regarding an RSVP for a personal event, specifically Ryan's 30th birthday. The tone is friendly and acknowledges the recipient's participation in a social gathering. The content is related to a personal invitation and includes links to view the invitation and send gifts, which further emphasizes its personal nature. \n\nGiven that the email is about a personal event and addresses the recipient directly regarding their RSVP, it can be classified as personal.\n\n```\n1\n```",
 "The email is a confirmation of an RSVP for a personal event, specifically Chelsea's 27th Birthday Bash. The sender, Evite, is a service that facilitates invitations for social gatherings, indicating that the context is personal in nature. The tone is friendly and acknowledges the recipient's participation in a social event, which aligns with personal communication.\n\nBased on this analysis, the email is classified as personal.\n\n```\n1\n```",
 'The email is a notification