In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from gpt_prompting import GPTApiHelper
from pydantic import BaseModel
import pandas as pd
import base64
import json

from tooldantic import ToolBaseModel, OpenAiResponseFormatGenerator
from tooldantic import OpenAiResponseFormatBaseModel as BaseModel


In [7]:
api = GPTApiHelper()

In [4]:
class Newsclassification(BaseModel):
    is_crime_news: bool
    perpetrator_name: str
    victim_name: str
    state: str

In [5]:
Newsclassification.model_json_schema()

{'type': 'json_schema',
 'json_schema': {'name': 'Newsclassification',
  'description': '',
  'strict': True,
  'schema': {'type': 'object',
   'properties': {'is_crime_news': {'type': 'boolean'},
    'perpetrator_name': {'type': 'string'},
    'victim_name': {'type': 'string'},
    'state': {'type': 'string'}},
   'required': ['is_crime_news', 'perpetrator_name', 'victim_name', 'state'],
   'additionalProperties': False}}}

In [29]:
sample = pd.read_csv('../data/test_sample_crime.csv')


In [30]:
# shuffle the sample

sample = sample.sample(frac=1,random_state=1).reset_index(drop=True)

In [31]:
sample

Unnamed: 0,summary,index,is_crime
0,Since the Portage Police Department upgraded i...,104986,False
1,Editor's note: This story originally published...,1045833,False
2,These are Khris Middleton's greatest playoff p...,13185,False
3,"Aug. 14, 2021 evening weather update for Kenos...",41670,False
4,SHEBOYGAN - Even if you're lucky enough to get...,74563,False
...,...,...,...
107,A man was arrested after a gun was fired in El...,402709,True
108,Expand\n\nThe nationwide power outages in Vene...,725557,False
109,A 26-year-old man was arrested after destroyin...,913022,True
110,1. Kentucky voters set stage for biggest gover...,113445,False


In [35]:
messages = [
    {"role": "system",
     "content":  """You are an honest classifier. 
     You receive a news excerpt in the user prompt. You should classify if the news is related to a crime or criminal case or not. 
     You should classify the news positively if it is explicetly about a crime taking place or a criminal case as the main topic. 
     You should classify negatively if the text generally mentions crime e.g., in political discussions or if it mentions problematic behavior that does not constitute a crime.
    If it is a crime news, you should also extract the name of the perpetrator and the victim if it is mentioned and set the respective field to None otherwise.
    If the state where the crime took place is explicitely mentioned, you should also extract it and set the respective field to None otherwise.
     Please return for each user prompt an output in the custom Newsclassification format which is inherited from Pydantics Basemodel."""},
]


In [40]:
Newsclassification

{'type': 'json_schema',
 'json_schema': {'name': 'Newsclassification',
  'description': '',
  'strict': True,
  'schema': {'type': 'object',
   'properties': {'is_crime_news': {'type': 'boolean'},
    'perpetrator_name': {'type': 'string', 'default': None},
    'victim_name': {'type': 'string', 'default': None},
    'state': {'type': 'string', 'default': None}},
   'required': ['is_crime_news', 'perpetrator_name', 'victim_name', 'state'],
   'additionalProperties': False}}}

In [94]:
responses = []
for i, row in sample[:10].iterrows():
    message_ = {"role": "user",
               "content": row['summary']}
    messages_ = messages + [message_]
    try:
        response = api.make_request_structured(messages_, schema=Newsclassification.model_json_schema())
        responses.append(response)
    except Exception as e:
        print(e)
        response = None
        responses.append(response)
        continue


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [149]:
len(responses)

112

In [95]:
results = [r.choices[0].message.parsed if r is not None else None for r in responses ] 

In [101]:
responses[0].choices[0].message.content

'{"is_crime_news":false,"perpetrator_name":"" ,"victim_name":"" ,"state":""}'

In [151]:
sample['is_crime_news'] = [r.is_crime_news if r.is_crime_news is not None else None for r in results ]
sample['perpetrator_name'] = [r.perpetrator_name if r.perpetrator_name is not None else None for r in results ]
sample['victim_name'] = [r.victim_name if r.victim_name is not None else None for r in results ]
sample['city'] = [r.city if r.city is not None else None for r in results ]
sample['state'] = [r.state if r.state is not None else None for r in results ]
sample['county'] = [r.county if r.county is not None else None for r in results ]

In [153]:
sample.to_csv('../data/sample_crime_results.csv', index=False)

In [156]:
accuracy = sample['is_crime_news'].eq(sample['is_crime']).mean()
false_positive_rate = sample[(sample['is_crime_news'] == True) & (sample['is_crime'] == False)].shape[0] / sample[sample['is_crime_news'] == True].shape[0]
false_negative_rate = sample[(sample['is_crime_news'] == False) & (sample['is_crime'] == True)].shape[0] / sample[sample['is_crime_news'] == False].shape[0]

In [157]:
print(f'Accuracy: {accuracy}, False Positive Rate: {false_positive_rate}, False Negative Rate: {false_negative_rate}')

Accuracy: 0.9196428571428571, False Positive Rate: 0.24324324324324326, False Negative Rate: 0.0


In [117]:
print(sample.iloc[24].summary)

Skip to content The New York Daily News Logo 1989 Exxon Valdez oil spill On Mar. 24, 1989, nearly 11 million gallons of crude oil gushed out of the Exxon Valdez oil tanker off the coast of Alaska. Take a look back at what was the largest oil spill in U.S. history at the time on the 33rd anniversary of the disaster. Tugboats hold the tanker Exxon Baton Rouge, right, up against the tanker Exxon Valdez as oil is pumped out of the damaged tanker that ran aground 25 miles from Valdez, Alaska, on Mar. 24, 1989, spilling over 270,000 barrels of crude oil. (Jack Smith/AP) A rescued sea otter is restrained and washed by workers at a local animal facility after five of the oil-covered mammals were captured in the fouled waters of Prince William Sound, Alaska, April 18, 1989. The list of animals injured and killed from the spill of the oil tanker Exxon Valdez includes sea otters, deer, eagles, owls and a host of other waterfowl gathered up by rescue workers. (John Gaps III/AP) Crude oil from the 

In [154]:
api.estimate_cost()

0.03686

# push to bash api 



In [6]:
news_sample_20k = pd.read_csv('../data/news_wisconsin_sample_20k.csv')

In [8]:
news_sample_20k["excerpt_summary"] = news_sample_20k["excerpt"].fillna("") + " \n " + news_sample_20k["summary"].fillna("")

In [9]:
system_message = [
    {"role": "system",
     "content":  """You are an honest classifier. 
     You receive a news excerpt in the user prompt. You should classify if the news is related to a crime or criminal case or not. 
     You should classify the news positively if it is explicetly about a crime taking place or a criminal case as the main topic. 
     You should classify negatively if the text generally mentions crime e.g., in political discussions or if it mentions problematic behavior that does not constitute a crime.
    If it is a crime news, you should also extract the name of the perpetrator and the victim if it is mentioned and set the respective field to None otherwise.
    If the state where the crime took place is explicitely mentioned, you should also extract it and set the respective field to None otherwise.
     Please return for each user prompt an output in the custom Newsclassification format which is inherited from Pydantics Basemodel."""},
]


In [10]:
class Newsclassification(BaseModel):
    is_crime_news: bool
    perpetrator_name: str
    victim_name: str
    state: str

In [70]:
Newsclassification.schema()

{'type': 'json_schema',
 'json_schema': {'name': 'Newsclassification',
  'description': '',
  'strict': True,
  'schema': {'type': 'object',
   'properties': {'is_crime_news': {'type': 'boolean'},
    'perpetrator_name': {'type': 'string'},
    'victim_name': {'type': 'string'},
    'state': {'type': 'string'}},
   'required': ['is_crime_news', 'perpetrator_name', 'victim_name', 'state'],
   'additionalProperties': False}}}

In [12]:
prompts_list = []
for i, row in news_sample_20k.iterrows():
    user_messsage = {"role": "user",
               "content": row['excerpt_summary']}
    prompt = system_message + [user_messsage]
    prompts_list.append(prompt)



In [18]:
jsonl = api.create_jsonl(prompts_list, schema=Newsclassification, custom_id_prefix="news_wisconsin_sample_2019_20k_")


In [52]:
jsonl[0]["body"]["response_format"]

__main__.Newsclassification

In [20]:
# save jsonl as .jsonl file

with open('../data/news_wisconsin_sample_2019_20k.jsonl', 'w') as jsonl_file:
    for item in jsonl:
        # Convert each dictionary to a JSON string and write it as a new line
        jsonl_file.write(json.dumps(item) + '\n')


In [21]:
with open('../data/news_wisconsin_sample_2019_20k.jsonl', 'rb') as jsonl_file:
    jsonl = [json.loads(line) for line in jsonl_file]



In [22]:
len(jsonl)

20000

In [23]:
upload_res = api.upload_jsonl('../data/news_wisconsin_sample_2019_20k.jsonl')

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


'file-UsGxzPWMckr8hlIUNMBH2iKz'

In [29]:
start_bash_res = api.create_batch(upload_res.id, "first 20k news wisconsin sample 2019")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


In [32]:
start_bash_res.id

'batch_671fb42025048190934d79e68a0d2c20'

In [33]:
api.check_batch_status(start_bash_res.id)

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_671fb42025048190934d79e68a0d2c20 "HTTP/1.1 200 OK"


Batch(id='batch_671fb42025048190934d79e68a0d2c20', completion_window='24h', created_at=1730130976, endpoint='/v1/chat/completions', input_file_id='file-UsGxzPWMckr8hlIUNMBH2iKz', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1730217376, failed_at=None, finalizing_at=None, in_progress_at=1730130989, metadata={'description': 'first 20k news wisconsin sample 2019'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=20000))

In [111]:
api.get_file_content("file-gxbMN0aMnFgcPDTf5Zrzm8RN")

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/files/file-gxbMN0aMnFgcPDTf5Zrzm8RN/content "HTTP/1.1 200 OK"


<openai._legacy_response.HttpxBinaryResponseContent at 0x10ca284d0>

# Check status

In [22]:
batch_result = api.get_file_content(api.check_batch_status("batch_671fb42025048190934d79e68a0d2c20").output_file_id)

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_671fb42025048190934d79e68a0d2c20 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/files/file-eV10NS7mumsmtjNXgrVmwRff/content "HTTP/1.1 200 OK"


In [23]:
with open('../data/news_wisconsin_sample_2019_20k_results_p1.jsonl', 'w') as jsonl_file:
    jsonl_file.write(batch_result.text)

In [None]:
with open('../data/news_wisconsin_sample_2019_20k_results_p1.jsonl', 'rb') as jsonl_file:
    jsonl = [json.loads(line) for line in jsonl_file]

    

In [46]:
jsonl[0]["response"]["body"]["choices"][0]["message"]["content"]

'{"is_crime_news":false,"perpetrator_name":"" ,"victim_name":"" ,"state":""}'

In [54]:
res_df = api.convert_response_list(jsonl)

In [101]:
res_df.is_crime_news.mean(), res_df.perpetrator_name.isna().mean(), res_df.victim_name.isna().mean(), res_df.state.isna().mean()

(np.float64(0.2128),
 np.float64(0.8149),
 np.float64(0.84965),
 np.float64(0.6713))

In [114]:
(res_df[res_df.is_crime_news].state == "Wisconsin").mean()

np.float64(0.3094454887218045)

In [68]:
api.estimate_cost(jsonl, is_batch=True)

1.44708

# Try to match with Wisoncion data


In [80]:
names_df = pd.read_csv('../data/names_def_2015_2019.csv')

In [82]:
res_df["perpetrator_last_name"]= res_df["perpetrator_name"].str.split(" ").str[-1]
res_df["perpetrator_first_name"]= res_df["perpetrator_name"].str.split(" ").str[0]

In [None]:
res_df.reset_index(inplace=True)
res_df.rename(columns={"index": "id"}, inplace=True)

In [89]:
res_df[~res_df["perpetrator_last_name"].isna()]

Unnamed: 0,id,is_crime_news,perpetrator_name,victim_name,state,perpetrator_last_name,perpetrator_first_name
15,15,True,Dharmesh Patel,,Illinois,Patel,Dharmesh
16,16,True,Vinicius Porto,23-year-old woman,New Jersey,Porto,Vinicius
24,24,True,Matt Lauer,Brooke Nevils,Russia,Lauer,Matt
31,31,True,Daniel Pantaleo,Eric Garner,New York,Pantaleo,Daniel
33,33,True,Angelo Nesimi,Michael Stewart,New York,Nesimi,Angelo
...,...,...,...,...,...,...,...
19964,19964,True,Brinelle Nabors,Park High School student,Wisconsin,Nabors,Brinelle
19969,19969,True,Jake Patterson,Jayme Closs,Wisconsin,Patterson,Jake
19975,19975,True,Dmitriy N. Andreychenko,,Missouri,Andreychenko,Dmitriy
19983,19983,True,David Terrell,Pedro Hernandez,New York,Terrell,David


In [None]:
res_df["perpetrator_name"].drop_duplicates().notna().sum()/20000

np.float64(0.0952)

In [140]:
res_df["victim_name"].drop_duplicates().notna().sum()/20000

np.float64(0.1069)

In [None]:
# exact match merge
(~names_df.merge(res_df, left_on=["last_name", "first_name"], right_on=["perpetrator_last_name", "perpetrator_first_name"], how="left").id.isna()).sum()


np.int64(4957)

In [124]:
names_df_merged = names_df.merge(res_df, left_on=["last_name", "first_name"], right_on=["perpetrator_last_name", "perpetrator_first_name"], how="left")

In [125]:
names_df_merged[names_df_merged.id.notna()]
names_df_merged.drop_duplicates(subset="id").id.notna().sum()
# this is more than there are reports about crimes. The middle names seem to matter as well 

np.int64(908)

In [None]:
908/res_df.perpetrator_name.notna().sum()

np.float64(0.24527282549972987)

In [95]:
res_df["perpetrator_middle_name"]= res_df["perpetrator_name"].apply(lambda x: " ".join(x.split(" ")[1:-1]) if (x is not None and len(x.split(" ")) > 2) else None)
res_df["perpetrator_middle_name"]= res_df["perpetrator_middle_name"].str.removesuffix(".")

In [None]:
(~names_df.merge(res_df, left_on=["last_name", "first_name", "middle_name"], right_on=["perpetrator_last_name", "perpetrator_first_name", "perpetrator_middle_name"], how="left").id.isna()).sum()
# even then a third of the names matched exactly

np.int64(1150)

In [98]:
names_df_merged = names_df.merge(res_df, left_on=["last_name", "first_name", "middle_name"], right_on=["perpetrator_last_name", "perpetrator_first_name", "perpetrator_middle_name"], how="left")

In [123]:
names_df_merged.drop_duplicates(subset="id").id.notna().sum()

np.int64(394)