In [1]:
import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime
import time
import matplotlib.pyplot as plt
import random
import tiktoken
import requests
import ast
import logging
from dotenv import load_dotenv

from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.schema import HumanMessage, AIMessage, SystemMessage, ChatMessage
from langchain.prompts import MessagesPlaceholder, PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema, CommaSeparatedListOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from openai import RateLimitError, APIError, APIConnectionError, APITimeoutError, InternalServerError

import os
load_dotenv(dotenv_path='../../../../.env')
api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = api_key



# Load Data

In [3]:
# # URL of the JSON file
# url = 'https://raw.githubusercontent.com/Yung-Chun/WTO-Event-Extraction/main/training_data/wto_eng_docs.json'

# try:
#     # Fetch the data from the URL
#     response = requests.get(url)
    
#     # Check if the request was successful (status code 200)
#     if response.status_code == 200:
        
#         # Parse JSON data
#         wto_eng_doc = response.json()
#         print("Seccessfully fetched data. Status code:", response.status_code)
        
#     else:
#         print("Failed to fetch data. Status code:", response.status_code)

# except requests.exceptions.RequestException as e:
#     print("Error fetching data:", e)

KeyboardInterrupt: 

In [2]:
# read from local file
with open('../WTO-Event-Extraction/training_data/wto_eng_docs.json') as f:
    wto_eng_doc = json.load(f)

In [3]:
len(wto_eng_doc)

8296

In [4]:
urls = list(wto_eng_doc.keys())

In [5]:
test_url = urls[0]
test_doc = wto_eng_doc[test_url]
test_doc

{'date': '29 NOVEMBER 2021',
 'doc': "WTO Director-General Ngozi Okonjo Iweala and International Finance Corporation (IFC) Managing Director Makhtar Diop agreed on 29 November to enhance cooperation between the two organizations in order to explore ways to improve the availability of trade financing for regions in need. In a joint statement, the two pledged to enhance existing cooperation to improve the analytics, identification and detection of trade finance gaps in order to better direct capacity building and other resources where unmet demand is greatest, particularly in Africa.\n“Our developing country members regularly identify a lack of trade finance as a major obstacle to participating in global trade — all the more so for micro, small and medium-sized enterprises, and businesses led by women,” DG Okonjo-Iweala said. “Working together, experts from our two organizations will be able to better analyse, detect and explain trade finance gaps, with a view to directing finite resourc

# Count Tokens

In [6]:
model_name='gpt-4-0125-preview'
# model_name='gpt-3.5-turbo-0125' # for testing

In [7]:
def calculate_tokens(string: str, model_name: str) -> int:
    encoding = tiktoken.get_encoding("cl100k_base")
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [91]:
max_url = ''
max_token_cnt = 0

for url in wto_eng_doc.keys():
    content = wto_eng_doc[url]['doc']
    num_tokens = calculate_tokens(content, model_name)

    if num_tokens > max_token_cnt:
        max_url = url
        max_token_cnt =num_tokens

print(max_url)
print(max_token_cnt)

https://www.wto.org/english/news_e/pres95_e/ov11.htm
22206


In [None]:
min_token_cnt = max_token_cnt
for url in wto_eng_doc.keys():
    content = wto_eng_doc[url]['doc']
    num_tokens = calculate_tokens(content, model_name)
    min_token_cnt = min(num_tokens, max_token_cnt)

print(min_token_cnt)

148


# Prompt Format

In [7]:
template_string = """
This is the article: ```{article}```

If the article shows the United States' disappointments, please type "yes", else type "no" for the key isDisappointment.
If the article contains complaints about the United States from other countries, please type "yes", else type "no" for the key isComplain.
If the article contains criticism of the United States from the WTO, please type "yes", else type "no" for the key isCritic.
If the article implies that there is a need for a policy change within the United States, please type "yes", else type "no" for the key isAffect.

And then give a reason in one sentence to all keys. 

{format_instructions}
"""

In [8]:
response_schemas = [
    ResponseSchema(name="isDisappointment", description="response with only yes or no"),
    ResponseSchema(name="isDisappointment_reason", description="One-sentence reason for the response"),
    ResponseSchema(name="isComplain", description="response with only yes or no"),
    ResponseSchema(name="isComplain_reason", description="One-sentence reason for the response"),
    ResponseSchema(name="isCritic", description="response with only yes or no"),
    ResponseSchema(name="isCritic_reason", description="One-sentence reason for the response"),
    ResponseSchema(name="isAffect", description="response with only yes or no"),
    ResponseSchema(name="isAffect_reason", description="One-sentence reason for the response")
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()    

prompt_template = ChatPromptTemplate.from_template(template_string)

chat_model = ChatOpenAI(temperature=0, model_name=model_name)

In [9]:
print(prompt_template)

input_variables=['article', 'format_instructions'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['article', 'format_instructions'], template='\nThis is the article: ```{article}```\n\nIf the article shows the United States\' disappointments, please type "yes", else type "no" for the key isDisappointment.\nIf the article contains complaints about the United States from other countries, please type "yes", else type "no" for the key isComplain.\nIf the article contains criticism of the United States from the WTO, please type "yes", else type "no" for the key isCritic.\nIf the article implies that there is a need for a policy change within the United States, please type "yes", else type "no" for the key isAffect.\n\nAnd then give a reason in one sentence to all keys. \n\n{format_instructions}\n'))]


# Get Results

In [10]:
def save_output(structured_output_dict):
    with open(f'threat_detection_output.json', 'w') as f:
        json.dump(structured_output_dict, f)

In [11]:
#  RateLimitError, APIError, APIConnectionError, APITimeoutError, InternalServerError
def my_chat(url):
    try:
        input_content = wto_eng_doc[url]['doc']
        input_messages = prompt_template.format_messages(article = input_content, format_instructions=format_instructions)

        response = chat_model.invoke([
            SystemMessage(content='You are a political economics researcher, focusing on international relations.'),
            input_messages[0],
        ],
        max_tokens=150
        )

        structured_output = output_parser.parse(response.content)
        return structured_output

    except Exception as e:
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        log_message_dict = {
            'Input': url,
            'Time': current_time,
            'Error': type(e).__name__,
            'Message': str(e)
        }

        log_message = str(log_message_dict)
        logging.error(log_message)
        return None

In [12]:
# create looging file
logging.basicConfig(level=logging.ERROR, filename='gpt_errors.log', filemode='a', format='%(message)s')

In [13]:
# Open the log file and read its contents
error_url = set()

with open('gpt_errors.log', 'r') as file:
    for line in file:
        dict_obj = ast.literal_eval(line.strip())
        error_url.add(dict_obj['Input'])

print(f'Number of failed cases: {len(error_url)}')

Number of failed cases: 493


In [14]:
idx = 0
start_time = datetime.now()

file_name = 'threat_detection_output.json'
if not os.path.exists(file_name):
    with open(file_name, 'w') as f:
        json.dump({}, f)
else:
    with open(file_name) as f:
        structured_output_dict = json.load(f)
        print(f'Loaded {len(structured_output_dict)} records')

with get_openai_callback() as cb:
    
    for url in tqdm(urls, desc='Now Processing'):
        if url not in structured_output_dict.keys() and url not in error_url:
            
            structured_output = my_chat(url)
            if structured_output is not None:
                structured_output_dict.update({url:structured_output})
                idx += 1

            if idx % 10 == 0:
                save_output(structured_output_dict)
        
            time.sleep(random.randint(1, 5))

save_output(structured_output_dict)

end_time = datetime.now()
print(f'used {cb} tokens and took {end_time - start_time} to process {len(urls)} articles')

Loaded 5126 records


Now Processing: 100%|██████████| 8296/8296 [7:26:54<00:00,  3.23s/it]  

used Tokens Used: 4544861
	Prompt Tokens: 4188536
	Completion Tokens: 356325
Successful Requests: 2677
Total Cost (USD): $0.0 tokens and took 7:26:54.970857 to process 8296 articles





In [17]:
save_output(structured_output_dict)
len(structured_output_dict)

5126

In [15]:
print(my_chat('https://www.wto.org/english/tratop_e/dispu_e/cases_e/ds360_e.htm'))

{'isDisappointment': 'no', 'isDisappointment_reason': "The article does not explicitly express the United States' disappointment regarding the WTO's decisions.", 'isComplain': 'yes', 'isComplain_reason': 'The United States requested consultations with India regarding additional duties, indicating a complaint.', 'isCritic': 'no', 'isCritic_reason': 'The article does not contain any direct criticism of the United States from the WTO.', 'isAffect': 'no', 'isAffect_reason': "The article does not imply a need for policy change within the United States; it focuses on India's import duties."}
