In [1]:
import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime
import time
import matplotlib.pyplot as plt
import random
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.schema import HumanMessage, AIMessage, SystemMessage, ChatMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema, CommaSeparatedListOutputParser
from langchain_core.output_parsers import JsonOutputParser

import os
# Read the .env file located four directories up, then expose the OpenAI key
# through the process environment.
load_dotenv(dotenv_path='../../../../.env')
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Fail fast with an actionable message: assigning None to os.environ
    # would otherwise raise an opaque "str expected, not NoneType" TypeError.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in ../../../../.env or in the environment."
    )
os.environ['OPENAI_API_KEY'] = api_key



In [5]:
# Load the crawled articles: a JSON object whose keys are article URLs.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('../WTO_crawler/WTO_data_article/all_article_content.json', encoding='utf-8') as f:
    all_article_content = json.load(f)

In [6]:
# Number of top-level entries loaded from the JSON file.
len(all_article_content)

8680

# filter data

In [7]:
# urls = list(all_article_content.keys())

In [8]:
# Keep only the URLs whose article 'date' field contains the text '2023'.
urls = []

for url, article in all_article_content.items():
    try:
        if '2023' in article['date']:
            urls.append(url)
    except (KeyError, TypeError):
        # Entry has no 'date' key, or its date is not something that can be
        # searched (e.g. None): skip it. Anything else (including
        # KeyboardInterrupt) is allowed to propagate instead of being
        # silently swallowed.
        continue

In [9]:
# Number of URLs that passed the 2023 date filter.
len(urls)

532

In [10]:
# Draw the subset of articles that will be sent to the model.
# A dedicated, seeded generator makes the sample identical after a kernel
# restart without touching the global `random` state; the size is capped so
# that a `urls` list shorter than SAMPLE_SIZE does not raise ValueError.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
sampled_urls = random.Random(SAMPLE_SEED).sample(urls, min(SAMPLE_SIZE, len(urls)))

'20240130_120750'

# instruction 1

In [11]:
# Prompt sent for each article: asks for a "yes"/"no" verdict on whether the
# article implies threats to the sovereignty of the United States, followed by
# a one-sentence reason. The {article} and {format_instructions} placeholders
# are filled in when the prompt is formatted.
template_string = """
You will be given an article from the WTO website. You have to determine whether the article implies threats to the sovereignty of 
the United States. 

This is the article: ```{article}```

If the article implies threats to the sovereignty of the United States, please type "yes". If the article does 
not imply threats to the sovereignty of the United States, please type "no". And then give a reason in one sentence. 

{format_instructions}
"""

In [12]:
# Fields the model is asked to return for every article.
response_schemas = [
    ResponseSchema(
        name="isThreat",
        description="response with only yes or no",
    ),
    ResponseSchema(
        name="reason",
        description="One-sentence reason for the response",
    ),
]

# Parser built from the schemas above, plus the formatting instructions it
# generates for insertion into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Chat prompt built from the raw template text.
prompt_template = ChatPromptTemplate.from_template(template_string)

# Chat model client used for all classification calls.
chat_model = ChatOpenAI(
    temperature=0,
    model_name='gpt-4-0125-preview',
)

In [13]:
# sampling
# Classify every sampled article with the chat model and persist the parsed
# answers to ../sampling_<timestamp>.json.
curr_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f'../sampling_{curr_time}.json'
structured_output_dict = {}
# url -> repr(exception) for every article that could not be classified
failed_urls = {}


def _save_results():
    """Write the results gathered so far to ``output_path``, overwriting the file."""
    with open(output_path, 'w') as f:
        json.dump(structured_output_dict, f)


for idx, url in enumerate(tqdm(sampled_urls, desc="Now Processing")):
    try:
        input_content = all_article_content[url]['content']
        input_messages = prompt_template.format_messages(article = input_content, format_instructions=format_instructions)
        response = chat_model.invoke([
                SystemMessage(content = 'You are a polieconomics researcher, focusing on international relations.'),
                input_messages[0]
            ])
        structured_output = output_parser.parse(response.content)
    except Exception as exc:
        # A missing 'content' field, an API error or an unparseable reply for
        # one article must not abort the whole run; record it and continue.
        failed_urls[url] = repr(exc)
    else:
        structured_output_dict[url] = structured_output

    # Checkpoint after the 1st, 11th, 21st, ... article so that an
    # interruption loses at most a handful of results.
    if idx % 10 == 0:
        _save_results()

# Final write so the file contains every result, not just the last checkpoint.
_save_results()

if failed_urls:
    print(f'{len(failed_urls)} of {len(sampled_urls)} articles failed: {list(failed_urls)}')

  warn_deprecated(
Now Processing: 100%|██████████| 100/100 [08:16<00:00,  4.97s/it]


In [19]:
# Distinct 'reason' strings across all parsed model answers.
# NOTE(review): the output saved under this cell lists URLs rather than
# reasons, so it looks stale - re-run the cell to refresh it.
{answer['reason'] for answer in structured_output_dict.values()}

{'https://www.wto.org/english/blogs_e/data_blog_e/blog_dta_08jun23_e.htm',
 'https://www.wto.org/english/blogs_e/data_blog_e/blog_dta_14jul23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/547_585r_e.htm',
 'https://www.wto.org/english/news_e/news23_e/601r_e.htm',
 'https://www.wto.org/english/news_e/news23_e/acc_05oct23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agng_03oct23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agri_05oct23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agri_07nov23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agri_28mar23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agri_31mar23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/agri_31oct23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/aid_12may23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/aid_14nov23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/chair_08jun23_e.htm',
 'https://www.wto.org/english/news_e/news23_e/chp_03feb23_e.

# Playground

In [199]:
import time

# Playground: exercise a tqdm progress bar over a short, artificially slow loop.
n_steps = 100
for step in tqdm(range(n_steps), desc="Outer loop"):
    # A nested bar was tried here and is currently disabled:
    #   for j in tqdm(range(10), desc="Inner loop", leave=False):
    # Stand-in for some time-consuming operation.
    time.sleep(0.01)

Outer loop: 100%|██████████| 100/100 [00:01<00:00, 80.11it/s]


In [218]:
# Playground: walk over `urls` with a progress bar, reading the article body
# for every 80th entry; `idx` ends up holding the number of URLs visited.
idx = 0
for position, url in enumerate(tqdm(urls, desc="Processing")):
    if position % 80 == 0:
        input_content = all_article_content[url]['content']
    # (an optional time.sleep(0.1) was used here to slow the bar down)
    idx = position + 1

Processing: 100%|██████████| 8680/8680 [00:00<00:00, 411076.27it/s]


In [219]:
# Display the counter left behind by the loop in the previous cell.
idx

8680

In [180]:
# NOTE(review): `d` is not assigned in any cell visible in this notebook, so
# this cell presumably depends on leftover kernel state - confirm or remove.
d

{'https://www.wto.org/english/news_e/news19_e/dsb_agenda_22nov19_e.htm': {'isThreat': 'no',
  'reason': 'Proposed items for consideration at the DSB typically involve dispute resolution and trade policy reviews, not direct threats to national sovereignty.'}}

AttributeError: 'dict' object has no attribute 'append'

{'url': 'https://www.wto.org/english/news_e/news19_e/dsb_agenda_22nov19_e.htm',
 'isThreat': 'no',
 'reason': 'Proposed items for consideration at the DSB typically involve dispute resolution and trade policy reviews, not direct threats to national sovereignty.'}

In [56]:
# with get_openai_callback() as cb:
#     result = chat_model([
#                 SystemMessage(content = 'Your are a polieconomic researcher, focusing on international relations.'),
#                 HumanMessage(content = instructions),
#                 HumanMessage(content = text),
#             ])
#     print(cb)

Tokens Used: 359
	Prompt Tokens: 324
	Completion Tokens: 35
Successful Requests: 1
Total Cost (USD): $0.0


In [None]:
# track token usage
# from langchain.callbacks import get_openai_callback
# with get_openai_callback() as cb:
#     result = llm.invoke("Tell me a joke")
#     print(cb)