In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from datetime import datetime
import time
import matplotlib.pyplot as plt
import tiktoken
import requests
import ast
import zipfile
import io 



## Threat Detection Output

In [10]:
with open('threat_detection_output.json') as f:
        threat_detection_output = json.load(f)
        print(f'Loaded {len(threat_detection_output)} records')

Loaded 7598 records


In [11]:
threat_urls = list(threat_detection_output.keys())

In [57]:
threat_cnt = 0

for url in threat_detection_output.keys():
    if 'yes' in threat_detection_output[url].values():
        threat_cnt += 1

print(f'There are {threat_cnt} articles with threats')
print(f'The proportion of articles with threats is {threat_cnt/len(threat_detection_output)}')

There are 891 articles with threats
The proportion of articles with threats is 0.11726770202684918


## Event Extraction Output

### Document info

In [6]:
document_info_merge = pd.read_csv('../WTO-Event-Extraction/results/full_wto_bertopic_document_info_merge_20240329_213505.csv')
print(f'length of merged_doc_info: {len(document_info_merge)}')
document_info_merge.head(3)

length of merged_doc_info: 8296


Unnamed: 0,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,url,date
0,19,19_vaccine_dose_disaster_covax,"['vaccine', 'dose', 'disaster', 'covax', 'supp...",['deputy director general angela ellard discus...,vaccine - dose - disaster - covax - supply - c...,0.059756,False,https://www.wto.org/english/news_e/news21_e/ig...,2021-11-29
1,11,11_adjudication_litigation_satisfactory_consul...,"['adjudication', 'litigation', 'satisfactory',...",['european union request wto dispute consultat...,adjudication - litigation - satisfactory - con...,0.447425,False,https://www.wto.org/english/news_e/news18_e/ds...,2018-08-27
2,-1,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...",pandemic - gatt - shall - panel - request - ge...,0.231426,False,https://www.wto.org/english/news_e/spra_e/spra...,2018-03-19


In [7]:
document_info_merge.columns

Index(['Topic', 'Name', 'Representation', 'Representative_Docs', 'Top_n_words',
       'Probability', 'Representative_document', 'url', 'date'],
      dtype='object')

In [12]:
threat_events = []
columns = ['Topic', 'Name', 'url', 'date', 'isThreat', 'ThreatCount', 
           'isDisappointment', 'isDisappointment_reason', 'isComplain', 'isComplain_reason', 
           'isCritic', 'isCritic_reason', 'isAffect', 'isAffect_reason']

for row in document_info_merge.itertuples():
    res = [row.Topic, row.Name, row.url, row.date]

    if row.url in threat_urls:
        threat_res = list(threat_detection_output[row.url].values())
        
        # add boolean result
        if 'yes' in threat_res:
            res += [True]
        else:
            res += [False]
        
        # make it a scale
        res += [threat_res.count('yes')]
        # add meta info
        res += threat_res
    
    # for not-done urls or urls in gpt_errors.log
    else:
        res += [None] * 10

    threat_events.append(res)

threat_events = pd.DataFrame(threat_events, columns=columns)
print(f'length of threat_events: {len(threat_events)}')
threat_events.head(3)

length of threat_events: 8296


Unnamed: 0,Topic,Name,url,date,isThreat,ThreatCount,isDisappointment,isDisappointment_reason,isComplain,isComplain_reason,isCritic,isCritic_reason,isAffect,isAffect_reason
0,19,19_vaccine_dose_disaster_covax,https://www.wto.org/english/news_e/news21_e/ig...,2021-11-29,False,0.0,no,The article does not express any disappointmen...,no,There are no complaints about the United State...,no,The article does not contain any criticism of ...,no,The article does not imply a need for policy c...
1,11,11_adjudication_litigation_satisfactory_consul...,https://www.wto.org/english/news_e/news18_e/ds...,2018-08-27,True,2.0,no,The article does not express any emotions such...,yes,China's request for consultations under the WT...,no,The article does not contain any direct critic...,yes,The initiation of a dispute suggests that ther...
2,-1,-1_pandemic_gatt_shall_panel,https://www.wto.org/english/news_e/spra_e/spra...,2018-03-19,True,3.0,no,The article does not explicitly express any di...,yes,The article mentions concerns over protectioni...,yes,The article criticizes the rise of protectioni...,yes,The call for resolving trade tensions and stre...


In [101]:
threat_events.to_csv('threat_events_document_info_merge.csv', index=False)

### Topic info

In [3]:
topic_info_merge = pd.read_csv('../WTO-Event-Extraction/results/full_wto_bertopic_topic_info_merge_20240329_213505.csv')
print(f'length of merged_topic_info: {len(topic_info_merge)}')
topic_info_merge.head(3)

length of merged_topic_info: 156


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Representation_n1-2
0,-1,1859,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...","['panel', 'request', 'general', 'multilateral'..."
1,0,537,0_climate_environmental_food_energy,"['climate', 'environmental', 'food', 'energy',...",['day wto public forum october panel workshop ...,"['climate', 'environmental', 'food', 'climate ..."
2,1,283,1_university_rtpc_essay_academic,"['university', 'rtpc', 'essay', 'academic', 'c...",['wto issue young economist submit paper wto e...,"['university', 'rtpc', 'academic', 'essay', 'c..."


In [4]:
total_article = np.sum(topic_info_merge['Count'])

In [14]:
threat_events.head(3)

Unnamed: 0,Topic,Name,url,date,isThreat,ThreatCount,isDisappointment,isDisappointment_reason,isComplain,isComplain_reason,isCritic,isCritic_reason,isAffect,isAffect_reason
0,19,19_vaccine_dose_disaster_covax,https://www.wto.org/english/news_e/news21_e/ig...,2021-11-29,False,0.0,no,The article does not express any disappointmen...,no,There are no complaints about the United State...,no,The article does not contain any criticism of ...,no,The article does not imply a need for policy c...
1,11,11_adjudication_litigation_satisfactory_consul...,https://www.wto.org/english/news_e/news18_e/ds...,2018-08-27,True,2.0,no,The article does not express any emotions such...,yes,China's request for consultations under the WT...,no,The article does not contain any direct critic...,yes,The initiation of a dispute suggests that ther...
2,-1,-1_pandemic_gatt_shall_panel,https://www.wto.org/english/news_e/spra_e/spra...,2018-03-19,True,3.0,no,The article does not explicitly express any di...,yes,The article mentions concerns over protectioni...,yes,The article criticizes the rise of protectioni...,yes,The call for resolving trade tensions and stre...


In [15]:
# calculate ThreatRate
threat_events_topic = threat_events[['Topic', 'isThreat', 'ThreatCount']].groupby('Topic').agg({'isThreat': 'sum'}).reset_index()
threat_events_topic.fillna(0, inplace=True)

for row in threat_events_topic.itertuples():
#     # per topic
#     if row.ThreatCount != 0:
#         threat_events_topic.at[row.Index, 'LocalThreatRate'] = row.isThreat / row.Count
#     else:
#         threat_events_topic.at[row.Index, 'LocalThreatRate'] = 0

    # global
    threat_events_topic.at[row.Index, 'GlobalThreatRate'] = row.isThreat / total_article

threat_events_topic.head(3)

Unnamed: 0,Topic,isThreat,GlobalThreatRate
0,-1,219,0.026398
1,0,90,0.010849
2,1,0,0.0


In [21]:
threat_events_topic_info_merge = pd.merge(threat_events_topic, topic_info_merge, on='Topic', how='left')
threat_events_topic_info_merge['LocalThreatRate'] = threat_events_topic_info_merge['isThreat'] / threat_events_topic_info_merge['Count']
threat_events_topic_info_merge = threat_events_topic_info_merge.reindex(columns=['Topic', 'Name', 'isThreat', 'Count',
                                                                                 'LocalThreatRate', 'GlobalThreatRate', 
                                                                                 'Name', 'Representation', 'Representative_Docs', 'Representation_n1-2'])
print(f'length of threat_events_topic_info_merge: {len(threat_events_topic_info_merge)}')
threat_events_topic_info_merge.head(3)

length of threat_events_topic_info_merge: 156


Unnamed: 0,Topic,Name,isThreat,Count,LocalThreatRate,GlobalThreatRate,Name.1,Representation,Representative_Docs,Representation_n1-2
0,-1,-1_pandemic_gatt_shall_panel,219,1859,0.117805,0.026398,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...","['panel', 'request', 'general', 'multilateral'..."
1,0,0_climate_environmental_food_energy,90,537,0.167598,0.010849,0_climate_environmental_food_energy,"['climate', 'environmental', 'food', 'energy',...",['day wto public forum october panel workshop ...,"['climate', 'environmental', 'food', 'climate ..."
2,1,1_university_rtpc_essay_academic,0,283,0.0,0.0,1_university_rtpc_essay_academic,"['university', 'rtpc', 'essay', 'academic', 'c...",['wto issue young economist submit paper wto e...,"['university', 'rtpc', 'academic', 'essay', 'c..."


In [22]:
threat_events_topic_info_merge.to_csv('threat_events_topic_info_merge.csv', index=False)