In [44]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from datetime import datetime
import time
import plotly.express as px
import matplotlib.pyplot as plt
import tiktoken
import requests
import ast
import zipfile
import io 

## Threat Detection Output

In [2]:
# Read the threat-detection results from disk and report how many records were loaded.
with open('threat_detection_output.json') as json_file:
    threat_detection_output = json.load(json_file)

print(f'Loaded {len(threat_detection_output)} records')

Loaded 7598 records


In [3]:
# Iterating a dict yields its keys, i.e. every URL that has a threat-detection record.
threat_urls = [url for url in threat_detection_output]

In [4]:
# An article is counted as a threat when at least one of its record values equals 'yes'.
threat_cnt = sum(
    1
    for verdicts in threat_detection_output.values()
    if 'yes' in verdicts.values()
)

print(f'There are {threat_cnt} articles with threats')
threat_share = threat_cnt / len(threat_detection_output)
print(f'The proportion of articles with threats is {threat_share}')

There are 891 articles with threats
The proportion of articles with threats is 0.11726770202684918


## Event Extraction Output

### Document info

In [5]:
# Load the merged per-document BERTopic info and preview the first rows.
document_info_path = '../WTO-Event-Extraction/results/full_wto_bertopic_document_info_merge_20240329_213505.csv'
document_info_merge = pd.read_csv(document_info_path)

print(f'length of merged_doc_info: {len(document_info_merge)}')
document_info_merge.head(3)

length of merged_doc_info: 8296


Unnamed: 0,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,url,date
0,19,19_vaccine_dose_disaster_covax,"['vaccine', 'dose', 'disaster', 'covax', 'supp...",['deputy director general angela ellard discus...,vaccine - dose - disaster - covax - supply - c...,0.059756,False,https://www.wto.org/english/news_e/news21_e/ig...,2021-11-29
1,11,11_adjudication_litigation_satisfactory_consul...,"['adjudication', 'litigation', 'satisfactory',...",['european union request wto dispute consultat...,adjudication - litigation - satisfactory - con...,0.447425,False,https://www.wto.org/english/news_e/news18_e/ds...,2018-08-27
2,-1,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...",pandemic - gatt - shall - panel - request - ge...,0.231426,False,https://www.wto.org/english/news_e/spra_e/spra...,2018-03-19


In [6]:
# Show the column labels of the merged document-info frame.
document_info_merge.columns

Index(['Topic', 'Name', 'Representation', 'Representative_Docs', 'Top_n_words',
       'Probability', 'Representative_document', 'url', 'date'],
      dtype='object')

In [7]:
# Build one row per document: four BERTopic metadata fields followed by the
# threat-detection fields (boolean flag, level, count, then the raw record values).
columns = ['Topic', 'Name', 'url', 'date', 'isThreat', 'ThreatLevel', 'ThreatCount',
           'isDisappointment', 'isDisappointment_reason', 'isComplain', 'isComplain_reason',
           'isCritic', 'isCritic_reason', 'isAffect', 'isAffect_reason']

# Number of columns that must be filled for a document without a threat record.
# Derived from `columns` so every row always has exactly len(columns) entries
# instead of relying on pandas to pad short rows.
n_meta_columns = 4  # Topic, Name, url, date
n_threat_columns = len(columns) - n_meta_columns

threat_events_document = []

for row in document_info_merge.itertuples():
    res = [row.Topic, row.Name, row.url, row.date]

    # Dict membership is a constant-time lookup; the keys are the same URLs
    # that `threat_urls` lists.
    if row.url in threat_detection_output:
        threat_res = list(threat_detection_output[row.url].values())
        yes_count = threat_res.count('yes')

        # boolean result, then a 1-based scale (ThreatLevel) and the raw count (ThreatCount)
        res += [yes_count > 0, yes_count + 1, yes_count]
        # add meta info: the record values in their stored order
        # NOTE(review): assumes each record holds exactly the eight yes/no + reason
        # values in the order named in `columns` - confirm against the JSON producer.
        res += threat_res

    # for not-done urls or urls in gpt_errors.log
    else:
        res += [None] * n_threat_columns

    threat_events_document.append(res)

threat_events_document = pd.DataFrame(threat_events_document, columns=columns)
print(f'length of threat_events: {len(threat_events_document)}')
threat_events_document.head(3)

length of threat_events: 8296


Unnamed: 0,Topic,Name,url,date,isThreat,ThreatLevel,ThreatCount,isDisappointment,isDisappointment_reason,isComplain,isComplain_reason,isCritic,isCritic_reason,isAffect,isAffect_reason
0,19,19_vaccine_dose_disaster_covax,https://www.wto.org/english/news_e/news21_e/ig...,2021-11-29,False,1.0,0.0,no,The article does not express any disappointmen...,no,There are no complaints about the United State...,no,The article does not contain any criticism of ...,no,The article does not imply a need for policy c...
1,11,11_adjudication_litigation_satisfactory_consul...,https://www.wto.org/english/news_e/news18_e/ds...,2018-08-27,True,3.0,2.0,no,The article does not express any emotions such...,yes,China's request for consultations under the WT...,no,The article does not contain any direct critic...,yes,The initiation of a dispute suggests that ther...
2,-1,-1_pandemic_gatt_shall_panel,https://www.wto.org/english/news_e/spra_e/spra...,2018-03-19,True,4.0,3.0,no,The article does not explicitly express any di...,yes,The article mentions concerns over protectioni...,yes,The article criticizes the rise of protectioni...,yes,The call for resolving trade tensions and stre...


In [8]:
# Persist the per-document threat table as CSV, without writing the row index.
threat_events_document.to_csv('threat_events_document_info_merge.csv', index=False)

### Topic info

In [9]:
# Load the merged per-topic BERTopic info and preview the first rows.
topic_info_path = '../WTO-Event-Extraction/results/full_wto_bertopic_topic_info_merge_20240329_213505.csv'
topic_info_merge = pd.read_csv(topic_info_path)

print(f'length of merged_topic_info: {len(topic_info_merge)}')
topic_info_merge.head(3)

length of merged_topic_info: 156


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Representation_n1-2
0,-1,1859,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...","['panel', 'request', 'general', 'multilateral'..."
1,0,537,0_climate_environmental_food_energy,"['climate', 'environmental', 'food', 'energy',...",['day wto public forum october panel workshop ...,"['climate', 'environmental', 'food', 'climate ..."
2,1,283,1_university_rtpc_essay_academic,"['university', 'rtpc', 'essay', 'academic', 'c...",['wto issue young economist submit paper wto e...,"['university', 'rtpc', 'academic', 'essay', 'c..."


In [16]:
# Number of rows (documents) in the per-document threat table.
len(threat_events_document)

8296

In [36]:
# Denominator for the global rates: every row of the per-document table.
total_doc = len(threat_events_document)

# Per-topic sum of the isThreat flag, with missing sums replaced by 0.
threat_events_topic = (
    threat_events_document[['Topic', 'isThreat']]
    .groupby('Topic')
    .agg({'isThreat': 'sum'})
    .reset_index()
    .fillna(0)
)

# GlobalThreatRate: the topic's threat documents relative to all documents.
threat_events_topic['GlobalThreatRate'] = threat_events_topic['isThreat'] / total_doc

# LocalThreatRate: the topic's threat documents relative to the topic's own Count,
# which comes from topic_info_merge through a left join on Topic.
threat_events_topic_info_merge = threat_events_topic.merge(topic_info_merge, on='Topic', how='left')
threat_events_topic_info_merge['LocalThreatRate'] = (
    threat_events_topic_info_merge['isThreat'] / threat_events_topic_info_merge['Count']
)

threat_events_topic_info_merge.head(3)

Unnamed: 0,Topic,isThreat,GlobalThreatRate,Count,Name,Representation,Representative_Docs,Representation_n1-2,LocalThreatRate
0,-1,219,0.026398,1859,-1_pandemic_gatt_shall_panel,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","[""virtual address washington international tra...","['panel', 'request', 'general', 'multilateral'...",0.117805
1,0,90,0.010849,537,0_climate_environmental_food_energy,"['climate', 'environmental', 'food', 'energy',...",['day wto public forum october panel workshop ...,"['climate', 'environmental', 'food', 'climate ...",0.167598
2,1,0,0.0,283,1_university_rtpc_essay_academic,"['university', 'rtpc', 'essay', 'academic', 'c...",['wto issue young economist submit paper wto e...,"['university', 'rtpc', 'academic', 'essay', 'c...",0.0


In [33]:
# Per-topic counts and rates for each individual threat category.
threat_flags = ['isComplain', 'isCritic', 'isAffect', 'isDisappointment']

# Encode the yes/no verdicts as 1/0 so they can be summed per topic.
processed_data = threat_events_document.replace({flag: {'yes': 1, 'no': 0} for flag in threat_flags})

summary_by_topic = processed_data.groupby('Topic', as_index=False).agg({flag: 'sum' for flag in threat_flags})
summary_by_topic = summary_by_topic.rename(columns={flag: f'{flag}Count' for flag in threat_flags})

# Look up each topic's size by its Topic key in topic_info_merge. This avoids
# dividing by a column of another frame that merely happens to share the same
# row order, and keeps this cell runnable after the later cell that renames
# 'Count' to 'Size' on threat_events_topic_info_merge.
topic_size = summary_by_topic['Topic'].map(topic_info_merge.set_index('Topic')['Count'])

# LocalRate for each defined threat: count relative to the topic's own size
for flag in threat_flags:
    summary_by_topic[f'{flag}LocalRate'] = summary_by_topic[f'{flag}Count'] / topic_size

# GlobalRate for each defined threat: count relative to all documents
for flag in threat_flags:
    summary_by_topic[f'{flag}GlobalRate'] = summary_by_topic[f'{flag}Count'] / total_doc

summary_by_topic.head(3)


Unnamed: 0,Topic,isComplainCount,isCriticCount,isAffectCount,isDisappointmentCount,isComplainLocalRate,isCriticLocalRate,isAffectLocalRate,isDisappointmentLocalRate,isComplainGlobalRate,isCriticGlobalRate,isAffectGlobalRate,isDisappointmentGlobalRate
0,-1,102.0,13.0,148.0,50.0,0.054868,0.006993,0.079613,0.026896,0.012295,0.001567,0.01784,0.006027
1,0,7.0,10.0,85.0,4.0,0.013035,0.018622,0.158287,0.007449,0.000844,0.001205,0.010246,0.000482
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Attach the per-category counts and rates to the per-topic threat table (left join on Topic).
# NOTE(review): the result overwrites its own input, so running this cell a second
# time merges the summary columns in again - re-run the preceding cells first.
threat_events_topic_info_merge = pd.merge(
    threat_events_topic_info_merge,
    summary_by_topic,
    on='Topic',
    how='left',
)
print(threat_events_topic_info_merge.columns)

Index(['Topic', 'isThreat', 'GlobalThreatRate', 'Count', 'Name',
       'Representation', 'Representative_Docs', 'Representation_n1-2',
       'LocalThreatRate', 'isComplainCount', 'isCriticCount', 'isAffectCount',
       'isDisappointmentCount', 'isComplainLocalRate', 'isCriticLocalRate',
       'isAffectLocalRate', 'isDisappointmentLocalRate',
       'isComplainGlobalRate', 'isCriticGlobalRate', 'isAffectGlobalRate',
       'isDisappointmentGlobalRate'],
      dtype='object')


In [39]:

# Final column layout: identifiers, overall threat figures, per-category rates,
# per-category counts, then the topic representations.
column_order = [
    'Topic', 'Name', 'isThreat', 'Count',
    'LocalThreatRate', 'GlobalThreatRate',
    'isComplainLocalRate', 'isCriticLocalRate',
    'isAffectLocalRate', 'isDisappointmentLocalRate',
    'isComplainGlobalRate', 'isCriticGlobalRate',
    'isAffectGlobalRate', 'isDisappointmentGlobalRate',
    'isComplainCount', 'isCriticCount', 'isAffectCount', 'isDisappointmentCount',
    'Representation', 'Representation_n1-2', 'Representative_Docs',
]

# Reorder the columns, then expose the topic's document count under the name 'Size'.
threat_events_topic_info_merge = (
    threat_events_topic_info_merge
    .reindex(columns=column_order)
    .rename(columns={'Count': 'Size'})
)

print(f'length of threat_events_topic_info_merge: {len(threat_events_topic_info_merge)}')
threat_events_topic_info_merge.head(3)

length of threat_events_topic_info_merge: 156


Unnamed: 0,Topic,Name,isThreat,Size,LocalThreatRate,GlobalThreatRate,isComplainLocalRate,isCriticLocalRate,isAffectLocalRate,isDisappointmentLocalRate,...,isCriticGlobalRate,isAffectGlobalRate,isDisappointmentGlobalRate,isComplainCount,isCriticCount,isAffectCount,isDisappointmentCount,Representation,Representation_n1-2,Representative_Docs
0,-1,-1_pandemic_gatt_shall_panel,219,1859,0.117805,0.026398,0.054868,0.006993,0.079613,0.026896,...,0.001567,0.01784,0.006027,102.0,13.0,148.0,50.0,"['pandemic', 'gatt', 'shall', 'panel', 'reques...","['panel', 'request', 'general', 'multilateral'...","[""virtual address washington international tra..."
1,0,0_climate_environmental_food_energy,90,537,0.167598,0.010849,0.013035,0.018622,0.158287,0.007449,...,0.001205,0.010246,0.000482,7.0,10.0,85.0,4.0,"['climate', 'environmental', 'food', 'energy',...","['climate', 'environmental', 'food', 'climate ...",['day wto public forum october panel workshop ...
2,1,1_university_rtpc_essay_academic,0,283,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"['university', 'rtpc', 'essay', 'academic', 'c...","['university', 'rtpc', 'academic', 'essay', 'c...",['wto issue young economist submit paper wto e...


In [60]:
# Persist the per-topic threat table as CSV, without writing the row index.
threat_events_topic_info_merge.to_csv('threat_events_topic_info_merge.csv', index=False)

## Other Statistics

In [42]:
# Frequency of each isThreat value across documents.
threat_events_document['isThreat'].value_counts()

False    6707
True      891
Name: isThreat, dtype: int64

In [41]:
# Frequency of each ThreatLevel value across documents.
threat_events_document['ThreatLevel'].value_counts()

1.0    6707
2.0     571
3.0     264
4.0      55
5.0       1
Name: ThreatLevel, dtype: int64