In [2]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [93]:
import pandas as pd
import pickle
import random
import collections
from tqdm import tqdm
import json
import re
import seaborn as sns
from dime_take_home.utils import get_locations_from_text

# Start to extract locations from news articles

In [5]:
# Load the raw location-name dictionaries (English and Arabic).
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("../data/raw/id_english_location_name.pkl", "rb") as en_handle:
    locations_dict_en = pickle.load(en_handle)

with open("../data/raw/id_arabic_location_name.pkl", "rb") as ar_handle:
    locations_dict_ar = pickle.load(ar_handle)

In [6]:
# Grab the reversed location dictionaries (Arabic and English).
# Presumably these map location name -> location id; their keys are what the
# location regexes below are built from. TODO confirm against the script that wrote them.
with open("../data/processed/ar_locations_reversed.pkl", "rb") as ar_handle:
    locations_dict_ar_reversed = pickle.load(ar_handle)

with open("../data/processed/en_locations_reversed.pkl", "rb") as en_handle:
    locations_dict_en_reversed = pickle.load(en_handle)

### Define and test location regex

In [7]:
# Compile one case-insensitive regex per language that matches any known location name.
# - \b word boundaries prevent partial matches inside longer words (e.g. 'inSuwayda').
# - Names are tried longest-first: regex alternation accepts the first alternative that
#   matches, so a short name that is a prefix of a longer one would otherwise shadow it.
locations_pattern_en = re.compile(
    r"\b(?:%s)\b" % "|".join(
        map(re.escape, sorted(locations_dict_en_reversed, key=len, reverse=True))
    ),
    re.IGNORECASE,
)
locations_pattern_ar = re.compile(
    r"\b(?:%s)\b" % "|".join(
        map(re.escape, sorted(locations_dict_ar_reversed, key=len, reverse=True))
    ),
    re.IGNORECASE,
)

In [8]:
# Check that the location regexes behave as expected: single hit, multiple hits,
# no hit when the name is embedded in a longer word, and the Arabic equivalents.
for loc_pattern, sample_text, expected_hits in [
    (locations_pattern_en, 'Suwayda', ['Suwayda']),
    (locations_pattern_en, 'Suwayda mosul', ['Suwayda', 'mosul']),
    (locations_pattern_en, 'inSuwayda', []),
    (locations_pattern_ar, u'ذهبت إلى صيدا', [u'صيدا']),
    (locations_pattern_ar, u'ذهبت إلى صيدا و طرابلس', [u'صيدا', u'طرابلس']),
]:
    assert loc_pattern.findall(sample_text) == expected_hits


In [9]:
# Read the English and Arabic news articles, parse 'dateTime' into datetimes,
# and order each frame chronologically (original row index is kept).
df_news_en = (
    pd.read_csv("../data/raw/news-articles-eng.csv")
    .assign(dateTime=lambda frame: pd.to_datetime(frame['dateTime']))
    .sort_values('dateTime')
)

df_news_ar = (
    pd.read_csv("../data/raw/news-articles-ara.csv")
    .assign(dateTime=lambda frame: pd.to_datetime(frame['dateTime']))
    .sort_values('dateTime')
)

In [10]:
# Number of English articles before any subsetting.
len(df_news_en)

86660

In [11]:
# Grab a subset: keep only the first 1/SUBSET_DIVISOR of the date-sorted English
# articles so the cells below run quickly. Set USE_SUBSET = False for the full corpus.
# NOTE: this cell is not idempotent -- re-running it shrinks df_news_en again.
USE_SUBSET = True
SUBSET_DIVISOR = 50

if USE_SUBSET:
    # .copy() turns the slice into an independent frame, so adding columns to it later
    # does not write to a view of the original (pandas SettingWithCopyWarning).
    df_news_en = df_news_en.iloc[:df_news_en.shape[0] // SUBSET_DIVISOR, :].copy()

In [12]:
# (rows, columns) of the English frame after subsetting.
df_news_en.shape

(1733, 20)

In [13]:
# Smoke-test location extraction on a single English article body.
test_article = df_news_en['body'].iloc[1]

get_locations_from_text(
    test_article,
    locations_pattern=locations_pattern_en,
    locations_lookup=locations_dict_en_reversed,
)

['iq_an_6', 'iq_an_6', 'jo']

In [14]:
# Smoke-test location extraction on a single Arabic article body.
test_article = df_news_ar['body'].iloc[2]

get_locations_from_text(
    test_article,
    locations_pattern=locations_pattern_ar,
    locations_lookup=locations_dict_ar_reversed,
)

['iq_bg', 'iq_bg', 'iq_bg', 'iq_bg_10', 'iq_bg']

### Define and test risk factor regex

In [15]:
# Load the processed risk-factor table (with Arabic terms, judging by the file name).
# NOTE(review): df_risk_factors is not referenced again in the cells shown here.
df_risk_factors = pd.read_csv("../data/processed/risk-factors-with-arabic.csv")

In [17]:
# Load the risk-factor terms with their category; the columns used below are
# 'risk_factor_english', 'risk_factor_arabic' and 'cluster'.
df_risk_factors_categories = pd.read_csv("../data/processed/risk_factors_categories_with_arabic.csv")

In [26]:
# Peek at five random rows; random_state is fixed so the displayed sample is
# reproducible under Restart Kernel -> Run All.
df_risk_factors_categories.sample(5, random_state=42)

Unnamed: 0,risk_factor_english,risk_factor_arabic,cluster
6,military junta,المجلس العسكري,political instability
9,displaced,نازح,forced displacement
3,terrorism,الإرهاب,conflicts and violence
4,catastrophe,كارثة,other
8,slashed export,خفض الصادرات,economic issues


In [None]:
# Load back in the mappings of risk-factor terms to clusters (one JSON file per language).
with open("../data/processed/risk_categories_dict_en.json", "r", encoding="utf-8") as en_json:
    risk_categories_dict_en = json.load(en_json)

with open("../data/processed/risk_categories_dict_ar.json", "r", encoding="utf-8") as ar_json:
    risk_categories_dict_ar = json.load(ar_json)


In [20]:
# Compile one case-insensitive regex per language that matches any risk-factor term.
# - Missing terms (NaN) are dropped first; re.escape cannot handle them.
# - Terms are tried longest-first: regex alternation accepts the first alternative that
#   matches, so a short term that is a prefix of a longer one would otherwise shadow it.
risk_categories_pattern_en = re.compile(
    r"\b(?:%s)\b" % "|".join(
        map(
            re.escape,
            sorted(df_risk_factors_categories['risk_factor_english'].dropna(), key=len, reverse=True),
        )
    ),
    re.IGNORECASE,
)
risk_categories_pattern_ar = re.compile(
    r"\b(?:%s)\b" % "|".join(
        map(
            re.escape,
            sorted(df_risk_factors_categories['risk_factor_arabic'].dropna(), key=len, reverse=True),
        )
    ),
    re.IGNORECASE,
)


In [29]:
# Check that each risk-factor regex pulls the term out of a sentence,
# e.g. 'There is a risk of catastrophe' -> 'catastrophe'.
for risk_pattern, sample_text, expected_hits in [
    (risk_categories_pattern_en, 'There is a risk of catastrophe', ['catastrophe']),
    (risk_categories_pattern_ar, u'هناك خطر من كارثة', [u'كارثة']),
]:
    assert risk_pattern.findall(sample_text) == expected_hits

In [None]:
def get_risk_factors_from_text(text, lang = 'en'):
    """Map every risk-factor term found in ``text`` to its risk category.

    Uses the module-level ``risk_categories_pattern_<lang>`` regex to find terms
    and the ``risk_categories_dict_<lang>`` mapping to translate each hit.

    Parameters
    ----------
    text : str
        Text to scan (e.g. an article body).
    lang : str, default 'en'
        Language of ``text``: 'en' (English) or 'ar' (Arabic).

    Returns
    -------
    list
        One entry per regex match, in order of appearance (duplicates are kept).
        An entry is ``None`` when the matched term has no key in the mapping.

    Raises
    ------
    ValueError
        If ``lang`` is neither 'en' nor 'ar'.
    """
    if lang == 'en':
        pattern, lookup = risk_categories_pattern_en, risk_categories_dict_en
    elif lang == 'ar':
        pattern, lookup = risk_categories_pattern_ar, risk_categories_dict_ar
    else:
        # Previously this fell through to `return matches` with `matches` unbound.
        raise ValueError("Unsupported lang %r; expected 'en' or 'ar'" % (lang,))

    categories = []
    for term in pattern.findall(text):
        category = lookup.get(term)
        if category is None:
            # The regex is case-insensitive but the dict lookup is not: retry with the
            # lowercased term so e.g. 'Catastrophe' still resolves to its category.
            category = lookup.get(term.lower())
        categories.append(category)
    return categories
    

In [63]:
# Demonstrate the risk-factor lookup on an English, an Arabic and a multi-term sentence.
for demo_text, demo_lang in [
    ('There is risk of catastrophe', 'en'),
    (u'هناك خطر من كارثة', 'ar'),
    ('There is risk of catastrophe and slashed export due to the military junta', 'en'),
]:
    print(get_risk_factors_from_text(demo_text, lang=demo_lang))

['other']
['other']
['other', 'economic issues', 'political instability']


### Extract locations and risk factors from the news articles

In [84]:
# Pick one English article body (position 11 in the date-sorted frame) to inspect.
test_article = df_news_en['body'].iloc[11]

In [85]:
# Display the raw text of the selected article.
test_article

'DETROIT (AP) - WHITE SOX 5, TIGERS 1\n\nDETROIT (AP) - Drew Thorpe pitched six scoreless innings and got his first major league win in his third start, pitching Chicago over Detroit.\n\nThorpe (1-1) allowed two hits and four walks while striking out five. The 23-year-old right-hander didn´t get a decision in his debut at Seattle on June 11, then lasted 3 1/3 innings in a 12-5 loss at Arizona on June 16.\n\nA second-round pick by the Yankees in the 2022 amateur draft, Thorpe was dealt to San Diego in the December trade that brought Juan Soto to New York, then was traded to Chicago in March as part of a package for right-hander Dylan Cease.\n\nKorey Lee homered, Nicky Lopez had three hits and an RBI, and Andrew Vaughn, Paul DeJong and Lenyn Sosa drove in runs as the major league-worst White Sox (21-57) stopped a three-game losing streak.\n\nJustin Anderson, John Brebbia and Michael Kopech pitched an inning each, with Brebbia allowing Mark Canha´s RBI groundout in the eighth.\n\nChicago 

In [86]:
# Extract location ids from the selected article.
# NOTE(review): the article displayed above is a US baseball report, yet ids are
# returned for it -- these look like false positives from ordinary words or personal
# names matching location names case-insensitively. Confirm against the lookup keys.
get_locations_from_text(
    test_article,
    locations_lookup=locations_dict_en_reversed,
    locations_pattern=locations_pattern_en,
)

['iq_an_6',
 'jo',
 'iq_an_6',
 'iq_an_6',
 'iq_an_6',
 'iq_an_6',
 'iq_an_6',
 'iq_an_6']

In [None]:
# Find one article that mentions at least one risk factor.
# The articles are visited once each in a seeded random order, so the search is
# reproducible and always terminates; the earlier `while` loop re-sampled forever
# when no article matched. Missing bodies (NaN) are skipped because they have no .lower().
matches = []

for test_article in df_news_en['body'].dropna().sample(frac=1, random_state=0):
    matches = get_risk_factors_from_text(test_article.lower())
    if len(matches) > 0:
        break

In [89]:
# Risk categories found in the sampled article (one entry per matched term).
matches

['other', 'other']

In [90]:
# Preview the first rows of the English articles frame.
df_news_en.head()

Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,title,body,source,authors,image,eventUri,sentiment,wgt,relevance,userHasPermissions
15273,8190770956,eng,False,2024-06-23,00:00:16,2024-06-23 00:00:16+00:00,2024-06-22T23:58:12Z,news,0.0,https://www.ringtv.com/674552-jordan-panthen-k...,Jordan Panthen knocks out Victor Toney in thre...,Jordan Panthen continues to score knockout win...,"{'uri': 'ringtv.com', 'dataType': 'news', 'tit...","[{'uri': 'francisco_salazar@ringtv.com', 'name...",https://www.ringtv.com/wp-content/uploads/2024...,,-0.07451,53,53,
86659,8190773147,eng,False,2024-06-23,00:03:50,2024-06-23 00:03:50+00:00,2024-06-23T00:01:55Z,news,0.686275,https://www.reformer.com/ap/sports/ram-rez-kwa...,"Ramírez, Kwan homer as AL Central-leading Guar...",CLEVELAND (AP) -- José Ramírez and Bo Naylor h...,"{'uri': 'reformer.com', 'dataType': 'news', 't...","[{'uri': 'tom_withers@reformer.com', 'name': '...",https://bloximages.newyork1.vip.townnews.com/r...,eng-9672706,0.184314,1,1,
18870,8190775869,eng,False,2024-06-23,00:07:49,2024-06-23 00:07:49+00:00,2024-06-23T00:06:20Z,news,0.501961,https://www.yahoo.com/news/community-members-s...,Community members step up to help during storm...,Jun. 22 -- MITCHELL -- While the two-day rains...,"{'uri': 'yahoo.com', 'dataType': 'news', 'titl...",[],https://s.yimg.com/cv/apiv2/social/images/yaho...,eng-9669354,0.184314,52,52,
6351,2024-06-398429595,eng,False,2024-06-23,00:08:25,2024-06-23 00:08:25+00:00,2024-06-23T00:08:19Z,news,0.717647,https://www.iranintl.com/en/202406216837,Iran Warns Israel Against Waging War on Lebanon,Iran's mission to the UN in New York warned Is...,"{'uri': 'iranintl.com', 'dataType': 'news', 't...",[],https://i.iranintl.com/images/rdk9umy0/product...,eng-9669018,-0.403922,106,106,
36361,2024-06-398429564,eng,False,2024-06-23,00:08:25,2024-06-23 00:08:25+00:00,2024-06-22T23:42:48Z,news,0.0,https://clutchpoints.com/blue-jays-news-jordan...,Blue Jays' Jordan Romano dealt unfortunate inj...,Toronto's two-time All-Star closer continues t...,"{'uri': 'clutchpoints.com', 'dataType': 'news'...","[{'uri': 'peter_sampson@clutchpoints.com', 'na...",https://wp.clutchpoints.com/wp-content/uploads...,,0.121569,4,4,


In [94]:
# Register tqdm's progress_apply on pandas objects, then tag every English article
# body with the risk categories of the terms it mentions.
# NOTE(review): bodies are passed as-is here, whereas the sampling cell above lowercases
# the text first -- confirm whether capitalised terms resolve in the category mapping.
tqdm.pandas()
df_news_en['risk_factors'] = df_news_en['body'].progress_apply(
    lambda body: get_risk_factors_from_text(body)
)

100%|██████████| 1733/1733 [00:00<00:00, 3102.62it/s]


In [97]:
# Register tqdm's progress_apply on pandas objects, then extract the location ids
# mentioned in every English article body.
tqdm.pandas()
df_news_en['locations'] = df_news_en['body'].progress_apply(
    lambda body: get_locations_from_text(
        body,
        locations_lookup=locations_dict_en_reversed,
        locations_pattern=locations_pattern_en,
    )
)

100%|██████████| 1733/1733 [00:13<00:00, 128.60it/s]


In [99]:
# Preview the two extracted columns side by side for the first ten articles.
df_news_en[['locations', 'risk_factors']].head(10)

Unnamed: 0,locations,risk_factors
15273,[jo],[]
86659,"[iq_an_6, iq_an_6, jo]",[]
18870,"[jo, iq_an_6]",[]
6351,"[lb, lb, lb, ps_gz_2, ps_gz_2]",[]
36361,"[jo, jo]",[]
86658,[jo],[]
53068,"[ps_gz_2, jo]",[]
32891,"[ps_gz_2, ps, ps, ps, ps, ps]",[other]
10735,"[ps_gz_2, ps_gz_5, ps_gz_2, ps_gz_5, ps_gz_2, ...",[]
86657,"[iq_an_6, iq_an_6, jo]",[]
