In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
from ipynb.fs.full.data_pipeline import get_url, get_acled_dataframe

url = get_url(limit=1000000, notes='coronavirus')
df = get_acled_dataframe(url)

In [2]:
sources_list = list()
for s in df['source']:
    for i in s.split('; '):
        sources_list.append(i)

In [6]:
# First, we need a list of distinct sources
sources_distinct = list(set(sources_list))
sources_distinct_df = pd.DataFrame({'source_singular': sources_distinct})

# since eventually we're using a LIKE clause for the join, we need to add percentage wildcards here because we can't in pandasql.
sources_distinct_df['source_singular'] = sources_distinct_df['source_singular'].apply(lambda x: f'%{x}%')

# Second, join this df with the sources df with the help of pandasql

from pandasql import sqldf 
sql = lambda q: sqldf(q, globals())

expanded_source_df = sql('''
    SELECT * FROM df s
    JOIN sources_distinct_df sd
    ON s.source LIKE sd.source_singular
''')

# removing percentage wildcards
expanded_source_df['source_singular'] = expanded_source_df['source_singular'].apply(lambda x: x.replace('%', ''))
expanded_source_df.head()

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,source_singular
0,9491030,410,KOR25174,25174,2022-09-16,2022,1,Protests,Peaceful protest,Protesters (South Korea),...,37.4744,127.0304,1,EDaily,National,"On 16 September 2022, members of the All-Korea...",0,1663685720,KOR,EDaily
1,9491038,156,CHN12137,12137,2022-09-15,2022,1,Protests,Peaceful protest,Protesters (China),...,22.2811,114.1598,1,HK01,Subnational,"On 15 September 2022, three representatives of...",0,1663685720,CHN,HK01
2,9491260,410,KOR25204,25204,2022-09-15,2022,1,Protests,Peaceful protest,Protesters (South Korea),...,37.5223,126.9075,1,YNA,National,"On 15 September 2022, members of the COVID-19 ...",0,1663685720,KOR,YNA
3,9492137,250,FRA18626,18626,2022-09-15,2022,1,Protests,Peaceful protest,Protesters (France),...,43.2951,-0.3708,1,France Bleu,National,"On 15 September 2022, around 30 opponents of c...",0,1663691322,FRA,France Bleu
4,9493712,840,USA47753,47753,2022-09-15,2022,1,Protests,Peaceful protest,Protesters (United States),...,40.7834,-73.9663,1,CBS News,National,"On 15 September 2022, parents and students hel...",0,1663704952,USA,CBS News


In [7]:
expanded_source_df[expanded_source_df['source_singular'].str.len()<expanded_source_df['source'].str.len()]

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,source_singular
8,9492040,276,DEU13144,13144,2022-09-12,2022,1,Protests,Peaceful protest,Protesters (Germany),...,51.4831,11.9737,1,Mitteldeutschen Zeitung; Mitteldeutscher Rundfunk,National,"On 12 September 2022, around 850 people demons...",0,1663691322,DEU,Mitteldeutschen Zeitung
9,9492040,276,DEU13144,13144,2022-09-12,2022,1,Protests,Peaceful protest,Protesters (Germany),...,51.4831,11.9737,1,Mitteldeutschen Zeitung; Mitteldeutscher Rundfunk,National,"On 12 September 2022, around 850 people demons...",0,1663691322,DEU,Mitteldeutscher Rundfunk
14,9492163,40,AUT1180,1180,2022-09-10,2022,1,Protests,Peaceful protest,Protesters (Austria),...,48.2088,16.3702,1,Heute - Osterreichs Neue Tageszeitung,National,"On 10 September 2022, around 3,000 people, inc...",0,1663691322,AUT,ERR
15,9466689,480,MUS297,297,2022-09-09,2022,1,Protests,Peaceful protest,Protesters (Mauritius),...,-10.3871,56.6179,2,Le Mauricien; Defi Media; L'Express (Mauritius),National,"On 9 September 2022, a group of people from Ag...",0,1663026144,MUS,Le Mauricien
16,9466689,480,MUS297,297,2022-09-09,2022,1,Protests,Peaceful protest,Protesters (Mauritius),...,-10.3871,56.6179,2,Le Mauricien; Defi Media; L'Express (Mauritius),National,"On 9 September 2022, a group of people from Ag...",0,1663026144,MUS,L'Express
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108703,8555697,408,PRK34,34,2020-01-24,2020,1,Strategic developments,Change to group/activity,Government of North Korea (2011-),...,39.0392,125.7625,3,KBS; NK Pro; VOA,Other-Regional,"On 24 January 2020, North Korea cancelled mult...",0,1632239616,PRK,NK Pro
108704,7536939,156,CHN1378,1378,2020-01-23,2020,1,Protests,Peaceful protest,Protesters (China),...,22.3201,114.1692,1,RTHK; HK01,Subnational,"On 23 January 2020, 16 district councillors fr...",0,1611094728,CHN,RTHK
108705,7536939,156,CHN1378,1378,2020-01-23,2020,1,Protests,Peaceful protest,Protesters (China),...,22.3201,114.1692,1,RTHK; HK01,Subnational,"On 23 January 2020, 16 district councillors fr...",0,1611094728,CHN,HK01
108706,8555702,408,PRK33,33,2020-01-22,2020,1,Strategic developments,Change to group/activity,Government of North Korea (2011-),...,39.0392,125.7625,3,NK Pro; YNA,Other-Regional,"On 22 January 2020, the Government of North Ko...",0,1632239616,PRK,NK Pro
