In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
df = pd.read_csv('../data/acled_covid19.csv')

In [13]:
# First, we get a collective list of sources
sources_list = list()
for s in df['source']:
    for i in s.split(';'):
        sources_list.append(i.strip())

In [14]:
# From our collective list of sources we need a pandas DataFrame of distinct sources
sources_distinct = list(set(sources_list))
sources_distinct_df = pd.DataFrame({'source_singular': sources_distinct})

# since eventually we're using a LIKE clause for the join, we need to add percentage wildcards around each distinct source
# here in our pandas DataFrame, because we can't in pandasql.
sources_distinct_df['source_singular'] = sources_distinct_df['source_singular'].apply(lambda x: f'%{x}%')

# Second, join this distinct source df with the main df. I prefer using pandasql because of the like clause.
from pandasql import sqldf
sql = lambda q: sqldf(q, globals())

expanded_source_df = sql(
'''
    SELECT * FROM df main
    JOIN sources_distinct_df dst_src
    ON main.source LIKE dst_src.source_singular
'''
)

# We can now remove the percentage wildcards from the source_singular column, since we only needed them for the previous step.
expanded_source_df['source_singular'] = expanded_source_df['source_singular'].apply(lambda x: x.replace('%', ''))
expanded_source_df.head()

Unnamed: 0.1,Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,source_singular
0,0,9498574,862,VEN12964,12964,2022-09-17,2022,1,Protests,Peaceful protest,...,8.1292,-63.5409,1,Diario Primicia,Subnational,"On 17 September 2022, in Ciudad Bolivar (Boliv...",0,1664226314,VEN,Diario Primicia
1,1,9491030,410,KOR25174,25174,2022-09-16,2022,1,Protests,Peaceful protest,...,37.4744,127.0304,1,EDaily,National,"On 16 September 2022, members of the All-Korea...",0,1663685720,KOR,EDaily
2,2,9491038,156,CHN12137,12137,2022-09-15,2022,1,Protests,Peaceful protest,...,22.2811,114.1598,1,HK01,Subnational,"On 15 September 2022, three representatives of...",0,1663685720,CHN,HK01
3,3,9491260,410,KOR25204,25204,2022-09-15,2022,1,Protests,Peaceful protest,...,37.5223,126.9075,1,YNA,National,"On 15 September 2022, members of the COVID-19 ...",0,1663685720,KOR,YNA
4,4,9492137,250,FRA18626,18626,2022-09-15,2022,1,Protests,Peaceful protest,...,43.2951,-0.3708,1,France Bleu,National,"On 15 September 2022, around 30 opponents of c...",0,1663691322,FRA,France Bleu


In [15]:
expanded_source_df[expanded_source_df['source_singular'].str.len()<expanded_source_df['source'].str.len()]

Unnamed: 0.1,Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,source_singular
9,9,9492040,276,DEU13144,13144,2022-09-12,2022,1,Protests,Peaceful protest,...,51.4831,11.9737,1,Mitteldeutschen Zeitung; Mitteldeutscher Rundfunk,National,"On 12 September 2022, around 850 people demons...",0,1663691322,DEU,Mitteldeutscher Rundfunk
10,9,9492040,276,DEU13144,13144,2022-09-12,2022,1,Protests,Peaceful protest,...,51.4831,11.9737,1,Mitteldeutschen Zeitung; Mitteldeutscher Rundfunk,National,"On 12 September 2022, around 850 people demons...",0,1663691322,DEU,Mitteldeutschen Zeitung
14,13,9492163,40,AUT1180,1180,2022-09-10,2022,1,Protests,Peaceful protest,...,48.2088,16.3702,1,Heute - Osterreichs Neue Tageszeitung,National,"On 10 September 2022, around 3,000 people, inc...",0,1663691322,AUT,ERR
16,14,9466689,480,MUS297,297,2022-09-09,2022,1,Protests,Peaceful protest,...,-10.3871,56.6179,2,Le Mauricien; Defi Media; L'Express (Mauritius),National,"On 9 September 2022, a group of people from Ag...",0,1663026144,MUS,Defi Media
17,14,9466689,480,MUS297,297,2022-09-09,2022,1,Protests,Peaceful protest,...,-10.3871,56.6179,2,Le Mauricien; Defi Media; L'Express (Mauritius),National,"On 9 September 2022, a group of people from Ag...",0,1663026144,MUS,Le Mauricien
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110176,1922,8873745,380,ITA521,521,2020-02-06,2020,1,Protests,Peaceful protest,...,45.8115,9.0829,1,Varese7press; Qui Como,Subnational,"On 6 February 2020, about 50 people, including...",0,1646327097,ITA,Varese7press
110178,1925,8873632,380,ITA473,473,2020-02-02,2020,1,Protests,Peaceful protest,...,45.4613,9.1595,1,Milano Today; La Repubblica,Subnational-National,"On 2 February 2020, around 100 people, most of...",0,1646327097,ITA,Milano Today
110179,1925,8873632,380,ITA473,473,2020-02-02,2020,1,Protests,Peaceful protest,...,45.4613,9.1595,1,Milano Today; La Repubblica,Subnational-National,"On 2 February 2020, around 100 people, most of...",0,1646327097,ITA,Today
110180,1925,8873632,380,ITA473,473,2020-02-02,2020,1,Protests,Peaceful protest,...,45.4613,9.1595,1,Milano Today; La Repubblica,Subnational-National,"On 2 February 2020, around 100 people, most of...",0,1646327097,ITA,La Repubblica


In [16]:
expanded_source_df[expanded_source_df['source_singular'] == 'ERR']

Unnamed: 0.1,Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,source_singular
14,13,9492163,40,AUT1180,1180,2022-09-10,2022,1,Protests,Peaceful protest,...,48.2088,16.3702,1,Heute - Osterreichs Neue Tageszeitung,National,"On 10 September 2022, around 3,000 people, inc...",0,1663691322,AUT,ERR
224,138,9439806,40,AUT1159,1159,2022-08-01,2022,1,Protests,Peaceful protest,...,47.0711,15.4383,1,Radio Osterreich 1; Kleine Zeitung,National,"On 1 August 2022, at the call of The Greens, a...",0,1660057400,AUT,ERR
228,139,9439807,40,AUT1162,1162,2022-08-01,2022,1,Protests,Peaceful protest,...,48.1566,14.0246,1,Radio Osterreich 1; Krone,National,"On 1 August 2022, around 150 people demonstrat...",0,1660057400,AUT,ERR
235,142,9440001,40,AUT1160,1160,2022-08-01,2022,1,Protests,Peaceful protest,...,48.3061,14.2857,1,Radio Osterreich 1,National,"On 1 August 2022, nearly 300 people demonstrat...",0,1660057401,AUT,ERR
238,143,9440002,40,AUT1161,1161,2022-08-01,2022,1,Protests,Peaceful protest,...,48.0391,14.4194,1,Radio Osterreich 1,National,"On 1 August 2022, people demonstrated in Steyr...",0,1660057401,AUT,ERR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104036,61669,8382459,380,ITA1017,1017,2020-04-06,2020,1,Violence against civilians,Attack,...,41.0800,14.2566,1,In Terris; Osservatorio Repressione; Today,Other-National,"On 6 April 2020, a group of prison guards beat...",0,1626210401,ITA,ERR
105538,62590,7818055,40,AUT87,87,2020-03-29,2020,1,Protests,Peaceful protest,...,48.2085,16.3721,1,Heute - Osterreichs Neue Tageszeitung; Indymed...,Other-National,"On 29 March 2020, around 30 masked people gath...",0,1618442048,AUT,ERR
106566,63231,9416160,694,SIE4825,4825,2020-03-24,2020,1,Strategic developments,Change to group/activity,...,8.4871,-13.2356,3,Sierra Leone Telegraph; Cocorioko; Xinhua; Sie...,New media-National,"On 24 March 2020, the government of Sierra Leo...",0,1658253392,SLE,ERR
108116,64103,8874994,380,ITA937,937,2020-03-09,2020,1,Riots,Violent demonstration,...,41.9376,12.5660,1,Roma Today; Blitz Quotidiano; In Terris; Italp...,Subnational-National,"On 9 March 2020, hundreds of inmates detained ...",0,1646327101,ITA,ERR
