In [45]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandasql import sqldf

def sql(q):
    """Run the SQL string `q` through pandasql's `sqldf`.

    Table names in the query are resolved against this notebook's global
    namespace, so any DataFrame bound to a global name can be queried.
    """
    return sqldf(q, globals())


# Load the two CSV inputs that the rest of the notebook queries by name.
base_df = pd.read_csv('../data/acled_covid19.csv')
extd_df = pd.read_csv('../data/EXPANDED_acled_covid19.csv')

In [46]:
# Distinct sub-event types found in the expanded table. Later cells iterate
# over df['sub_event_type'] to generate one SQL column per type.
distinct_types_query = '''
SELECT DISTINCT sub_event_type
FROM extd_df
'''
df = sql(distinct_types_query)
df

Unnamed: 0,sub_event_type
0,Peaceful protest
1,Protest with intervention
2,Change to group/activity
3,Looting/property destruction
4,Attack
5,Violent demonstration
6,Abduction/forced disappearance
7,Other
8,Mob violence
9,Arrests


In [47]:
# Build one conditional-count column per sub-event type, so the query below
# returns a wide table: one row per source, one count column per type.
count_exprs = []
for t in df['sub_event_type'].dropna():  # a NULL type has no .replace and no column
    # Column alias: spaces and slashes become underscores.
    t_cln = t.replace(" ", "_").replace("/", "_")
    # SQL string literals take single quotes (doubled to escape). Double quotes
    # denote identifiers in SQLite and only fall back to strings as a legacy quirk.
    t_lit = t.replace("'", "''")
    t_alias = t_cln.replace('"', '""')
    count_exprs.append(
        f"sum(CASE WHEN sub_event_type = '{t_lit}' THEN 1 ELSE 0 END ) AS \"{t_alias}\""
    )
query_fmt = ',\n'.join(count_exprs)

type_cnt_df = sql(f'''
SELECT source_singular,
{query_fmt}
FROM extd_df
GROUP BY source_singular
''')
type_cnt_df

Unnamed: 0,source_singular,Peaceful_protest,Protest_with_intervention,Change_to_group_activity,Looting_property_destruction,Attack,Violent_demonstration,Abduction_forced_disappearance,Other,Mob_violence,Arrests,Disrupted_weapons_use,Excessive_force_against_protesters,Remote_explosive_landmine_IED,Sexual_violence,Armed_clash,Grenade,Agreement,Shelling_artillery_missile_attack,Air_drone_strike
0,061.ua,8,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1 News,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10 Tampa Bay,6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,10/11 Now,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1010WINS,13,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5357,net.hr,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5358,news.com.au,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5359,nncMX,20,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5360,stiripesurse.ro,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
# Total number of rows (events) in extd_df for each source_singular value.
total_per_source_query = '''
SELECT source_singular,
count(*) total
FROM extd_df
GROUP BY source_singular
'''
tot_cnt_df = sql(total_per_source_query)
tot_cnt_df

Unnamed: 0,source_singular,total
0,061.ua,10
1,1 News,5
2,10 Tampa Bay,7
3,10/11 Now,4
4,1010WINS,15
...,...,...
5357,net.hr,29
5358,news.com.au,23
5359,nncMX,21
5360,stiripesurse.ro,1


In [None]:
# For each sub-event type, compute the share of a source's events that fall in
# that type: per-type count (type_cnt_df) divided by the source total (tot_cnt_df).
# The "_pct" columns hold fractions in [0, 1]; they are not multiplied by 100.
pct_exprs = []
for t in df['sub_event_type'].dropna():
    # Must mirror the alias cleaning used when type_cnt_df was built.
    t_cln = t.replace(" ", "_").replace("/", "_")
    t_cln_pct_nm = t_cln + '_pct'
    # Both operands are INTEGER in SQLite, so a bare `/` would truncate every
    # ratio to 0 or 1. Cast the numerator to REAL to get true division.
    # Identifiers are quoted and table-qualified so unusual names stay valid.
    pct_exprs.append(f'CAST(a."{t_cln}" AS REAL) / b.total AS "{t_cln_pct_nm}"')
query_fmt = ',\n'.join(pct_exprs)

type_pct_df = sql(f'''
SELECT
    a.*,
    b.total,
{query_fmt}
FROM type_cnt_df a
JOIN tot_cnt_df b
    ON a.source_singular = b.source_singular
''')

# Persist the per-source counts and shares for use outside this notebook.
type_pct_df.to_csv('../data/source_subeventtype.csv')

type_pct_df

In [None]:
# Histogram of per-source event totals, restricted to sources with fewer than
# `event_lim` events.
event_lim = 100
type_pct_df_mod = type_pct_df[type_pct_df.total < event_lim]

fig, ax = plt.subplots(figsize=(12,10))
ax.set_title(f'Number of events by individual sources with less than {event_lim} events')
# Totals are integers, so use integer bin edges (one bin per count value).
# Equal-width bins over the data range would drift off the integers and
# produce spurious empty or doubled bars.
ax.hist(type_pct_df_mod.total, edgecolor="red", bins=list(range(1, event_lim + 1)))
ax.set_xlabel('Events per source')
ax.set_ylabel('Number of sources');

In [None]:
# Histogram of per-source event totals, restricted to sources with at least
# `event_lim` events (the complement of the "< event_lim" subset).
event_lim = 100
type_pct_df_mod = type_pct_df[type_pct_df.total >= event_lim]

fig, ax = plt.subplots(figsize=(12,10))
# The filter is inclusive (>=), so the title says "at least" rather than "greater than".
ax.set_title(f'Number of events by individual sources with at least {event_lim} events')
ax.hist(type_pct_df_mod.total, edgecolor="red", bins=100)
ax.set_xlabel('Events per source')
ax.set_ylabel('Number of sources');