In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandasql import sqldf

def sql(q):
    """Run the SQL query `q` through pandasql and return the resulting DataFrame.

    The notebook's global namespace is handed to `sqldf` on every call, so a
    query can refer to any DataFrame variable that exists at call time.
    """
    return sqldf(q, globals())


# Source table queried (by variable name) in the SQL cells below.
extd_df = pd.read_csv('../data/EXPANDED_acled_covid19.csv')

In [2]:
# One row per distinct value of the `country` column in extd_df.
distinct_country_query = '''
SELECT DISTINCT country
FROM extd_df
'''
geo_df = sql(distinct_country_query)
geo_df

Unnamed: 0,country
0,Venezuela
1,South Korea
2,China
3,France
4,United States
...,...
215,Cayman Islands
216,Montserrat
217,Saint Kitts and Nevis
218,Anguilla


In [3]:
def _clean(c):
    c = c.replace(' ', '_')
    c = c.replace('-', '_')
    c = c.replace('.', '')
    c = c.replace(',', '')
    
    return c

In [4]:
# Pivot extd_df into one row per source with one count column per country.
# For every country a conditional-sum expression is generated:
#     sum(CASE WHEN country = '<name>' THEN 1 ELSE 0 END) AS "<cleaned name>"
count_exprs = []
for t in geo_df['country']:
    t_cln = _clean(t)
    # Embed the raw name as a single-quoted SQL string literal, doubling any
    # embedded single quote. A double-quoted token is parsed by SQLite as an
    # identifier first and is only treated as a string when no such column
    # exists, so single quotes are the unambiguous form.
    t_lit = t.replace("'", "''")
    count_exprs.append(
        f"sum(CASE WHEN country = '{t_lit}' THEN 1 ELSE 0 END ) AS \"{t_cln}\""
    )

# Join handles the separators, so no trailing comma is produced.
query_fmt = ',\n'.join(count_exprs)

type_cnt_df = sql(f'''
SELECT source_singular,
{query_fmt}
FROM extd_df
GROUP BY source_singular
''')
type_cnt_df

Unnamed: 0,source_singular,Venezuela,South_Korea,China,France,United_States,Costa_Rica,Philippines,Germany,Austria,...,Chad,Tanzania,Falkland_Islands,United_Arab_Emirates,Eritrea,Cayman_Islands,Montserrat,Saint_Kitts_and_Nevis,Anguilla,Brunei
0,061.ua,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1 News,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10 Tampa Bay,0,0,0,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10/11 Now,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1010WINS,0,0,0,0,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5357,net.hr,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5358,news.com.au,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5359,nncMX,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5360,stiripesurse.ro,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Number of extd_df rows per source, regardless of country.
total_count_query = '''
SELECT source_singular,
count(*) total
FROM extd_df
GROUP BY source_singular
'''
tot_cnt_df = sql(total_count_query)
tot_cnt_df

Unnamed: 0,source_singular,total
0,061.ua,10
1,1 News,5
2,10 Tampa Bay,7
3,10/11 Now,4
4,1010WINS,15
...,...,...
5357,net.hr,29
5358,news.com.au,23
5359,nncMX,21
5360,stiripesurse.ro,1


In [6]:
# For every country column in type_cnt_df, add a "<country>_pct" column holding
# that country's count divided by the source's total row count.
pct_exprs = []
for t in geo_df['country']:
    t_cln = _clean(t)
    t_pct_nm = t_cln + '_pct'
    # Column names are quoted and table-qualified: quoting keeps names that are
    # not plain identifiers valid, and the qualifier makes SQLite raise on a
    # missing column instead of reading the quoted token as a string.
    pct_exprs.append(
        f'cast(a."{t_cln}" AS DOUBLE) / cast(b.total AS DOUBLE) "{t_pct_nm}"'
    )

query_fmt = ',\n'.join(pct_exprs)

type_pct_df = sql(f'''
SELECT
    a.*,
    b.total,
{query_fmt}
FROM type_cnt_df a
JOIN tot_cnt_df b
    ON a.source_singular = b.source_singular
''')

# Persist the full per-source table (counts, total and percentages).
type_pct_df.to_csv('../data/source_country.csv')

# Spot check: sources that have rows in both Germany and Austria.
both_de_at = (type_pct_df.Germany_pct > 0) & (type_pct_df.Austria_pct > 0)
spot_check_cols = ['source_singular', 'total', 'Germany', 'Austria', 'Germany_pct', 'Austria_pct']
type_pct_df.loc[both_de_at, spot_check_cols]

Unnamed: 0,source_singular,total,Germany,Austria,Germany_pct,Austria_pct
175,Aachener Zeitung,67,66,1,0.985075,0.014925
507,BZ Berlin,3,2,1,0.666667,0.333333
582,Berliner Zeitung,104,101,3,0.971154,0.028846
1990,Heute - Osterreichs Neue Tageszeitung,96,1,95,0.010417,0.989583
2176,Indymedia (Germany),80,79,1,0.9875,0.0125
2321,Judische Allgemeine,4,3,1,0.75,0.25
2546,Kleine Zeitung,31,1,30,0.032258,0.967742
2579,Krone,119,8,111,0.067227,0.932773
3076,Merkur,516,515,1,0.998062,0.001938
3617,Oe24.At,86,2,84,0.023256,0.976744


In [7]:
def generate_country_df(total_min, total_max, pct_min):
    """List sources whose rows are concentrated in a single country.

    Runs one query per country in `geo_df` against `type_pct_df` and stacks the
    results.

    Parameters
    ----------
    total_min, total_max : number
        Inclusive bounds on the source's `total` column.
    pct_min : number
        Inclusive lower bound on the country's `<country>_pct` column.

    Returns
    -------
    pandas.DataFrame
        Columns `source_singular`, `country` (the cleaned country name with a
        `_majority` suffix), `country_total` and `country_pct`. Each
        per-country result keeps its own 0-based index, so index values
        repeat across countries. The frame is empty, with these columns, when
        nothing matches.
    """
    columns = ['source_singular', 'country', 'country_total', 'country_pct']
    frames = []
    for t in geo_df['country']:
        t_cln = _clean(t)
        t_cln_pct_nm = t_cln + '_pct'
        # Label goes in as a single-quoted SQL string literal (embedded quotes
        # doubled); a double-quoted token would be parsed as an identifier first.
        label = f'{t_cln}_majority'.replace("'", "''")

        # Column references are quoted and qualified with the table alias so
        # that unusual names stay valid and a missing column raises an error.
        query = f'''
        SELECT
            p.source_singular,
            '{label}' AS "country",
            p."{t_cln}" AS "country_total",
            p."{t_cln_pct_nm}" AS country_pct
        FROM
            type_pct_df p
        WHERE
            p.total >= {total_min}
            AND p.total <= {total_max}
            AND p."{t_cln_pct_nm}" >= {pct_min}
        '''

        df1 = sql(query)
        # Skip empty results: they add no rows, and leaving them out of the
        # concat keeps the result dtypes determined by real data only.
        if not df1.empty:
            frames.append(df1)

    if not frames:
        return pd.DataFrame(columns=columns)

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(frames)

In [9]:
# Sources with 10-200 rows in total, at least 99% of them in one country.
generate_country_df(total_min=10, total_max=200, pct_min=0.99)

Unnamed: 0,source_singular,country,country_total,country_pct
0,Caraota Digital,Venezuela_majority,64,1.0
1,Diario 2001,Venezuela_majority,13,1.0
2,Diario El Tiempo,Venezuela_majority,27,1.0
3,Diario Primicia,Venezuela_majority,18,1.0
4,El Carabobeno,Venezuela_majority,72,1.0
...,...,...,...,...
0,FJ Portal,Egypt_majority,12,1.0
0,Africa Guinee,Guinea_majority,23,1.0
1,Guinea News,Guinea_majority,16,1.0
2,Guinee Matin,Guinea_majority,26,1.0
