In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandasql import sqldf

def sql(q):
    """Run the SQL string `q` with pandasql's `sqldf` against this module's globals.

    Passing `globals()` is what lets a query name a DataFrame variable directly
    in its FROM / JOIN clauses.
    """
    return sqldf(q, globals())


# Source dataset; the SQL cells refer to it by its variable name, `extd_df`.
extd_df = pd.read_csv('../data/EXPANDED_acled_covid19.csv')

In [None]:
# Distinct country names present in the dataset. Later cells iterate over this
# column and call `_clean` (i.e. `str.replace`) on every value, so rows with a
# missing country are filtered out here instead of failing there.
geo_df = sql('''
SELECT DISTINCT country
FROM extd_df
WHERE country IS NOT NULL
''')
geo_df

In [None]:
def _clean(c):
    c = c.replace(' ', '_')
    c = c.replace('-', '_')
    c = c.replace('.', '')
    c = c.replace(',', '')
    
    return c

In [None]:
# Build one conditional-count expression per country so the query below yields
# one row per source and one count column per country.
count_exprs = []
for t in geo_df['country']:
    # Use a standard single-quoted SQL string literal (embedded quotes doubled).
    # A double-quoted token is an identifier first in SQLite and is only treated
    # as a string when no column of that name exists.
    t_lit = t.replace("'", "''")
    t_cln = _clean(t)
    count_exprs.append(
        f"sum(CASE WHEN country = '{t_lit}' THEN 1 ELSE 0 END ) AS \"{t_cln}\""
    )
query_fmt = ',\n'.join(count_exprs)

type_cnt_df = sql(f'''
SELECT source_singular,
{query_fmt}
FROM extd_df
GROUP BY source_singular
''')
type_cnt_df

In [None]:
# Number of rows per source across all countries (column `total`).
tot_cnt_query = '''
SELECT source_singular,
count(*) total
FROM extd_df
GROUP BY source_singular
'''
tot_cnt_df = sql(tot_cnt_query)
tot_cnt_df

In [None]:
# For every per-country count column, derive "<country>_pct": that count divided
# by the source's total row count.
pct_exprs = []
for t in geo_df['country']:
    t_cln = _clean(t)
    t_pct_nm = t_cln + '_pct'
    # The count columns were created as quoted aliases, so reference them as
    # quoted (and table-qualified) identifiers too; a bare reference breaks for
    # any cleaned name that is not a valid unquoted SQL identifier.
    pct_exprs.append(
        f'cast(a."{t_cln}" AS DOUBLE) / cast(b.total AS DOUBLE) "{t_pct_nm}"'
    )
query_fmt = ',\n'.join(pct_exprs)

type_pct_df = sql(f'''
SELECT
    a.*,
    b.total,
{query_fmt}
FROM type_cnt_df a
JOIN tot_cnt_df b
    ON a.source_singular = b.source_singular
''')

# Persist the combined counts + shares table.
type_pct_df.to_csv('../data/source_country.csv')

# Spot check: sources that have rows in both Germany and Austria.
type_pct_df[(type_pct_df.Germany_pct > 0) & (type_pct_df.Austria_pct > 0)][
    ['source_singular', 'total', 'Germany', 'Austria', 'Germany_pct', 'Austria_pct']
]

In [None]:
def generate_country_df(total_min, total_max, pct_min):
    """List, per country, the sources whose rows fall mostly in that country.

    For each country in ``geo_df`` this queries ``type_pct_df`` for sources
    whose ``total`` lies in ``[total_min, total_max]`` and whose
    ``<country>_pct`` value is at least ``pct_min``.

    Args:
        total_min: inclusive lower bound on a source's ``total``.
        total_max: inclusive upper bound on a source's ``total``.
        pct_min: inclusive lower bound on the country's ``<country>_pct``.

    Returns:
        DataFrame with columns ``source_singular``, ``country`` (the cleaned
        country name suffixed with ``_majority``), ``country_total`` and
        ``country_pct``. It is empty, with those same columns, when no source
        qualifies. Row labels are kept as returned by each per-country query,
        so they repeat across countries.
    """
    columns = ['source_singular', 'country', 'country_total', 'country_pct']
    frames = []
    for t in geo_df['country']:
        t_cln = _clean(t)
        t_cln_pct_nm = t_cln + '_pct'
        # Single-quoted SQL string literal for the label; embedded quotes doubled.
        label = f'{t_cln}_majority'.replace("'", "''")

        # Generated column names are referenced as quoted identifiers, matching
        # how they were created upstream.
        query = f'''
        SELECT
            source_singular,
            '{label}' AS "country",
            "{t_cln}" AS "country_total",
            "{t_cln_pct_nm}" AS country_pct
        FROM
            type_pct_df
        WHERE 
            total >= {total_min}
            AND total <= {total_max}
            AND "{t_cln_pct_nm}" >= {pct_min}
        '''

        matches = sql(query)
        # Skip empty results: concatenating empty frames is deprecated in pandas
        # for dtype determination and contributes no rows anyway.
        if not matches.empty:
            frames.append(matches)

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)

In [None]:
# Sources with 10 to 100000 rows where a single country holds at least 55% of them.
generate_country_df(total_min=10, total_max=100000, pct_min=0.55)