In [None]:
import pandas as pd
from pandasql import sqldf
from sklearn.cluster import KMeans

def sql(q, env=None):
    """Run the SQL query ``q`` through pandasql and return the resulting frame.

    Parameters
    ----------
    q : str
        SQL text; table names refer to DataFrames found in ``env``.
    env : mapping, optional
        Name -> object mapping used to resolve table names.  When omitted the
        notebook's module-level variables are used, which is what a plain
        ``sql(q)`` call has always done.  Pass ``locals()`` (or an explicit
        dict) from inside a function to query frames that only exist there.
    """
    return sqldf(q, globals() if env is None else env)

In [None]:
def _get_tot_cnt_df(df):
    """Count the rows of ``df`` for each ``source_singular``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``source_singular`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``source_singular`` with two columns:
        ``source_singular`` and ``total`` (the number of rows for that source).
    """
    # Aggregate the argument itself: `sql` resolves table names from module
    # globals, so a query against "df" would not see this parameter.
    # dropna=False mirrors SQL GROUP BY, which keeps a group for NULL keys.
    return (
        df.groupby('source_singular', dropna=False)
        .size()
        .reset_index(name='total')
    )
    
def _clean(c):
    """Return ``c`` rewritten so it can serve as a column name.

    Spaces and hyphens are turned into underscores; periods and commas are
    dropped.  The substitutions are applied one after another in that order.
    """
    substitutions = (
        (' ', '_'),
        ('-', '_'),
        ('.', ''),
        (',', ''),
    )
    for old, new in substitutions:
        c = c.replace(old, new)
    return c


def _generate_country_df(total_min, total_max, pct_min, type_pct_df=None, geo_df=None):
    """Select, country by country, the sources that pass the thresholds.

    For every value in ``geo_df['country']`` the cleaned name (see ``_clean``)
    identifies two columns of ``type_pct_df``: ``<name>`` (a count) and
    ``<name>_pct`` (a share).  A source is kept for that country when

    * ``total_min <= total <= total_max`` and
    * ``<name>_pct >= pct_min``.

    Parameters
    ----------
    total_min, total_max : number
        Inclusive bounds on the ``total`` column.
    pct_min : float
        Lower bound on the country's ``_pct`` column.
    type_pct_df : pandas.DataFrame, optional
        Wide frame holding ``source_singular``, ``total`` and the per-country
        count / ``_pct`` columns.
    geo_df : pandas.DataFrame, optional
        Frame whose ``country`` column lists the countries to scan.

        A frame that is not passed is looked up among the notebook's
        module-level variables, which is how a three-argument call has always
        resolved them.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_singular``, ``country`` (the text
        ``"<name>_majority"``), ``country_total`` and ``country_pct``; one row
        per qualifying (source, country) pair, indexed 0..n-1.

    Raises
    ------
    NameError
        If a frame is neither passed in nor defined at module level.
    """
    columns = ['source_singular', 'country', 'country_total', 'country_pct']

    # Backward compatibility with callers that rely on notebook-level frames.
    module_vars = globals()
    if type_pct_df is None:
        type_pct_df = module_vars.get('type_pct_df')
    if geo_df is None:
        geo_df = module_vars.get('geo_df')
    if type_pct_df is None or geo_df is None:
        raise NameError(
            "type_pct_df and geo_df must be passed in or defined at module level"
        )

    # The bounds on `total` do not depend on the country, so test them once.
    in_range = type_pct_df['total'].between(total_min, total_max)

    pieces = []
    for t in geo_df['country']:
        t_cln = _clean(t)
        t_cln_pct_nm = t_cln + '_pct'

        hits = type_pct_df[in_range & (type_pct_df[t_cln_pct_nm] >= pct_min)]
        if hits.empty:
            continue
        pieces.append(pd.DataFrame({
            'source_singular': hits['source_singular'].to_numpy(),
            'country': [f'{t_cln}_majority'] * len(hits),
            'country_total': hits[t_cln].to_numpy(),
            'country_pct': hits[t_cln_pct_nm].to_numpy(),
        }))

    if not pieces:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of re-copying the result on every pass.
    return pd.concat(pieces, ignore_index=True)[columns]


def _generate_sub_event_type_df(total_min, total_max, pct_min, type_pct_df=None, src_df=None):
    """Select, type by type, the sources that pass the thresholds.

    For every value in ``src_df['sub_event_type']`` the cleaned name (spaces
    and ``/`` replaced by ``_``) identifies two columns of ``type_pct_df``:
    ``<name>`` (a count) and ``<name>_pct`` (a share).  A source is kept for
    that type when

    * ``total_min <= total <= total_max`` and
    * ``<name>_pct >= pct_min``.

    Parameters
    ----------
    total_min, total_max : number
        Inclusive bounds on the ``total`` column.
    pct_min : float
        Lower bound on the type's ``_pct`` column.
    type_pct_df : pandas.DataFrame, optional
        Wide frame holding ``source_singular``, ``total`` and the per-type
        count / ``_pct`` columns.
    src_df : pandas.DataFrame, optional
        Frame whose ``sub_event_type`` column lists the types to scan.

        A frame that is not passed is looked up among the notebook's
        module-level variables, which is how a three-argument call has always
        resolved them.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_singular``, ``sub_event_type`` (the text
        ``"<name>_majority"``), ``sub_event_type_total`` and
        ``sub_event_type_pct``; one row per qualifying (source, type) pair,
        indexed 0..n-1.

    Raises
    ------
    NameError
        If a frame is neither passed in nor defined at module level.
    """
    columns = ['source_singular', 'sub_event_type', 'sub_event_type_total', 'sub_event_type_pct']

    # Backward compatibility with callers that rely on notebook-level frames.
    module_vars = globals()
    if type_pct_df is None:
        type_pct_df = module_vars.get('type_pct_df')
    if src_df is None:
        src_df = module_vars.get('src_df')
    if type_pct_df is None or src_df is None:
        raise NameError(
            "type_pct_df and src_df must be passed in or defined at module level"
        )

    # The bounds on `total` do not depend on the type, so test them once.
    in_range = type_pct_df['total'].between(total_min, total_max)

    pieces = []
    for t in src_df['sub_event_type']:
        t_cln = t.replace(" ", "_").replace("/", "_")
        t_cln_pct_nm = t_cln + '_pct'

        hits = type_pct_df[in_range & (type_pct_df[t_cln_pct_nm] >= pct_min)]
        if hits.empty:
            continue
        pieces.append(pd.DataFrame({
            'source_singular': hits['source_singular'].to_numpy(),
            'sub_event_type': [f'{t_cln}_majority'] * len(hits),
            'sub_event_type_total': hits[t_cln].to_numpy(),
            'sub_event_type_pct': hits[t_cln_pct_nm].to_numpy(),
        }))

    if not pieces:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of re-copying the result on every pass.
    return pd.concat(pieces, ignore_index=True)[columns]

Tagging by sub_event_type

In [None]:
# takes extended DF as parameter
def tag_sub_event_type(df, total_min, total_max, pct_min):
    """Tag sources whose events are concentrated in one ``sub_event_type``.

    For every (source, sub_event_type) pair this computes how many of the
    source's rows have that type and what share of the source's rows that is.
    A pair is reported when the source's row count lies in
    ``[total_min, total_max]`` and the share is at least ``pct_min``.

    Everything is computed from ``df`` itself; no notebook-level frames are
    read.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``source_singular`` and ``sub_event_type`` columns.
    total_min, total_max : number
        Inclusive bounds on the number of rows a source has in ``df``.
    pct_min : float
        Minimum share (0..1) of a source's rows that must be of one type.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_singular``, ``sub_event_type`` (the type name with
        spaces and ``/`` replaced by ``_`` and ``_majority`` appended),
        ``sub_event_type_total`` (rows of that type for the source) and
        ``sub_event_type_pct`` (that count divided by the source's rows).
        Rows are grouped by type in order of first appearance in ``df``.
    """
    columns = ['source_singular', 'sub_event_type', 'sub_event_type_total', 'sub_event_type_pct']
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Rows per source: the denominator of every share.  A row counts toward
    # the total even when its sub_event_type is missing.
    totals = df.groupby('source_singular').size()
    type_order = list(df['sub_event_type'].dropna().unique())
    if totals.empty or not type_order:
        return pd.DataFrame(columns=columns)

    # source x type count matrix; pairs that never occur are 0 so that they
    # still take part in the threshold test.
    counts = (
        df.groupby(['source_singular', 'sub_event_type'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=totals.index, columns=type_order, fill_value=0)
    )

    # Back to one row per (source, type) pair.
    pairs = counts.reset_index().melt(
        id_vars='source_singular',
        var_name='sub_event_type',
        value_name='sub_event_type_total',
    )
    source_total = pairs['source_singular'].map(totals)
    pairs['sub_event_type_pct'] = pairs['sub_event_type_total'] / source_total

    keep = source_total.between(total_min, total_max) & (pairs['sub_event_type_pct'] >= pct_min)
    tagged = pairs.loc[keep, columns].reset_index(drop=True)

    labels = {
        t: t.replace(" ", "_").replace("/", "_") + '_majority'
        for t in type_order
    }
    tagged['sub_event_type'] = tagged['sub_event_type'].map(labels)
    return tagged

Tagging by country

In [None]:
# takes extended DF as parameter
def tag_country(df, total_min, total_max, pct_min):
    """Tag sources whose events are concentrated in one country.

    For every (source, country) pair this computes how many of the source's
    rows are in that country and what share of the source's rows that is.
    A pair is reported when the source's row count lies in
    ``[total_min, total_max]`` and the share is at least ``pct_min``.

    Everything is computed from ``df`` itself; no notebook-level frames are
    read.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``source_singular`` and ``country`` columns.
    total_min, total_max : number
        Inclusive bounds on the number of rows a source has in ``df``.
    pct_min : float
        Minimum share (0..1) of a source's rows that must be in one country.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_singular``, ``country`` (the country name passed
        through ``_clean`` with ``_majority`` appended), ``country_total``
        (rows in that country for the source) and ``country_pct`` (that count
        divided by the source's rows).  Rows are grouped by country in order
        of first appearance in ``df``.
    """
    columns = ['source_singular', 'country', 'country_total', 'country_pct']
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Rows per source: the denominator of every share.  A row counts toward
    # the total even when its country is missing.
    totals = df.groupby('source_singular').size()
    country_order = list(df['country'].dropna().unique())
    if totals.empty or not country_order:
        return pd.DataFrame(columns=columns)

    # source x country count matrix; pairs that never occur are 0 so that
    # they still take part in the threshold test.
    counts = (
        df.groupby(['source_singular', 'country'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=totals.index, columns=country_order, fill_value=0)
    )

    # Back to one row per (source, country) pair.
    pairs = counts.reset_index().melt(
        id_vars='source_singular',
        var_name='country',
        value_name='country_total',
    )
    source_total = pairs['source_singular'].map(totals)
    pairs['country_pct'] = pairs['country_total'] / source_total

    keep = source_total.between(total_min, total_max) & (pairs['country_pct'] >= pct_min)
    cntry_df = pairs.loc[keep, columns].reset_index(drop=True)

    labels = {name: f'{_clean(name)}_majority' for name in country_order}
    cntry_df['country'] = cntry_df['country'].map(labels)
    return cntry_df

Tagging by time period

In [None]:
# takes extended DF as parameter
def tag_time_period(df, n_clusters=4, init='random', n_init=10, max_iter=100, tol=1e-04, random_state=0):
    """Label every row with a k-means cluster computed on its event date.

    Each ``event_date`` is converted to a POSIX timestamp (seconds) and the
    resulting one-dimensional values are clustered with
    ``sklearn.cluster.KMeans``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``event_date`` column whose values
        ``pandas.Timestamp`` can parse.
    n_clusters, init, n_init, max_iter, tol, random_state
        Passed unchanged to ``KMeans``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added ``time_period`` column holding the
        cluster label of each row.  ``df`` itself is not modified.
    """
    km = KMeans(
        n_clusters=n_clusters, init=init,
        n_init=n_init, max_iter=max_iter,
        tol=tol, random_state=random_state
    )

    # Copy so the new column does not leak into the caller's frame.
    cluster_df = df.copy()

    # Kept in a local: assigning to `frame.new_name` sets a plain attribute,
    # not a column, so the timestamps were never part of the returned frame.
    event_date_unix = cluster_df['event_date'].apply(lambda x: pd.Timestamp(x).timestamp())

    # KMeans expects a 2-D (n_samples, n_features) array; there is one feature.
    X = event_date_unix.to_numpy().reshape(-1, 1)
    cluster_df['time_period'] = km.fit_predict(X)

    return cluster_df