Collect the current year-to-date (YTD) comparison between 2019 and 2020, grouped by org segment, only for orgs that were active throughout this timeframe.

I am defining relevant orgs as those that have accepted transactions both between 1/1/2019 and 11/9/2019 and between 1/1/2020 and 11/9/2020, and that currently have the status 'active'.

In [1]:
import pandas as pd
import sys, datetime
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# build dataset

## load data

In [2]:
# Isolate the current YTD window for 2020 so the same month-day window can be
# pulled from 2019.
def ytd_month_thresholds(as_of, current_year=2020):
    """Return the (start, end) 'M-D' suffixes of the year-to-date window.

    as_of: a date/datetime giving the moment the notebook is run.
    current_year: the later of the two years being compared.

    While `as_of` falls inside `current_year` (or earlier) the window ends on
    `as_of`'s month and day. Once `current_year` is over its year-to-date is
    the full year, so the window ends on '12-31'. Feb 29 is clamped to Feb 28
    because the suffix is also appended to a non-leap year.
    """
    if as_of.year > current_year:
        return ('1-1', '12-31')
    end_day = as_of.day
    if as_of.month == 2 and end_day == 29:
        end_day = 28
    return ('1-1', '{}-{}'.format(as_of.month, end_day))


# read the clock once so month and day always come from the same instant
now = datetime.datetime.now()
today_day = now.day
today_month = now.month
month_thresholds = ytd_month_thresholds(now)

In [3]:
# Per-org volume, transaction count and first/last date for 2019 transactions
# with status 'A' (the "accepted" transactions of the intro) inside the
# month_thresholds window. Rows with amount 99999999.99 are excluded
# (presumably a placeholder/sentinel value -- TODO confirm).
q_2019 = '''select
                sum(amount) as vol,
                count(id) as count,
                min(date) as min_date,
                max(date) as max_date,
                org
            from transactions
            where 
                status='A' and
                date>='{}' and
                date<='{}' and
                amount!=99999999.99
            group by org'''.format(
    *("2019-{}".format(bound) for bound in month_thresholds)
)
trans_2019 = redshift_query_read(q_2019)

In [4]:
# Per-org volume, transaction count and first/last date for 2020 transactions
# with status 'A' (the "accepted" transactions of the intro) inside the
# month_thresholds window. Rows with amount 99999999.99 are excluded
# (presumably a placeholder/sentinel value -- TODO confirm).
q_2020 = '''select
                sum(amount) as vol,
                count(id) as count,
                min(date) as min_date,
                max(date) as max_date,
                org
            from transactions
            where 
                status='A' and
                date>='{}' and
                date<='{}' and
                amount!=99999999.99
            group by org'''.format(
    *("2020-{}".format(bound) for bound in month_thresholds)
)
trans_2020 = redshift_query_read(q_2020)

In [5]:
# organization lookup table; the cells below read its id, status and segment columns
orgs = get_dataframe_from_file(
    "qgiv-stats-data",
    "organizations.names.csv",
)

## prep data to relevant orgs w/ segments

In [6]:
# keep only orgs that have transactions in both yearly sets and are currently active
orgs_2019 = trans_2019['org'].tolist()
orgs_2020 = trans_2020['org'].tolist()
orgs_active = orgs.loc[orgs['status'] == 'active', 'id'].tolist()
orgs_intersection = list(
    set(orgs_2019).intersection(set(orgs_2020), set(orgs_active))
)

In [7]:
"2019 orgs: {}; 2020 orgs: {}; active orgs: {}; intersection: {}".format(len(orgs_2019), len(orgs_2020), len(orgs_active), len(orgs_intersection))

'2019 orgs: 3081; 2020 orgs: 3458; active orgs: 4625; intersection: 2443'

In [8]:
# Keep only the orgs in the intersection. Boolean indexing can hand back a
# frame that pandas still treats as derived from the original, so adding a
# column to it afterwards raises SettingWithCopyWarning; taking an explicit
# copy makes each filtered frame independent.
trans_2019 = trans_2019[trans_2019['org'].isin(orgs_intersection)].copy()
trans_2020 = trans_2020[trans_2020['org'].isin(orgs_intersection)].copy()

In [9]:
def get_segment(x):
    """Look up the segment recorded for org id `x` in the `orgs` table.

    Returns the `segment` value of the first row of `orgs` whose `id`
    equals `x`, or None when no row matches.
    """
    matches = orgs.loc[orgs['id'] == x, 'segment']
    if matches.empty:
        return None
    return matches.iloc[0]
# tag every org row in both yearly frames with its segment
for yearly_trans in (trans_2019, trans_2020):
    yearly_trans['segment'] = yearly_trans['org'].apply(get_segment)

# segment aggregates

## aggregate & set percentages

In [10]:
# per-segment mean and sum of the per-org 2019 volume and transaction count
segment_totals_2019 = (
    trans_2019
    .groupby('segment')[['vol', 'count']]
    .agg(['mean', 'sum'])
    .reset_index()
)
# flatten the (metric, statistic) column pairs into year-prefixed names
segment_totals_2019.columns = ['segment'] + [
    '2019_{}_{}'.format(metric, stat)
    for metric in ('vol', 'count')
    for stat in ('mean', 'sum')
]
# each segment's share of the summed 2019 volume and transaction count
segment_totals_2019['2019_vol_perc'] = (
    segment_totals_2019['2019_vol_sum']
    / segment_totals_2019['2019_vol_sum'].sum()
)
segment_totals_2019['2019_count_perc'] = (
    segment_totals_2019['2019_count_sum']
    / segment_totals_2019['2019_count_sum'].sum()
)
segment_totals_2019.head(3)

Unnamed: 0,segment,2019_vol_mean,2019_vol_sum,2019_count_mean,2019_count_sum,2019_vol_perc,2019_count_perc
0,"A - Arts, Culture, and Humanities",28774.989178,4201148.42,226.047945,33003,0.02378226,0.02156949
1,B - Educational Institutions,64766.096095,17745910.33,451.565693,123729,0.1004577,0.0808645
2,"B - Educational Institutions , O - Youth Devel...",50.0,50.0,1.0,1,2.830448e-07,6.535615e-07


In [11]:
# per-segment mean and sum of the per-org 2020 volume and transaction count
segment_totals_2020 = (
    trans_2020
    .groupby('segment')[['vol', 'count']]
    .agg(['mean', 'sum'])
    .reset_index()
)
# flatten the (metric, statistic) column pairs into year-prefixed names
segment_totals_2020.columns = ['segment'] + [
    '2020_{}_{}'.format(metric, stat)
    for metric in ('vol', 'count')
    for stat in ('mean', 'sum')
]
# each segment's share of the summed 2020 volume and transaction count
segment_totals_2020['2020_vol_perc'] = (
    segment_totals_2020['2020_vol_sum']
    / segment_totals_2020['2020_vol_sum'].sum()
)
segment_totals_2020['2020_count_perc'] = (
    segment_totals_2020['2020_count_sum']
    / segment_totals_2020['2020_count_sum'].sum()
)
segment_totals_2020.head(3)

Unnamed: 0,segment,2020_vol_mean,2020_vol_sum,2020_count_mean,2020_count_sum,2020_vol_perc,2020_count_perc
0,"A - Arts, Culture, and Humanities",31436.757055,4589766.53,256.520548,37452,0.01878196,0.018986
1,B - Educational Institutions,77361.811861,21197136.45,412.248175,112956,0.08674163,0.057261
2,"B - Educational Institutions , O - Youth Devel...",180.0,180.0,5.0,5,7.36585e-07,3e-06


## merge years to single dataset and format

In [12]:
# one row per segment with the 2019 columns followed by the 2020 columns;
# inner join (the merge default), so a segment must appear in both years
segment_totals = segment_totals_2019.merge(
    right=segment_totals_2020,
    how='inner',
    on='segment',
)

In [13]:
# Display formatting, chosen by the first marker found in the column name.
# Order matters only in that it mirrors the original if/elif precedence.
# NOTE: this replaces the numeric columns with strings.
formatters = (
    ('count_sum', "{:,.0f}".format),
    ('count_mean', "{:,.2f}".format),
    ('vol_mean', "${:,.2f}".format),
    ('vol_sum', "${:,.2f}".format),
    ('perc', lambda share: "{:.2f}%".format(share * 100.)),
)
for col in segment_totals.columns:
    for marker, render in formatters:
        if marker in col:
            segment_totals[col] = segment_totals[col].apply(render)
            break
segment_totals.head(3)

Unnamed: 0,segment,2019_vol_mean,2019_vol_sum,2019_count_mean,2019_count_sum,2019_vol_perc,2019_count_perc,2020_vol_mean,2020_vol_sum,2020_count_mean,2020_count_sum,2020_vol_perc,2020_count_perc
0,"A - Arts, Culture, and Humanities","$28,774.99","$4,201,148.42",226.05,33003,2.38%,2.16%,"$31,436.76","$4,589,766.53",256.52,37452,1.88%,1.90%
1,B - Educational Institutions,"$64,766.10","$17,745,910.33",451.57,123729,10.05%,8.09%,"$77,361.81","$21,197,136.45",412.25,112956,8.67%,5.73%
2,"B - Educational Institutions , O - Youth Devel...",$50.00,$50.00,1.0,1,0.00%,0.00%,$180.00,$180.00,5.0,5,0.00%,0.00%


# store to file

In [14]:
#segment_totals.to_csv("segment_yoy.csv", index=False)