In [26]:
import pandas as pd
import sys, datetime
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# build dataset

## load data

In [27]:
# Date window shared by the 2019 and 2020 queries, as "<month>-<day>" suffixes
# that get prefixed with the year.
# NOTE: the bounds below cover the full calendar year (Jan 1 - Dec 31), not
# year-to-date; today_day / today_month are captured but not used by them.
_now = datetime.datetime.now()  # single snapshot so day and month cannot straddle midnight
today_day = _now.day
today_month = _now.month
month_thresholds = ('1-1', '12-31')

In [28]:
# Per-org 2019 totals: summed amount (vol), row count and first/last date,
# limited to status 'A' rows inside the month_thresholds window.
# NOTE(review): amount 99999999.99 is excluded — presumably a sentinel/test value; confirm.
first_day_2019 = "2019-{}".format(month_thresholds[0])
last_day_2019 = "2019-{}".format(month_thresholds[1])
q_2019 = '''select
                sum(amount) as vol,
                count(id) as count,
                min(date) as min_date,
                max(date) as max_date,
                org
            from transactions
            where 
                status='A' and
                date>='{}' and
                date<='{}' and
                amount!=99999999.99
            group by org'''.format(first_day_2019, last_day_2019)
trans_2019 = redshift_query_read(q_2019)

In [29]:
# Per-org 2020 totals: summed amount (vol), row count and first/last date,
# limited to status 'A' rows inside the month_thresholds window.
# NOTE(review): amount 99999999.99 is excluded — presumably a sentinel/test value; confirm.
first_day_2020 = "2020-{}".format(month_thresholds[0])
last_day_2020 = "2020-{}".format(month_thresholds[1])
q_2020 = '''select
                sum(amount) as vol,
                count(id) as count,
                min(date) as min_date,
                max(date) as max_date,
                org
            from transactions
            where 
                status='A' and
                date>='{}' and
                date<='{}' and
                amount!=99999999.99
            group by org'''.format(first_day_2020, last_day_2020)
trans_2020 = redshift_query_read(q_2020)

In [30]:
# Organization lookup table; later cells read its 'id', 'state', 'status' and
# 'segment' columns.
# NOTE(review): get_dataframe_from_file is not defined in this notebook —
# presumably it comes from the s3_support star import; verify its arguments
# (looks like a bucket name followed by a file name).
orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")

## narrow data to Florida orgs

In [31]:
# the state column holds both the abbreviation and the full name, so accept either
is_florida = orgs['state'].isin(['FL', 'Florida'])
fl_orgs = orgs[is_florida]
# show how many orgs remain after the filter
len(orgs), len(fl_orgs)

(9700, 978)

In [32]:
# from here on `orgs` refers to the Florida subset only; both yearly
# transaction frames are cut down to those organizations
orgs = fl_orgs
florida_org_ids = orgs['id'].tolist()
trans_2019 = trans_2019[trans_2019['org'].isin(florida_org_ids)]
trans_2020 = trans_2020[trans_2020['org'].isin(florida_org_ids)]

## prep data: limit to comparable Florida orgs and attach segments

In [33]:
# An org is only comparable year over year if it has transactions in both
# yearly frames and its status is 'active'.
orgs_2019, orgs_2020 = (frame['org'].tolist() for frame in (trans_2019, trans_2020))
is_active = orgs['status'] == 'active'
orgs_active = orgs.loc[is_active, 'id'].tolist()
shared_active_ids = set(orgs_2019) & set(orgs_2020) & set(orgs_active)
orgs_intersection = list(shared_active_ids)

In [34]:
# one-line summary of how many orgs each list holds
"2019 orgs: {}; 2020 orgs: {}; active orgs: {}; intersection: {}".format(
    *map(len, (orgs_2019, orgs_2020, orgs_active, orgs_intersection))
)

'2019 orgs: 397; 2020 orgs: 432; active orgs: 550; intersection: 326'

In [35]:
# keep only rows for orgs in the shared, active set, in both yearly frames
keep_2019 = trans_2019['org'].isin(orgs_intersection)
keep_2020 = trans_2020['org'].isin(orgs_intersection)
trans_2019 = trans_2019.loc[keep_2019]
trans_2020 = trans_2020.loc[keep_2020]

In [36]:
def get_segment(x, org_table=None):
    """Return the 'segment' value of the first row whose 'id' equals ``x``.

    Args:
        x: organization id to look up.
        org_table: DataFrame with 'id' and 'segment' columns. When omitted,
            the notebook-level ``orgs`` frame is used, so existing
            one-argument calls (e.g. via ``Series.apply``) behave as before.

    Returns:
        The segment of the first matching row, or None when no row matches.
    """
    if org_table is None:
        org_table = orgs
    # filter once and reuse the result instead of scanning the table twice
    matches = org_table.loc[org_table['id'] == x, 'segment']
    if len(matches) > 0:
        return matches.iloc[0]
    return None
# Attach each org's segment. The frames here are the result of boolean
# filtering, so assigning a column onto them in place can raise
# SettingWithCopyWarning; .assign returns a new frame instead.
trans_2019 = trans_2019.assign(segment=trans_2019['org'].apply(get_segment))
trans_2020 = trans_2020.assign(segment=trans_2020['org'].apply(get_segment))

# segment aggregates

## aggregate & set percentages

In [37]:
# per-segment mean and total of volume and transaction count for 2019
segment_totals_2019 = (
    trans_2019
    .groupby('segment')[['vol', 'count']]
    .agg(['mean', 'sum'])
    .reset_index()
)
# flatten the two-level (metric, statistic) header into year-prefixed names
segment_totals_2019.columns = ['segment', '2019_vol_mean', '2019_vol_sum', '2019_count_mean', '2019_count_sum']
# each segment's share of the year's totals, stored as a fraction of 1
vol_sum_2019 = segment_totals_2019['2019_vol_sum']
count_sum_2019 = segment_totals_2019['2019_count_sum']
segment_totals_2019['2019_vol_perc'] = vol_sum_2019 / vol_sum_2019.sum()
segment_totals_2019['2019_count_perc'] = count_sum_2019 / count_sum_2019.sum()
segment_totals_2019.head(3)

Unnamed: 0,segment,2019_vol_mean,2019_vol_sum,2019_count_mean,2019_count_sum,2019_vol_perc,2019_count_perc
0,"A - Arts, Culture, and Humanities",25671.542143,359401.59,245.714286,3440,0.016075,0.02027
1,B - Educational Institutions,97685.91,3125949.12,553.96875,17727,0.139817,0.104457
2,C - Environmental Advocacy and Protection,41184.82,288293.74,336.285714,2354,0.012895,0.013871


In [38]:
# per-segment mean and total of volume and transaction count for 2020
segment_totals_2020 = (
    trans_2020
    .groupby('segment')[['vol', 'count']]
    .agg(['mean', 'sum'])
    .reset_index()
)
# flatten the two-level (metric, statistic) header into year-prefixed names
segment_totals_2020.columns = ['segment', '2020_vol_mean', '2020_vol_sum', '2020_count_mean', '2020_count_sum']
# each segment's share of the year's totals, stored as a fraction of 1
vol_sum_2020 = segment_totals_2020['2020_vol_sum']
count_sum_2020 = segment_totals_2020['2020_count_sum']
segment_totals_2020['2020_vol_perc'] = vol_sum_2020 / vol_sum_2020.sum()
segment_totals_2020['2020_count_perc'] = count_sum_2020 / count_sum_2020.sum()
segment_totals_2020.head(3)

Unnamed: 0,segment,2020_vol_mean,2020_vol_sum,2020_count_mean,2020_count_sum,2020_vol_perc,2020_count_perc
0,"A - Arts, Culture, and Humanities",26910.292143,376744.09,351.428571,4920,0.01352,0.02488
1,B - Educational Institutions,108240.330313,3463690.57,506.03125,16193,0.124303,0.081887
2,C - Environmental Advocacy and Protection,39203.438571,274424.07,397.571429,2783,0.009848,0.014073


## merge years to single dataset and format

In [39]:
# place the two yearly summaries side by side, one row per segment; the inner
# join (pandas' default) keeps only segments that appear in both years
segment_totals = pd.merge(segment_totals_2019, segment_totals_2020, how='inner', on='segment')

In [40]:
def _fmt_once(template, scale=None):
    """Build a cell formatter that renders numbers with ``template``.

    Values that are already strings are returned unchanged, so applying the
    formatter to an already-formatted column is a no-op. ``scale``, when
    given, multiplies the value before formatting (used for percentages).
    """
    def _apply(value):
        if isinstance(value, str):
            return value
        if scale is not None:
            value = value * scale
        return template.format(value)
    return _apply

# Replace the numeric columns with display strings, choosing the format from
# the column name. Without the pass-through in _fmt_once, re-running this cell
# would raise ValueError because the columns already hold strings.
for c in segment_totals.columns:
    if 'count_sum' in c:
        formatter = _fmt_once("{:,.0f}")
    elif 'count_mean' in c:
        formatter = _fmt_once("{:,.2f}")
    elif 'vol_mean' in c or 'vol_sum' in c:
        formatter = _fmt_once("${:,.2f}")
    elif 'perc' in c:
        # shares are stored as fractions of 1; show them as percentages
        formatter = _fmt_once("{:.2f}%", scale=100.)
    else:
        # columns matching no pattern (e.g. 'segment') are left untouched
        continue
    segment_totals[c] = segment_totals[c].apply(formatter)
segment_totals.head(3)

Unnamed: 0,segment,2019_vol_mean,2019_vol_sum,2019_count_mean,2019_count_sum,2019_vol_perc,2019_count_perc,2020_vol_mean,2020_vol_sum,2020_count_mean,2020_count_sum,2020_vol_perc,2020_count_perc
0,"A - Arts, Culture, and Humanities","$25,671.54","$359,401.59",245.71,3440,1.61%,2.03%,"$26,910.29","$376,744.09",351.43,4920,1.35%,2.49%
1,B - Educational Institutions,"$97,685.91","$3,125,949.12",553.97,17727,13.98%,10.45%,"$108,240.33","$3,463,690.57",506.03,16193,12.43%,8.19%
2,C - Environmental Advocacy and Protection,"$41,184.82","$288,293.74",336.29,2354,1.29%,1.39%,"$39,203.44","$274,424.07",397.57,2783,0.98%,1.41%


# store to file

In [42]:
# export is currently disabled; uncomment the line below to write the formatted table to CSV
#segment_totals.to_csv("segment_yoy_fl.csv", index=False)